/
lua-utf8.lua
226 lines (191 loc) · 5.53 KB
/
lua-utf8.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
-- Local aliases for the standard-library functions used by this module.
local sbyte = string.byte
local ssub = string.sub
local concat = table.concat
local abs = math.abs
-- Module table: the public functions are attached to it and it is returned
-- at the end of the file.
local _M = {}
-- Permitted [lo, hi] bounds for a utf8 continuation byte, selected by row.
-- Row 1 is the general continuation range. Rows 2-5 are applied only to the
-- first continuation byte following the lead bytes 0xE0, 0xED, 0xF0 and 0xF4
-- respectively; every later continuation byte is checked against row 1.
local accept_range = {}
do
    local bounds = {
        {0x80, 0xBF}, -- 1: default
        {0xA0, 0xBF}, -- 2: after lead byte 0xE0
        {0x80, 0x9F}, -- 3: after lead byte 0xED
        {0x90, 0xBF}, -- 4: after lead byte 0xF0
        {0x80, 0x8F}, -- 5: after lead byte 0xF4
    }
    for row = 1, #bounds do
        accept_range[row] = { lo = bounds[row][1], hi = bounds[row][2] }
    end
end
-- Tell whether the given string is made only of well-formed utf8 sequences.
--
-- Args:
-- str: string
-- Returns:
-- (bool) true when the whole string is well-formed, false otherwise.
-- (number) only on failure: byte index of the lead byte of the first
-- sequence that is malformed or cut short by the end of the string.
function _M.validate(str)
    local total = #str
    local pos = 1
    while pos <= total do
        local lead = sbyte(str, pos)
        local tail = 0 -- continuation bytes expected after the lead byte
        if lead >= 0x80 then
            -- Lead bytes outside 0xC2..0xF4 never start a sequence.
            if lead < 0xC2 or lead > 0xF4 then
                return false, pos
            end
            local slot = 1 -- accept_range row for the first continuation byte
            if lead <= 0xDF then -- 2 bytes
                tail = 1
            elseif lead <= 0xEF then -- 3 bytes
                tail = 2
                if lead == 0xE0 then
                    slot = 2
                elseif lead == 0xED then
                    slot = 3
                end
            else -- 4 bytes
                tail = 3
                if lead == 0xF0 then
                    slot = 4
                elseif lead == 0xF4 then
                    slot = 5
                end
            end
            -- The sequence must fit inside the string.
            if pos + tail > total then
                return false, pos
            end
            for offset = 1, tail do
                local cont = sbyte(str, pos + offset)
                local bounds = accept_range[slot]
                if not (cont >= bounds.lo and cont <= bounds.hi) then
                    return false, pos
                end
                -- Only the first continuation byte uses a special row.
                slot = 1
            end
        end
        pos = pos + tail + 1
    end
    return true
end
-- Count the utf8 characters in the given string.
--
-- Args:
-- str: string
-- Returns:
-- (number) number of well-formed characters before the first malformed
-- sequence (the full character count when the string is valid).
-- (number) only when a malformed or truncated sequence is found: byte index
-- of that sequence's lead byte.
function _M.len(str)
    local size = #str
    local cursor, count = 1, 0
    while cursor <= size do
        local lead = sbyte(str, cursor)
        local width = 1 -- total bytes occupied by the current character
        if lead >= 0x80 then
            local row -- accept_range row for the next continuation byte
            if lead >= 0xC2 and lead <= 0xDF then -- 2 bytes
                width, row = 2, 1
            elseif lead >= 0xE0 and lead <= 0xEF then -- 3 bytes
                width = 3
                row = (lead == 0xE0 and 2) or (lead == 0xED and 3) or 1
            elseif lead >= 0xF0 and lead <= 0xF4 then -- 4 bytes
                width = 4
                row = (lead == 0xF0 and 4) or (lead == 0xF4 and 5) or 1
            else
                return count, cursor
            end
            local last = cursor + width - 1
            -- Sequence runs past the end of the string.
            if last > size then
                return count, cursor
            end
            for at = cursor + 1, last do
                local cont = sbyte(str, at)
                local allowed = accept_range[row]
                if cont < allowed.lo or cont > allowed.hi then
                    return count, cursor
                end
                -- Later continuation bytes use the general range.
                row = 1
            end
        end
        cursor = cursor + width
        count = count + 1
    end
    return count
end
-- Reverse the character order of a valid utf8 string.
-- The bytes inside each multi-byte character keep their order. A byte that
-- is not a recognised lead byte is treated as a one-byte character.
-- Args:
-- str: string
-- Returns:
-- (string) the input with its utf8 characters in reverse order.
function _M.reverse(str)
    local total = #str
    if total <= 1 then
        return str
    end
    -- Split the string into one entry per character.
    local pieces, count = {}, 0
    local pos = 1
    while pos <= total do
        local lead = sbyte(str, pos)
        local extra = 0 -- continuation bytes following the lead byte
        if lead >= 0xC2 and lead <= 0xF4 then
            if lead >= 0xF0 then -- 4 bytes
                extra = 3
            elseif lead >= 0xE0 then -- 3 bytes
                extra = 2
            else -- 2 bytes
                extra = 1
            end
        end
        count = count + 1
        pieces[count] = ssub(str, pos, pos + extra)
        pos = pos + extra + 1
    end
    -- Swap entries from both ends towards the middle.
    local head, rear = 1, count
    while head < rear do
        pieces[head], pieces[rear] = pieces[rear], pieces[head]
        head = head + 1
        rear = rear - 1
    end
    return concat(pieces)
end
-- Extract part of a valid utf8 string, addressed by character index.
-- Index handling follows string.sub: negative indices count from the end of
-- the string, a start before the first character is clamped to 1, and an end
-- past the last character is clamped to the character count.
-- Only the well-formed prefix counted by _M.len is addressable.
-- Args:
-- str: string
-- s: number, start index of utf8 character
-- e: number, end index of utf8 character (defaults to the last character)
-- Returns:
-- (string) utf8 sub string, or "" when the requested range is empty.
function _M.sub(str, s, e)
    local len = _M.len(str)
    e = e or len
    local utf8_start = s >= 0 and s or len - abs(s) + 1
    local utf8_end = e >= 0 and e or len - abs(e) + 1
    -- Clamp like string.sub does, instead of returning "" for a start index
    -- of 0 or a negative start that reaches before the first character.
    if utf8_start < 1 then
        utf8_start = 1
    end
    if utf8_end > len then
        utf8_end = len
    end
    -- Covers start past the end, end before the start, and an empty string.
    if utf8_start > utf8_end then
        return ""
    end
    local i, c = 1, 0
    local first, left_size, byte_start
    -- Walk characters only up to the requested end; nothing after it matters.
    while c < utf8_end do
        first = sbyte(str, i)
        left_size = 0
        if first >= 0x80 then
            if first >= 0xC2 and first <= 0xDF then --2 bytes
                left_size = 1
            elseif first >= 0xE0 and first <= 0xEF then --3 bytes
                left_size = 2
            elseif first >= 0xF0 and first <= 0xF4 then --4 bytes
                left_size = 3
            end
        end
        c = c + 1
        if c == utf8_start then
            byte_start = i
        end
        i = i + left_size + 1
    end
    -- i now points just past the last byte of character utf8_end.
    return ssub(str, byte_start, i - 1)
end
-- Export the module table to the caller of require().
return _M