-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8case.jl
212 lines (205 loc) · 7.44 KB
/
utf8case.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#=
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
Licensed under MIT License, see LICENSE.md
=#
# These are the more complex case folding functions; they may belong in a separate
# UTF8Str.jl package.
# Note: the only length increase these functions check for is a 2-byte sequence
# expanding to a 3-byte sequence. In the standard Unicode tables, that is the only
# expansion that occurs when converting to upper or lower case.
# Build the lowercased copy of UTF-8 data after the caller has located the first
# character that needs to change.
#
#   beg - pointer to the first byte of the source data
#   off - number of leading bytes that are copied to the output unchanged
#   len - total number of source bytes
#
# Returns `Str(UTF8CSE, buf)` where `buf` holds the converted bytes.
# NOTE(review): the input is assumed to be valid UTF-8 (no bounds or validity checks
# are made on continuation bytes) - confirm against callers.
function _lower_utf8(beg, off, len)
    # Note, the final length may be larger or smaller
    buf, out = _allocate(UInt8, len)
    unsafe_copyto!(out, beg, off)
    fin = beg + len
    pnt = beg + off
    outend = out + len
    out += off
    # Invariant: the space left in the output (outend - out) is never smaller than
    # the number of input bytes still to be processed (fin - pnt).
    while pnt < fin
        ch = get_codeunit(pnt)
        if ch < 0x80
            # ASCII: adding 0x20 turns an uppercase letter into its lowercase form
            set_codeunit!(out, ch + (_isupper_a(ch) << 5))
            out += 1
        elseif ch < 0xc4
            # 2-byte sequence with a lead byte below 0xc4: combine lead and
            # continuation bytes (presumably get_codeunit yields UInt8, so the shift
            # wraps and leaves a value in 0x80-0xff - TODO confirm)
            ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f)
            out = output_utf8_2byte!(out, ch + (_isupper_l(ch) << 5))
        elseif ch < 0xe0
            # 2 byte
            c16 = get_utf8_2byte(pnt += 1, ch)
            if _isupper_u(c16)
                c16 = _lowercase_u(c16)
                # Check if still 2 byte, could increase to 3 byte, decrease to 1 byte
                if c16 < 0x80
                    set_codeunit!(out, c16%UInt8)
                    out += 1
                elseif c16 < 0x800
                    out = output_utf8_2byte!(out, c16)
                else
                    # Growing from 2 to 3 bytes: we need room for these 3 bytes plus
                    # every input byte that follows (pnt is on the last byte of the
                    # current character, so fin - pnt - 1 bytes remain).
                    # Pointer differences are unsigned, so compare sizes directly
                    # instead of testing a difference for being negative.
                    need = (fin - pnt) + 2
                    if outend - out < need
                        # Remember how much has been written, grow the buffer, and
                        # re-derive both pointers from the (possibly moved) buffer.
                        pos = out - pointer(buf)
                        buf = resize!(buf, pos + need)
                        out = pointer(buf) + pos
                        outend = pointer(buf) + sizeof(buf)
                    end
                    out = output_utf8_3byte!(out, c16)
                end
            else
                out = output_utf8_2byte!(out, c16)
            end
        elseif ch < 0xf0
            # 3 byte
            c16 = get_utf8_3byte(pnt += 2, ch)
            if _isupper_u(c16)
                c16 = _lowercase_u(c16)
                # Check if still 3 byte, could drop to 2 byte
                if c16 < 0x800
                    out = output_utf8_2byte!(out, c16)
                else
                    out = output_utf8_3byte!(out, c16)
                end
            else
                out = output_utf8_3byte!(out, c16)
            end
        else
            # 4 byte
            c32 = get_utf8_4byte(pnt += 3, ch)
            _isupper_u(c32) && (c32 = _lowercase_u(c32))
            out = output_utf8_4byte!(out, c32)
        end
        # Step past the last byte of the character just handled
        pnt += 1
    end
    # Trim any unused space if the result ended up shorter than the buffer
    out < outend && (buf = resize!(buf, out - pointer(buf)))
    return Str(UTF8CSE, buf)
end
# Build the uppercased copy of UTF-8 data after the caller has located the first
# character that needs to change.
#
#   beg - pointer to the first byte of the source data
#   off - number of leading bytes that are copied to the output unchanged
#   len - total number of source bytes
#
# Returns `Str(UTF8CSE, buf)` where `buf` holds the converted bytes.
# NOTE(review): the input is assumed to be valid UTF-8 (no bounds or validity checks
# are made on continuation bytes) - confirm against callers.
function _upper_utf8(beg, off, len)
    # Note, the final length may be larger or smaller
    buf, out = _allocate(UInt8, len)
    unsafe_copyto!(out, beg, off)
    fin = beg + len
    pnt = beg + off
    outend = out + len
    out += off
    # Invariant: the space left in the output (outend - out) is never smaller than
    # the number of input bytes still to be processed (fin - pnt).
    while pnt < fin
        ch = get_codeunit(pnt)
        if ch < 0x80
            # ASCII: subtracting 0x20 turns a lowercase letter into its uppercase form
            set_codeunit!(out, ch - (_islower_a(ch)<<5))
            out += 1
        elseif ch < 0xc4
            # 2-byte sequence with a lead byte below 0xc4: combine lead and
            # continuation bytes (presumably get_codeunit yields UInt8, so the shift
            # wraps and leaves a value in 0x80-0xff - TODO confirm)
            ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f)
            if !V6_COMPAT && ch == 0xdf
                # 0xdf is written out as the 3-byte character 0x1e9e.
                # Growing from 2 to 3 bytes: we need room for these 3 bytes plus
                # every input byte that follows (fin - pnt - 1 bytes remain).
                # Pointer differences are unsigned, so compare sizes directly
                # instead of testing a difference for being negative.
                need = (fin - pnt) + 2
                if outend - out < need
                    # Remember how much has been written, grow the buffer, and
                    # re-derive both pointers from the (possibly moved) buffer.
                    pos = out - pointer(buf)
                    buf = resize!(buf, pos + need)
                    out = pointer(buf) + pos
                    outend = pointer(buf) + sizeof(buf)
                end
                out = output_utf8_3byte!(out, 0x1e9e)
            else
                # 0xb5 and 0xff have uppercase forms above 0xff (0x39c and 0x178);
                # these still fit in 2 bytes, so no resize is needed
                out = output_utf8_2byte!(out, _can_upper_l(ch) ? (ch - 0x20)%UInt16
                                         : ch == 0xb5 ? 0x39c
                                         : ch == 0xff ? 0x178
                                         : ch%UInt16)
            end
        elseif ch < 0xe0
            # 2 byte
            c16 = get_utf8_2byte(pnt += 1, ch)
            if _islower_u(c16)
                c16 = _uppercase_u(c16)
                # Check if still 2 byte, could increase to 3 byte, or decrease to 1 byte
                if c16 < 0x80
                    set_codeunit!(out, c16%UInt8)
                    out += 1
                elseif c16 < 0x800
                    out = output_utf8_2byte!(out, c16)
                else
                    # Increasing from 2 to 3 bytes: same capacity check as above
                    need = (fin - pnt) + 2
                    if outend - out < need
                        pos = out - pointer(buf)
                        buf = resize!(buf, pos + need)
                        out = pointer(buf) + pos
                        outend = pointer(buf) + sizeof(buf)
                    end
                    out = output_utf8_3byte!(out, c16)
                end
            else
                out = output_utf8_2byte!(out, c16)
            end
        elseif ch < 0xf0
            # 3 byte
            c16 = get_utf8_3byte(pnt += 2, ch)
            if _islower_u(c16)
                c16 = _uppercase_u(c16)
                # Check if still 3 byte, uppercase form could drop to 2 byte
                if c16 < 0x800
                    out = output_utf8_2byte!(out, c16)
                else
                    out = output_utf8_3byte!(out, c16)
                end
            else
                out = output_utf8_3byte!(out, c16)
            end
        else
            # 4 byte
            c32 = get_utf8_4byte(pnt += 3, ch)
            _islower_u(c32) && (c32 = _uppercase_u(c32))
            out = output_utf8_4byte!(out, c32)
        end
        # Step past the last byte of the character just handled
        pnt += 1
    end
    # Trim any unused space if the result ended up shorter than the buffer
    out < outend && (buf = resize!(buf, out - pointer(buf)))
    return Str(UTF8CSE, buf)
end
# Return the lowercase form of a UTF-8 `Str`.
# The string is scanned one character at a time; when nothing needs to change the
# original `str` is returned as is. At the first character that tests as uppercase,
# the rest of the work is handed to `_lower_utf8`, together with the byte offset of
# that character so the unchanged prefix can be copied directly.
function lowercase(str::Str{UTF8CSE})
    @preserve str begin
        start = pointer(str)
        stop = start + sizeof(str)
        cur = start
        while cur < stop
            lead = get_codeunit(cur)
            # Keep the position of the lead byte; `cur` is moved to the last byte
            # of the character while decoding
            charpos = cur
            if lead < 0x80
                # 1 byte (ASCII)
                found = _isupper_a(lead)
            elseif lead < 0xc4
                # 2 byte, lead byte below 0xc4: combine lead and continuation bytes
                cur += 1
                found = _isupper_l((lead << 6) | (get_codeunit(cur) & 0x3f))
            elseif lead < 0xe0
                # 2 byte
                cur += 1
                found = _isupper_u(get_utf8_2byte(cur, lead) % UInt32)
            elseif lead < 0xf0
                # 3 byte
                cur += 2
                found = _isupper_u(get_utf8_3byte(cur, lead) % UInt32)
            else
                # 4 byte
                cur += 3
                found = _isupper_u(get_utf8_4byte(cur, lead))
            end
            found && return _lower_utf8(start, charpos - start, ncodeunits(str))
            cur += 1
        end
        return str
    end
end
# Check if can be uppercased
# Decide whether a 2-byte character with lead byte `ch` (expected to be 0xc2 or 0xc3)
# needs to be changed when uppercasing; `pnt` points at its continuation byte.
# Lead bytes below 0xc2 are not tested for: the string is validated UTF-8.
@inline function _check_uppercase(ch, pnt)
    trail = get_codeunit(pnt)
    if ch == 0xc3
        # Continuation bytes above the threshold qualify, except 0xb7.
        # The threshold admits 0x9f (character 0xdf) only when V6_COMPAT is false.
        threshold = V6_COMPAT ? 0x9f : 0x9e
        return (trail > threshold) & (trail != 0xb7)
    end
    # For the other lead byte only continuation byte 0xb5 qualifies
    return trail == 0xb5
end
# Return the uppercase form of a UTF-8 `Str`.
# The string is scanned one character at a time; when nothing needs to change the
# original `str` is returned as is. At the first character that needs uppercasing,
# the rest of the work is handed to `_upper_utf8`, together with the byte offset of
# that character so the unchanged prefix can be copied directly.
function uppercase(str::Str{UTF8CSE})
    @preserve str begin
        start = pointer(str)
        stop = start + sizeof(str)
        cur = start
        while cur < stop
            lead = get_codeunit(cur)
            # Keep the position of the lead byte; `cur` is moved to the last byte
            # of the character while decoding
            charpos = cur
            if lead < 0x80
                # 1 byte (ASCII)
                found = _islower_a(lead)
            elseif lead <= 0xc3
                # 2 byte, lead byte up to 0xc3: decided from the continuation byte
                cur += 1
                found = _check_uppercase(lead, cur)
            elseif lead < 0xe0
                # 2 byte
                cur += 1
                found = _islower_u(get_utf8_2byte(cur, lead) % UInt32)
            elseif lead < 0xf0
                # 3 byte
                cur += 2
                found = _islower_u(get_utf8_3byte(cur, lead) % UInt32)
            else
                # 4 byte
                cur += 3
                found = _islower_u(get_utf8_4byte(cur, lead))
            end
            found && return _upper_utf8(start, charpos - start, ncodeunits(str))
            cur += 1
        end
        return str
    end
end