@@ -208,12 +208,19 @@ end
208
208
209
209
using Base. Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
210
210
211
- function _decompose_char! (codepoint:: Union{Integer,Char} , dest:: Vector{UInt32} , options:: Integer )
212
- ret = @ ccall utf8proc_decompose_char (codepoint:: UInt32 , dest:: Ptr{UInt32} , length (dest):: Int , options:: Cint , C_NULL :: Ptr{Cint} ):: Int
211
# Decompose `codepoint` with utf8proc, writing the resulting codepoints into `dest`
# starting at index `1 + offset` (the first `offset` entries of `dest` are not passed
# to the C routine). `options` is the utf8proc option bitmask.
#
# Returns the value reported by `utf8proc_decompose_char`; callers compare it (plus
# `offset`) against `length(dest)` to detect that the buffer must grow before retrying.
# A negative return code is turned into an exception via `utf8proc_error`.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    # `dest` must stay rooted while C holds a raw pointer into its storage.
    nwritten = GC.@preserve dest begin
        out = pointer(dest, 1 + offset)   # first free slot after the kept prefix
        room = length(dest) - offset      # remaining capacity handed to utf8proc
        @ccall utf8proc_decompose_char(codepoint::UInt32, out::Ptr{UInt32}, room::Int, options::Cint, C_NULL::Ptr{Cint})::Int
    end
    if nwritten < 0
        utf8proc_error(nwritten)
    end
    return nwritten
end
216
216
217
+ # would be good to have higher-level accessor functions in utf8proc. alternatively,
218
+ # we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying
219
+ # because of the bitfields.
220
# Canonical combining class of a codepoint, returned as a `UInt16` (zero means the
# character is a starter / non-combining).
#
# The value is read as the second 16-bit field of the property record returned by
# `utf8proc_get_property`. Codepoints outside `0x0300:0x10ffff` short-circuit to zero
# without calling into C: U+0300 (COMBINING GRAVE ACCENT, class 230) is the first
# codepoint with a nonzero combining class, so the lower bound must include it --
# otherwise U+0300 is treated as a starter and is never canonically reordered.
combining_class(uc::Integer) =
    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed characters have no codepoint to look up; report them as non-combining.
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
223
+
217
224
"""
218
225
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
219
226
@@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
225
232
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
226
233
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
227
234
235
+ !!! compat "Julia 1.8"
236
+ The `isequal_normalized` function was added in Julia 1.8.
237
+
228
238
# Examples
229
239
230
240
For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
251
261
true
252
262
```
253
263
"""
254
- function isequal_normalized (s1:: AbstractString , s2:: AbstractString ; casefold:: Bool = false , stripmark:: Bool = false , chartransform= identity)
255
- function decompose_next_char! (c, state, d, options, s)
256
- n = _decompose_char! (c, d, options)
257
- if n > length (d) # may be possible in future Unicode versions?
258
- n = _decompose_char! (c, resize! (d, n), options)
264
# Public entry point: allocate two small scratch codepoint buffers (grown on demand
# by the worker) and forward to `_isequal_normalized!`, passing `chartransform`
# positionally and the boolean options as keywords.
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
    buf1 = Vector{UInt32}(undef, 4)
    buf2 = Vector{UInt32}(undef, 4)
    return _isequal_normalized!(s1, s2, buf1, buf2, chartransform;
                                casefold=casefold, stripmark=stripmark)
end
266
+
267
+ # like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
268
+ function _isequal_normalized! (s1:: AbstractString , s2:: AbstractString ,
269
+ d1:: Vector{UInt32} , d2:: Vector{UInt32} , chartransform:: F = identity;
270
+ casefold:: Bool = false , stripmark:: Bool = false ) where {F}
271
+ function decompose_next_chars! (state, d, options, s)
272
+ local n
273
+ offset = 0
274
+ @inbounds while true
275
+ # read a char and decompose it to d
276
+ c = chartransform (UInt32 (state[1 ]))
277
+ state = iterate (s, state[2 ])
278
+ if c < 0x80 # fast path for common ASCII case
279
+ n = 1 + offset
280
+ n > length (d) && resize! (d, 2 n)
281
+ d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+ 0x20 : c) : c
282
+ break # ASCII characters are all zero combining class
283
+ else
284
+ while true
285
+ n = _decompose_char! (c, d, offset, options) + offset
286
+ if n > length (d)
287
+ resize! (d, 2 n)
288
+ continue
289
+ end
290
+ break
291
+ end
292
+ end
293
+
294
+ # decomposed chars must be sorted in ascending order of combining class,
295
+ # which means we need to keep fetching chars until we get to non-combining
296
+ (iszero (combining_class (d[n])) || isnothing (state)) && break # non-combining
297
+ offset = n
259
298
end
260
- return 1 , n, iterate (s, state)
299
+
300
+ # sort by combining class
301
+ if n < 32 # almost always true
302
+ for j1 = 2 : n # insertion sort
303
+ cc = combining_class (d[j1])
304
+ iszero (cc) && continue # don't re-order non-combiners
305
+ for j2 = j1: - 1 : 2
306
+ combining_class (d[j2- 1 ]) ≤ cc && break
307
+ d[j2- 1 ], d[j2] = d[j2], d[j2- 1 ]
308
+ end
309
+ end
310
+ else # avoid n^2 complexity in crazy large-n case
311
+ j = 1
312
+ @views while j < n
313
+ j₀ = j + something (findnext (iszero ∘ combining_class, d[j+ 1 : n], 1 ), n+ 1 - j)
314
+ sort! (d[j: j₀- 1 ], by= combining_class)
315
+ j = j₀
316
+ end
317
+ end
318
+
319
+ # split return statement to help type inference:
320
+ return state === nothing ? (1 , n, nothing ) : (1 , n, state)
261
321
end
262
322
options = UTF8PROC_DECOMPOSE
263
323
casefold && (options |= UTF8PROC_CASEFOLD)
264
324
stripmark && (options |= UTF8PROC_STRIPMARK)
265
325
i1,i2 = iterate (s1),iterate (s2)
266
- d1,d2 = Vector {UInt32} (undef, 4 ), Vector {UInt32} (undef, 4 ) # codepoint buffers
267
326
n1 = n2 = 0 # lengths of codepoint buffers
268
327
j1 = j2 = 1 # indices in d1, d2
269
328
while true
270
329
if j1 > n1
271
330
i1 === nothing && return i2 === nothing && j2 > n2
272
- j1, n1, i1 = decompose_next_char! ( chartransform ( UInt32 (i1[ 1 ])), i1[ 2 ] , d1, options, s1)
331
+ j1, n1, i1 = decompose_next_chars! (i1 , d1, options, s1)
273
332
end
274
333
if j2 > n2
275
334
i2 === nothing && return false
276
- j2, n2, i2 = decompose_next_char! ( chartransform ( UInt32 (i2[ 1 ])), i2[ 2 ] , d2, options, s2)
335
+ j2, n2, i2 = decompose_next_chars! (i2 , d2, options, s2)
277
336
end
278
337
d1[j1] == d2[j2] || return false
279
338
j1 += 1 ; j2 += 1
0 commit comments