diff --git a/Project.toml b/Project.toml index 3cac18e..fda49aa 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,7 @@ authors = ["ScottPJones "] keywords = ["Strings"] license = "MIT" uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a" -version = "0.1.10" +version = "0.1.11" [deps] Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" diff --git a/REQUIRE b/REQUIRE index 0150ff3..2a93f0e 100644 --- a/REQUIRE +++ b/REQUIRE @@ -3,4 +3,4 @@ MurmurHash3 0.1.5 ModuleInterfaceTools 0.1.6 StrAPI 0.1.8 CharSetEncodings 0.1.8 -ChrBase 0.1.6 +ChrBase 0.1.7 diff --git a/src/StrBase.jl b/src/StrBase.jl index 15bdcd0..b3ad859 100644 --- a/src/StrBase.jl +++ b/src/StrBase.jl @@ -32,7 +32,7 @@ using ModuleInterfaceTools _memcmp, _memcpy, _memset, _fwd_memchr, _rev_memchr, empty_string, _calcpnt, _mask_bytes, _allocate, MS_UTF8, MS_UTF16, MS_UTF32, MS_SubUTF32, MS_Latin, MS_ByteStr, MS_RawUTF8, - _wrap_substr, _empty_sub, + _wrap_substr, _empty_sub, AccessType, UInt16_U, UInt32_U, UInt16_S, UInt32_S, UInt16_US, UInt32_US, alignedtype, swappedtype, checkkeep, splitarr, __split, __rsplit, __replace diff --git a/src/casefold.jl b/src/casefold.jl index aefbe95..40c5ee2 100644 --- a/src/casefold.jl +++ b/src/casefold.jl @@ -359,7 +359,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St pnt = beg = pointer(str) fin = beg + sizeof(str) while pnt < fin - _can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str)) + _wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str)) pnt += sizeof(CU) end str diff --git a/src/utf16case.jl b/src/utf16case.jl index 5e09278..fa76844 100644 --- a/src/utf16case.jl +++ b/src/utf16case.jl @@ -97,7 +97,7 @@ function uppercase(str::UTF16Str) prv = pnt (ch > 0xd7ff # May be surrogate pair ? _islower_u(ch > 0xdfff ? ch%UInt32 : get_supplementary(ch, get_codeunit(pnt += 2))) - : _can_upper_ch(ch)) && + : _wide_lower_ch(ch)) && return _upper(UTF16Str, beg, prv-beg, ncodeunits(str)) pnt += 2 end diff --git a/src/utf8case.jl b/src/utf8case.jl index 1d532d9..b191f51 100644 --- a/src/utf8case.jl +++ b/src/utf8case.jl @@ -91,18 +91,22 @@ function _upper_utf8(beg, off, len) out += 1 elseif ch < 0xc4 ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f) - if _can_upper_l(ch) - c16 = (ch - 0x20)%UInt16 - elseif ch == 0xb5 - c16 = 0x39c - elseif ch == 0xff - c16 = 0x178 - elseif !V6_COMPAT && ch == 0xdf - c16 = 0x1e9e + if !V6_COMPAT && ch == 0xdf + # Increasing from 2 to 3 bytes, check to see if we need to resize + diff = (outend - out - 3) - (fin - pnt - 1) + if diff < 0 + outend -= diff + resize!(buf, outend - out) + out = pointer(buf) + outend = out + sizeof(buf) + end + out = output_utf8_3byte!(out, 0x1e9e) else - c16 = ch%UInt16 + out = output_utf8_2byte!(out, _can_upper_l(ch) ? (ch - 0x20)%UInt16 + : ch == 0xb5 ? 0x39c + : ch == 0xff ? 0x178 + : ch%UInt16) end - out = output_utf8_2byte!(out, c16) elseif ch < 0xe0 # 2 byte c16 = get_utf8_2byte(pnt += 1, ch) diff --git a/test/basic.jl b/test/basic.jl index c1bda75..f9043f8 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -34,6 +34,28 @@ end end end +@testset "casefold string" begin + for ST in (ASCIIStr, LatinStr, UCS2Str, UTF32Str, UTF8Str, UTF16Str) + C = eltype(ST) + tm = typemax(C) + for c = 0:Int(tm) + # Skip surrogates + 0xd800 <= c < 0xe000 && continue + ch = C(c) + # Check to make sure this character would still fit uppercased + cu = uppercase(ch) + cu > tm && continue + for str in ("$ch test Beg", "test End $ch", "test $ch Mid", "$ch") + cvtstr = convert(ST, str) + @test uppercase(str) == uppercase(cvtstr) + @test lowercase(str) == lowercase(cvtstr) + @test titlecase(str) == titlecase(cvtstr) + @test uppercase_first(str) == uppercase_first(cvtstr) + end + end + end +end + @testset "{starts,ends}_with" begin for (ST, type_list) in compat_types, CT in type_list, str in test_strings_base[CT] cvtstr = convert(ST, str)