diff --git a/Makefile b/Makefile
index 4a09217..c03d9c0 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,9 @@ low_level_SOURCES := test/low_level.cpp
 TARGETS += high_level
 high_level_SOURCES := test/high_level.cpp
 
+TARGETS += unicode
+unicode_SOURCES := test/unicode.cpp
+
 TARGETS += libhello.$(dylib)
 libhello.$(dylib)_SOURCES = examples/hello.cpp
 CXXFLAGS__examples/hello.cpp = -Wno-shadow
@@ -44,9 +47,10 @@ libpeer.$(dylib)_LDFLAGS = $(LDFLAGS_shared)
 all: $(TARGETS)
 
 .PHONY: test
-test: low_level high_level
+test: low_level high_level unicode
 	$(BUILD)/low_level
 	$(BUILD)/high_level
+	$(BUILD)/unicode
 
 .PHONY: examples
 examples: libhello.$(dylib) examples/Hello.class libpeer.$(dylib) examples/NativePeer.class
diff --git a/include/jni/unicode.hpp b/include/jni/unicode.hpp
new file mode 100644
index 0000000..ba1e9dc
--- /dev/null
+++ b/include/jni/unicode.hpp
@@ -0,0 +1,275 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <experimental/string_view>
+
+namespace jni
+{
+    /// Converts UTF-8 text to UTF-16.
+    ///
+    /// Ill-formed input (overlong encodings, encoded surrogates, code points above
+    /// U+10FFFF, stray continuation bytes, truncated sequences) is replaced by U+FFFD.
+    /// A byte that breaks a partially decoded sequence is decoded again as the start
+    /// of a new sequence.
+    ///
+    /// @param utf8 Bytes to decode; they need not be well-formed.
+    /// @return The decoded text as UTF-16 code units.
+    inline std::u16string MakeUTF16(std::experimental::string_view utf8)
+    {
+        std::u16string result;
+        result.reserve(utf8.size());
+
+        // State-machine implementation based on "Flexible and Economical UTF-8 Decoder",
+        // by Bjoern Hoehrmann, http://bjoern.hoehrmann.de/utf-8/decoder/dfa/.
+
+        // If the leading byte is:
+        //
+        //   00..7f, then we're done
+        //
+        //   c2..df, leading byte for two-byte sequence
+        //     Second byte must be 80..bf
+        //
+        //   e1..ec, leading byte for three-byte sequence
+        //   ee..ef, leading byte for three-byte sequence
+        //     Second and third byte must be 80..bf
+        //
+        //   f1..f3, leading byte for four-byte sequence
+        //     Second, third, and fourth byte must be 80..bf
+        //
+        //   e0, leading byte for three-byte, possibly-overlong sequence
+        //     Second byte must be a0..bf
+        //     Third byte must be 80..bf
+        //
+        //   ed, leading byte for three-byte sequence with potential invalid code points
+        //     Second byte must be 80..9f
+        //     Third byte must be 80..bf
+        //
+        //   f0, leading byte for four-byte, possibly-overlong sequence
+        //     Second byte must be 90..bf
+        //     Third and fourth byte must be 80..bf
+        //
+        //   f4, leading byte for four-byte sequence with potential invalid code points
+        //     Second byte must be 80..8f
+        //     Third and fourth byte must be 80..bf
+        //
+        // All other leading bytes are invalid. 80..bf are continuation bytes and so cannot
+        // lead a sequence. c0 and c1 could only start overlong sequences. f5..ff are not
+        // used in UTF-8.
+
+        // Character class of every possible byte; the column index into `transitions`.
+        static constexpr uint8_t types[256] =
+        {
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 00..0f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 10..1f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 20..2f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 30..3f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 40..4f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 50..5f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 60..6f
+             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, // 70..7f
+
+            // Continuation bytes
+             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // 80..8f
+             9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, // 90..9f
+             7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, // a0..af
+             7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, // b0..bf
+
+            // Leading bytes for two-byte sequences
+             8,  8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, // c0..cf
+             2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, // d0..df
+
+            // Leading bytes for three- and four-byte sequences
+            10,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  3,  3, // e0..ef
+            11,  6,  6,  6,  5,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8  // f0..ff
+        };
+
+        // A constant for transitions to the invalid state, to help valid transitions
+        // stand out in the transition table below.
+        static constexpr uint8_t _ = 1;
+
+        static constexpr uint8_t transitions[9 * 16] =
+        {
+            0, _, 2, 3, 5, 8, 7, _, _, _, 4, 6, _, _, _, _, // state 0 - starting state
+            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // state 1 - invalid state
+            _, 0, _, _, _, _, _, 0, _, 0, _, _, _, _, _, _, // state 2 - expecting one continuation byte (types 1, 7, or 9)
+            _, 2, _, _, _, _, _, 2, _, 2, _, _, _, _, _, _, // state 3 - expecting two continuation bytes
+            _, _, _, _, _, _, _, 2, _, _, _, _, _, _, _, _, // state 4 - expecting a0..bf (type 7), then one continuation byte
+            _, 2, _, _, _, _, _, _, _, 2, _, _, _, _, _, _, // state 5 - expecting 80..9f (type 1 or 9), then one continuation byte
+            _, _, _, _, _, _, _, 3, _, 3, _, _, _, _, _, _, // state 6 - expecting 90..bf (type 9 or 7), then two continuation bytes
+            _, 3, _, _, _, _, _, 3, _, 3, _, _, _, _, _, _, // state 7 - expecting three continuation bytes
+            _, 3, _, _, _, _, _, _, _, _, _, _, _, _, _, _  // state 8 - expecting 80..8f (type 1), then two continuation bytes
+        };
+
+        uint8_t state = 0;
+        uint8_t prev = 0;   // state before the current byte was consumed
+        uint32_t pt = 0;    // code point accumulated so far
+
+        for (auto it = utf8.begin(); it != utf8.end(); prev = state, ++it)
+        {
+            const auto byte = static_cast<uint8_t>(*it);
+            const uint8_t type = types[byte];
+
+            if (state == 0)
+            {
+                // Leading byte: shifting the mask by the class keeps only payload bits
+                // (classes 10 and 11 keep none, since e0 and f0 carry no payload).
+                pt = (0xFF >> type) & byte;
+            }
+            else
+            {
+                pt = (pt << 6) | (byte & 0b111111);
+            }
+
+            state = transitions[state * 16 + type];
+
+            if (state == 0)
+            {
+                if (pt > 0xFFFF)
+                {
+                    // Above the BMP: emit a surrogate pair.
+                    result += static_cast<char16_t>(0xD800 + ((pt - 0x10000) >> 10));
+                    result += static_cast<char16_t>(0xDC00 + (pt & 0b1111111111));
+                }
+                else
+                {
+                    result += static_cast<char16_t>(pt);
+                }
+            }
+            else if (state == 1)
+            {
+                result += u'\uFFFD';
+                state = 0;
+                if (prev != 0)
+                {
+                    // The byte ended a partial sequence; step back so the loop's ++it
+                    // decodes it again as a leading byte.
+                    --it;
+                }
+            }
+        }
+
+        // Input ended in the middle of a sequence.
+        if (state != 0 && state != 1)
+        {
+            result += u'\uFFFD';
+        }
+
+        return result;
+    }
+
+    /// Converts UTF-16 text to UTF-8.
+    ///
+    /// Each unpaired surrogate is replaced by U+FFFD (bytes EF BF BD). A code unit
+    /// that breaks a pending surrogate pair is decoded again on its own.
+    ///
+    /// @param utf16 Code units to decode; they need not be well-formed.
+    /// @return The text encoded as UTF-8.
+    inline std::string MakeUTF8(std::experimental::u16string_view utf16)
+    {
+        std::string result;
+        result.reserve(utf16.size() * 3 / 2);
+
+        // Class of a code unit, looked up by its high byte:
+        // 0 = not a surrogate, 1 = high surrogate (d8..db), 2 = low surrogate (dc..df).
+        static constexpr uint8_t types[256] =
+        {
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9f
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a0..af
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // b0..bf
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // c0..cf
+            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, // d0..df
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // e0..ef
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // f0..ff
+        };
+
+        // A constant for transitions to the invalid state, to help valid transitions
+        // stand out in the transition table below.
+        static constexpr uint8_t _ = 1;
+
+        static constexpr uint8_t transitions[3 * 4] =
+        {
+            0, 2, _, _, // state 0 - starting state
+            _, _, _, _, // state 1 - invalid state
+            _, _, 0, _  // state 2 - expecting low surrogate
+        };
+
+        uint8_t state = 0;
+        uint8_t prev = 0;   // state before the current code unit was consumed
+        uint32_t pt = 0;    // code point accumulated so far
+
+        for (auto it = utf16.begin(); it != utf16.end(); prev = state, ++it)
+        {
+            const char16_t c = *it;
+            const uint8_t type = types[static_cast<uint8_t>(c >> 8)];
+
+            if (state == 0)
+            {
+                // Keep all 16 bits of an ordinary code unit, but only the 10 payload
+                // bits of a surrogate.
+                pt = (0xFFFF >> (!!type * 6)) & c;
+            }
+            else
+            {
+                // A surrogate pair carries (code point - 0x10000) in its 20 payload bits,
+                // so the offset has to be added back after joining the two halves.
+                pt = 0x10000 + ((pt << 10) | (c & 0b1111111111));
+            }
+
+            state = transitions[state * 4 + type];
+
+            if (state == 0)
+            {
+                if (pt < 0x80)
+                {
+                    result += static_cast<char>(pt);
+                }
+                else if (pt < 0x800)
+                {
+                    result += static_cast<char>(0b11000000 | (pt >> 6));
+                    result += static_cast<char>(0b10000000 | ((pt >> 0) & 0b111111));
+                }
+                else if (pt < 0x10000)
+                {
+                    result += static_cast<char>(0b11100000 | (pt >> 12));
+                    result += static_cast<char>(0b10000000 | ((pt >> 6) & 0b111111));
+                    result += static_cast<char>(0b10000000 | ((pt >> 0) & 0b111111));
+                }
+                else
+                {
+                    result += static_cast<char>(0b11110000 | (pt >> 18));
+                    result += static_cast<char>(0b10000000 | ((pt >> 12) & 0b111111));
+                    result += static_cast<char>(0b10000000 | ((pt >> 6) & 0b111111));
+                    result += static_cast<char>(0b10000000 | ((pt >> 0) & 0b111111));
+                }
+            }
+            else if (state == 1)
+            {
+                result += "\xEF\xBF\xBD"; // U+FFFD
+                state = 0;
+                if (prev != 0)
+                {
+                    // The code unit broke a pending pair; decode it again on its own.
+                    --it;
+                }
+            }
+        }
+
+        // Input ended on a high surrogate with no partner.
+        if (state != 0 && state != 1)
+        {
+            result += "\xEF\xBF\xBD"; // U+FFFD
+        }
+
+        return result;
+    }
+}
diff --git a/test/unicode.cpp b/test/unicode.cpp
new file mode 100644
index 0000000..b32136f
--- /dev/null
+++ b/test/unicode.cpp
@@ -0,0 +1,33 @@
+#include <jni/unicode.hpp>
+
+#include <cassert>
+
+using namespace std::literals::string_literals;
+
+int main()
+{
+    assert(jni::MakeUTF16(u8"") == u"");
+
+    for (char16_t c = 0; c <= 0x7F; ++c)
+    {
+        char c8 = static_cast<char>(c);
+        assert(jni::MakeUTF16(std::experimental::string_view(&c8, 1)) == std::u16string(&c, 1));
+    }
+
+    for (char16_t c = 0x80; c <= 0xC2; ++c)
+    {
+        char c8 = static_cast<char>(c);
+        assert(jni::MakeUTF16(std::experimental::string_view(&c8, 1)) == u"\xFFFD");
+    }
+
+    assert(jni::MakeUTF16(u8"abcd") == u"abcd");
+    assert(jni::MakeUTF16(u8"Hello world, Καλημέρα κόσμε, コンニチハ") == u"Hello world, Καλημέρα κόσμε, コンニチハ");
+
+    assert(jni::MakeUTF16(u8"\xED\xA0") == u"\xFFFD\xFFFD");
+    assert(jni::MakeUTF16(u8"\xED\xA0\x80") == u"\xFFFD\xFFFD\xFFFD");
+
+    assert(jni::MakeUTF8(u"") == "");
+    assert(jni::MakeUTF8(u"abcd") == "abcd");
+    assert(jni::MakeUTF8(u"\xD83D\xDE00") == "\xF0\x9F\x98\x80"); // U+1F600 via a surrogate pair
+    assert(jni::MakeUTF8(u"\xD83D") == "\xEF\xBF\xBD");           // unpaired high surrogate
+}