Making sure the delimiter doesn't break on non-breaking space.

agutkin · copybara-github · commit cf78fbba5c48 · 2021-06-30T09:03:57.000-07:00
PiperOrigin-RevId: 382316110
diff --git a/nisaba/port/BUILD.bazel b/nisaba/port/BUILD.bazel
@@ -23,11 +23,19 @@ cc_library(
     hdrs = ["integral_types.h"],
 )
 
+# Sets of Unicode whitespace characters (breaking and non-breaking).
+cc_library(
+    name = "unicode_properties",
+    srcs = ["unicode_properties.cc"],
+    hdrs = ["unicode_properties.h"],
+    deps = ["@com_google_absl//absl/container:flat_hash_set"],
+)
+
 cc_library(
    name = "utf8_delimiters",
    srcs = ["utf8_delimiters.cc"],
    hdrs = ["utf8_delimiters.h"],
    deps = [
+       ":unicode_properties",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/strings",
        "@com_github_utfcpp//:utfcpp",
@@ -61,6 +69,7 @@ cc_test(
     name = "utf8_util_test",
     srcs = ["utf8_util_test.cc"],
     deps = [
+        ":unicode_properties",
         ":utf8_util",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
diff --git a/nisaba/port/unicode_properties.cc b/nisaba/port/unicode_properties.cc
@@ -0,0 +1,80 @@
+// Copyright 2021 Nisaba Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nisaba/port/unicode_properties.h"
+
+#include <algorithm>
+#include <iterator>
+
+namespace nisaba {
+namespace utf8 {
+namespace {
+
+// Whitespace code points reported as "breaking". GetBreakingWhitespaceChars()
+// returns a copy of this set.
+// NOTE(review): U+00A0 and U+202F are named "no-break" but are listed here;
+// confirm that treating them as breaking is intended.
+const absl::flat_hash_set<char32_t> kBreakingWhitespace = {
+  U'\u0009',  // character tabulation
+  U'\u000A',  // line feed
+  U'\u000B',  // line tabulation
+  U'\u000C',  // form feed
+  U'\u000D',  // carriage return
+  U'\u0020',  // space
+  U'\u0085',  // next line
+  U'\u00A0',  // no-break space
+  U'\u1680',  // ogham space mark
+  U'\u2000',  // en quad
+  U'\u2001',  // em quad
+  U'\u2002',  // en space
+  U'\u2003',  // em space
+  U'\u2004',  // three-per-em space
+  U'\u2005',  // four-per-em space
+  U'\u2006',  // six-per-em space
+  U'\u2007',  // figure space
+  U'\u2008',  // punctuation space
+  U'\u2009',  // thin space
+  U'\u200A',  // hair space
+  U'\u2028',  // line separator
+  U'\u2029',  // paragraph separator
+  U'\u202F',  // narrow no-break space
+  U'\u205F',  // medium mathematical space
+  U'\u3000',  // ideographic space
+};
+
+// Code points reported as "non-breaking" whitespace.
+// GetNonBreakingWhitespaceChars() returns a copy of this set.
+const absl::flat_hash_set<char32_t> kNonBreakingWhitespace = {
+  U'\u180E',  // mongolian vowel separator
+  U'\u200B',  // zero width space
+  U'\u200C',  // zero width non-joiner
+  U'\u200D',  // zero width joiner
+  U'\u2060',  // word joiner
+  U'\uFEFF',  // zero width non-breaking space
+};
+
+}  // namespace
+
+// Returns the breaking whitespace set by value, so the caller gets a copy.
+absl::flat_hash_set<char32_t> GetBreakingWhitespaceChars() {
+  return kBreakingWhitespace;
+}
+
+// Returns the non-breaking whitespace set by value, so the caller gets a copy.
+absl::flat_hash_set<char32_t> GetNonBreakingWhitespaceChars() {
+  return kNonBreakingWhitespace;
+}
+
+// Returns the union of the breaking and non-breaking whitespace sets.
+absl::flat_hash_set<char32_t> GetAllWhitespaceChars() {
+  // Hash sets iterate in unspecified order, so std::set_union (which requires
+  // sorted input ranges) must not be used here. Copy one set and insert the
+  // other; duplicates, if any, are dropped by the set itself.
+  absl::flat_hash_set<char32_t> all_chars = kBreakingWhitespace;
+  all_chars.insert(kNonBreakingWhitespace.begin(),
+                   kNonBreakingWhitespace.end());
+  return all_chars;
+}
+
+}  // namespace utf8
+}  // namespace nisaba
diff --git a/nisaba/port/unicode_properties.h b/nisaba/port/unicode_properties.h
@@ -0,0 +1,42 @@
+// Copyright 2021 Nisaba Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Various Unicode properties returned as sets of Unicode characters.
+//
+// This is an extremely simplified implementation that provides the sets of
+// characters that we presently need.
+
+#ifndef NISABA_PORT_UNICODE_PROPERTIES_H_
+#define NISABA_PORT_UNICODE_PROPERTIES_H_
+
+#include <cstdint>
+
+#include "absl/container/flat_hash_set.h"
+
+namespace nisaba {
+namespace utf8 {
+
+// Returns the set of breaking whitespace characters. The set is returned by
+// value, so every call produces a fresh copy.
+absl::flat_hash_set<char32_t> GetBreakingWhitespaceChars();
+
+// Returns the set of non-breaking whitespace characters. The set is returned
+// by value, so every call produces a fresh copy.
+absl::flat_hash_set<char32_t> GetNonBreakingWhitespaceChars();
+
+// Returns the set of all whitespace characters, i.e. the union of the
+// breaking and the non-breaking sets.
+absl::flat_hash_set<char32_t> GetAllWhitespaceChars();
+
+}  // namespace utf8
+}  // namespace nisaba
+
+#endif  // NISABA_PORT_UNICODE_PROPERTIES_H_
diff --git a/nisaba/port/utf8_delimiters.cc b/nisaba/port/utf8_delimiters.cc
@@ -14,49 +14,20 @@
 
 #include "nisaba/port/utf8_delimiters.h"
 
+#include "nisaba/port/unicode_properties.h"
 #include "utf8/checked.h"
 
 namespace nisaba {
 namespace utf8 {
 namespace {
 
-const absl::flat_hash_set<char32_t> kUnicodeWhitespace = {
-  U'\u0009',  // character tabulation
-  U'\u000A',  // line feed
-  U'\u000B',  // line tabulation
-  U'\u000C',  // form feed
-  U'\u000D',  // carriage return
-  U'\u0020',  // space
-  U'\u0085',  // next line
-  U'\u00A0',  // no-break space
-  U'\u1680',  // ogham space mark
-  U'\u180E',  // mongolian vowel separator
-  U'\u2000',  // en quad
-  U'\u2001',  // em quad
-  U'\u2002',  // en space
-  U'\u2003',  // em space
-  U'\u2004',  // three-per-em space
-  U'\u2005',  // four-per-em space
-  U'\u2006',  // six-per-em space
-  U'\u2007',  // figure space
-  U'\u2008',  // punctuation space
-  U'\u2009',  // thin space
-  U'\u200A',  // hair space
-  U'\u200B',  // zero width space
-  U'\u200C',  // zero width non-joiner
-  U'\u200D',  // zero width joiner
-  U'\u2028',  // line separator
-  U'\u2029',  // paragraph separator
-  U'\u202F',  // narrow no-break space
-  U'\u205F',  // medium mathematical space
-  U'\u2060',  // word joiner
-  U'\u3000',  // ideographic space
-};
+// Returns the set of breaking whitespace characters, built on first use.
+//
+// A namespace-scope constant initialized from GetBreakingWhitespaceChars()
+// would run during static initialization, where the order relative to the
+// globals defined in unicode_properties.cc is unspecified. The function-local
+// static defers that call until the first delimiter is constructed. The set
+// is intentionally never destroyed, so references to it stay valid for the
+// rest of the program.
+const absl::flat_hash_set<char32_t> &BreakingWhitespace() {
+  static const auto *const kChars =
+      new absl::flat_hash_set<char32_t>(GetBreakingWhitespaceChars());
+  return *kChars;
+}
 
 }  // namespace
 
+// Builds a delimiter over the breaking whitespace characters only.
 Utf8WhitespaceDelimiter::Utf8WhitespaceDelimiter() :
-    Utf8Delimiter(kUnicodeWhitespace) {}
+    Utf8Delimiter(BreakingWhitespace()) {}
 
 // TODO: We don't deal with malformed encodings yet.
 absl::string_view Utf8Delimiter::Find(absl::string_view text,
diff --git a/nisaba/port/utf8_delimiters_test.cc b/nisaba/port/utf8_delimiters_test.cc
@@ -39,10 +39,10 @@ TEST(Utf8DelimitersTest, WhitespaceBasicCheck) {
   EXPECT_EQ(" ", delim.Find("world ", 0));
 
   // Check Unicode whitespace:
-  //   - Mongolian vowel separator: U+180E => 0xE1 0xA0 0x8E.
-  //   - Ideographic space: U+3000 => 0xE3 0x80 0x80.
+  //   - [non-breaking] Mongolian vowel separator: U+180E => 0xE1 0xA0 0x8E.
+  //   - [breaking] Ideographic space: U+3000 => 0xE3 0x80 0x80.
   input_text = "hello\xE1\xA0\x8Eworld\xE3\x80\x80";
-  EXPECT_EQ("\xE1\xA0\x8E", delim.Find(input_text, 0));
+  EXPECT_EQ("\xE3\x80\x80", delim.Find(input_text, 0));
   EXPECT_EQ("\xE3\x80\x80", delim.Find(input_text, 8));
 }
 
diff --git a/nisaba/port/utf8_util_test.cc b/nisaba/port/utf8_util_test.cc
@@ -21,6 +21,7 @@
 #include "gtest/gtest.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
+#include "nisaba/port/unicode_properties.h"
 
 using ::testing::ElementsAre;
 
@@ -120,6 +121,31 @@ TEST(Utf8UtilTest, CheckPortableUtf8WhitespaceDelimiter) {
   EXPECT_EQ("ᢆ" + final_part, toks[1]);
 }
 
+// Verifies that the whitespace delimiter splits on every breaking whitespace
+// character and on none of the non-breaking ones.
+//
+// Code points in failure messages are streamed as unsigned integers: inserting
+// a raw char32_t into a narrow stream is a deleted overload as of C++20.
+TEST(Utf8UtilTest, BreakingVsNonBreakingWhitespaceSplit) {
+  // Check that we don't break on non-breaking whitespace characters.
+  const auto &non_break_chars = GetNonBreakingWhitespaceChars();
+  for (auto u32_char : non_break_chars) {
+    const std::string &no_delim = EncodeUnicodeChar(u32_char);
+    const std::string input_text = "a" + no_delim + "b";
+    const std::vector<absl::string_view> toks = absl::StrSplit(
+        input_text, Utf8WhitespaceDelimiter(), absl::SkipEmpty());
+    ASSERT_EQ(1, toks.size()) << "Expected non-breaking char: "
+                              << static_cast<unsigned long>(u32_char);
+    EXPECT_EQ(toks[0], input_text);
+  }
+
+  // Check that the splitter works on breaking whitespace.
+  const auto &breaking_chars = GetBreakingWhitespaceChars();
+  for (auto u32_char : breaking_chars) {
+    const std::string &delim = EncodeUnicodeChar(u32_char);
+    const std::string input_text = "a" + delim + "b";
+    const std::vector<absl::string_view> toks = absl::StrSplit(
+        input_text, Utf8WhitespaceDelimiter(), absl::SkipEmpty());
+    ASSERT_EQ(2, toks.size()) << "Expected breaking char: "
+                              << static_cast<unsigned long>(u32_char);
+    EXPECT_EQ(toks[0], "a");
+    EXPECT_EQ(toks[1], "b");
+  }
+}
+
 }  // namespace
 }  // namespace utf8
 }  // namespace nisaba