Skip to content

Commit cf78fbb

Browse files
agutkin authored and copybara-github committed
Making sure the delimiter doesn't break on non-breaking space.
PiperOrigin-RevId: 382316110
1 parent dbebbd5 commit cf78fbb

File tree

6 files changed

+164
-36
lines changed

6 files changed

+164
-36
lines changed

nisaba/port/BUILD.bazel

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,19 @@ cc_library(
2323
hdrs = ["integral_types.h"],
2424
)
2525

26+
# Library exposing sets of Unicode whitespace characters (see
# unicode_properties.h); used by ":utf8_delimiters" and by "utf8_util_test".
cc_library(
27+
name = "unicode_properties",
28+
srcs = ["unicode_properties.cc"],
29+
hdrs = ["unicode_properties.h"],
30+
deps = ["@com_google_absl//absl/container:flat_hash_set"],
31+
)
32+
2633
cc_library(
2734
name = "utf8_delimiters",
2835
srcs = ["utf8_delimiters.cc"],
2936
hdrs = ["utf8_delimiters.h"],
3037
deps = [
38+
":unicode_properties",
3139
"@com_google_absl//absl/container:flat_hash_set",
3240
"@com_google_absl//absl/strings",
3341
"@com_github_utfcpp//:utfcpp",
@@ -61,6 +69,7 @@ cc_test(
6169
name = "utf8_util_test",
6270
srcs = ["utf8_util_test.cc"],
6371
deps = [
72+
":unicode_properties",
6473
":utf8_util",
6574
"@com_google_absl//absl/strings",
6675
"@com_google_googletest//:gtest_main",

nisaba/port/unicode_properties.cc

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Copyright 2021 Nisaba Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "nisaba/port/unicode_properties.h"
16+
17+
#include <algorithm>
18+
#include <iterator>
19+
20+
namespace nisaba {
21+
namespace utf8 {
22+
namespace {
23+
24+
const absl::flat_hash_set<char32_t> kBreakingWhitespace = {
25+
U'\u0009', // character tabulation
26+
U'\u000A', // line feed
27+
U'\u000B', // line tabulation
28+
U'\u000C', // form feed
29+
U'\u000D', // carriage return
30+
U'\u0020', // space
31+
U'\u0085', // next line
32+
U'\u00A0', // no-break space
33+
U'\u1680', // ogham space mark
34+
U'\u2000', // en quad
35+
U'\u2001', // em quad
36+
U'\u2002', // en space
37+
U'\u2003', // em space
38+
U'\u2004', // three-per-em space
39+
U'\u2005', // four-per-em space
40+
U'\u2006', // six-per-em space
41+
U'\u2007', // figure space
42+
U'\u2008', // punctuation space
43+
U'\u2009', // thin space
44+
U'\u200A', // hair space
45+
U'\u2028', // line separator
46+
U'\u2029', // paragraph separator
47+
U'\u202F', // narrow no-break space
48+
U'\u205F', // medium mathematical space
49+
U'\u3000', // ideographic space
50+
};
51+
52+
const absl::flat_hash_set<char32_t> kNonBreakingWhitespace = {
53+
U'\u180E', // mongolian vowel separator
54+
U'\u200B', // zero width space
55+
U'\u200C', // zero width non-joiner
56+
U'\u200D', // zero width joiner
57+
U'\u2060', // word joiner
58+
U'\uFEFF', // zero width non-breaking space
59+
};
60+
61+
} // namespace
62+
63+
absl::flat_hash_set<char32_t> GetBreakingWhitespaceChars() {
64+
return kBreakingWhitespace;
65+
}
66+
67+
absl::flat_hash_set<char32_t> GetNonBreakingWhitespaceChars() {
68+
return kNonBreakingWhitespace;
69+
}
70+
71+
absl::flat_hash_set<char32_t> GetAllWhitespaceChars() {
72+
absl::flat_hash_set<char32_t> all_chars;
73+
std::set_union(kBreakingWhitespace.begin(), kBreakingWhitespace.end(),
74+
kNonBreakingWhitespace.begin(), kNonBreakingWhitespace.end(),
75+
std::inserter(all_chars, all_chars.begin()));
76+
return all_chars;
77+
}
78+
79+
} // namespace utf8
80+
} // namespace nisaba

nisaba/port/unicode_properties.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// Copyright 2021 Nisaba Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Various Unicode properties returned as sets of Unicode characters.
16+
//
17+
// This is an extremely simplified implementation that provides the sets of
18+
// characters that we presently need.
19+
20+
#ifndef NISABA_PORT_UNICODE_PROPERTIES_H_
21+
#define NISABA_PORT_UNICODE_PROPERTIES_H_
22+
23+
#include <cstdint>
24+
25+
#include "absl/container/flat_hash_set.h"
26+
27+
namespace nisaba {
28+
namespace utf8 {
29+
30+
// Returns a set of breaking whitespace characters, i.e. the characters that
// `Utf8WhitespaceDelimiter` treats as token separators. The set is returned by
// value, so every call hands the caller its own copy.
31+
absl::flat_hash_set<char32_t> GetBreakingWhitespaceChars();
32+
33+
// Returns a set of non-breaking whitespace characters: zero-width and joiner
// code points (e.g. U+200B zero width space, U+200D zero width joiner) on
// which `Utf8WhitespaceDelimiter` does not split. Returned by value.
34+
absl::flat_hash_set<char32_t> GetNonBreakingWhitespaceChars();
35+
36+
// Returns a set of all whitespace characters: the union of the breaking and
// the non-breaking sets above. Returned by value.
37+
absl::flat_hash_set<char32_t> GetAllWhitespaceChars();
38+
39+
} // namespace utf8
40+
} // namespace nisaba
41+
42+
#endif // NISABA_PORT_UNICODE_PROPERTIES_H_

nisaba/port/utf8_delimiters.cc

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,49 +14,20 @@
1414

1515
#include "nisaba/port/utf8_delimiters.h"
1616

17+
#include "nisaba/port/unicode_properties.h"
1718
#include "utf8/checked.h"
1819

1920
namespace nisaba {
2021
namespace utf8 {
2122
namespace {
2223

23-
const absl::flat_hash_set<char32_t> kUnicodeWhitespace = {
24-
U'\u0009', // character tabulation
25-
U'\u000A', // line feed
26-
U'\u000B', // line tabulation
27-
U'\u000C', // form feed
28-
U'\u000D', // carriage return
29-
U'\u0020', // space
30-
U'\u0085', // next line
31-
U'\u00A0', // no-break space
32-
U'\u1680', // ogham space mark
33-
U'\u180E', // mongolian vowel separator
34-
U'\u2000', // en quad
35-
U'\u2001', // em quad
36-
U'\u2002', // en space
37-
U'\u2003', // em space
38-
U'\u2004', // three-per-em space
39-
U'\u2005', // four-per-em space
40-
U'\u2006', // six-per-em space
41-
U'\u2007', // figure space
42-
U'\u2008', // punctuation space
43-
U'\u2009', // thin space
44-
U'\u200A', // hair space
45-
U'\u200B', // zero width space
46-
U'\u200C', // zero width non-joiner
47-
U'\u200D', // zero width joiner
48-
U'\u2028', // line separator
49-
U'\u2029', // paragraph separator
50-
U'\u202F', // narrow no-break space
51-
U'\u205F', // medium mathematical space
52-
U'\u2060', // word joiner
53-
U'\u3000', // ideographic space
54-
};
24+
// Characters on which `Utf8WhitespaceDelimiter` splits; a private copy of the
// set returned by GetBreakingWhitespaceChars().
//
// NOTE(review): this namespace-scope constant is initialized dynamically at
// load time by calling into unicode_properties.cc. That is only safe if
// GetBreakingWhitespaceChars() does not itself read a namespace-scope object
// that may not have been constructed yet -- TODO confirm.
const absl::flat_hash_set<char32_t> kBreakingWhitespace =
25+
GetBreakingWhitespaceChars();
5526

5627
} // namespace
5728

5829
Utf8WhitespaceDelimiter::Utf8WhitespaceDelimiter() :
59-
Utf8Delimiter(kUnicodeWhitespace) {}
30+
Utf8Delimiter(kBreakingWhitespace) {}
6031

6132
// TODO: We don't deal with malformed encodings yet.
6233
absl::string_view Utf8Delimiter::Find(absl::string_view text,

nisaba/port/utf8_delimiters_test.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@ TEST(Utf8DelimitersTest, WhitespaceBasicCheck) {
3939
EXPECT_EQ(" ", delim.Find("world ", 0));
4040

4141
// Check Unicode whitespace:
42-
// - Mongolian vowel separator: U+180E => 0xE1 0xA0 0x8E.
43-
// - Ideographic space: U+3000 => 0xE3 0x80 0x80.
42+
// - [non-breaking] Mongolian vowel separator: U+180E => 0xE1 0xA0 0x8E.
43+
// - [breaking] Ideographic space: U+3000 => 0xE3 0x80 0x80.
4444
input_text = "hello\xE1\xA0\x8Eworld\xE3\x80\x80";
45-
EXPECT_EQ("\xE1\xA0\x8E", delim.Find(input_text, 0));
45+
EXPECT_EQ("\xE3\x80\x80", delim.Find(input_text, 0));
4646
EXPECT_EQ("\xE3\x80\x80", delim.Find(input_text, 8));
4747
}
4848

nisaba/port/utf8_util_test.cc

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "gtest/gtest.h"
2222
#include "absl/strings/str_split.h"
2323
#include "absl/strings/string_view.h"
24+
#include "nisaba/port/unicode_properties.h"
2425

2526
using ::testing::ElementsAre;
2627

@@ -120,6 +121,31 @@ TEST(Utf8UtilTest, CheckPortableUtf8WhitespaceDelimiter) {
120121
EXPECT_EQ("" + final_part, toks[1]);
121122
}
122123

124+
// Verifies that `Utf8WhitespaceDelimiter` splits "a<c>b" into two tokens for
// every breaking whitespace character <c>, and leaves it as a single token for
// every non-breaking one.
TEST(Utf8UtilTest, BreakingVsNonBreakingWhitespaceSplit) {
  // Check that we don't break on non-breaking whitespace characters.
  const auto &non_break_chars = GetNonBreakingWhitespaceChars();
  for (const char32_t u32_char : non_break_chars) {
    const std::string &no_delim = EncodeUnicodeChar(u32_char);
    const std::string input_text = "a" + no_delim + "b";
    const std::vector<absl::string_view> toks = absl::StrSplit(
        input_text, Utf8WhitespaceDelimiter(), absl::SkipEmpty());
    // The code point is streamed as an integer: inserting a `char32_t` into a
    // narrow ostream is a deleted overload from C++20 onwards.
    ASSERT_EQ(1u, toks.size()) << "Expected non-breaking char: "
                               << static_cast<unsigned long>(u32_char);
    EXPECT_EQ(toks[0], input_text);
  }

  // Check that the splitter works on breaking whitespace.
  const auto &breaking_chars = GetBreakingWhitespaceChars();
  for (const char32_t u32_char : breaking_chars) {
    const std::string &delim = EncodeUnicodeChar(u32_char);
    const std::string input_text = "a" + delim + "b";
    const std::vector<absl::string_view> toks = absl::StrSplit(
        input_text, Utf8WhitespaceDelimiter(), absl::SkipEmpty());
    ASSERT_EQ(2u, toks.size()) << "Expected breaking char: "
                               << static_cast<unsigned long>(u32_char);
    EXPECT_EQ(toks[0], "a");
    EXPECT_EQ(toks[1], "b");
  }
}
148+
123149
} // namespace
124150
} // namespace utf8
125151
} // namespace nisaba

0 commit comments

Comments
 (0)