Skip to content

Commit 08ea539

Browse files
authored
unicode : improve naming style (#10838)
* unicode : improve naming style ggml-ci * cont [no ci]
1 parent 644fd71 commit 08ea539

File tree

3 files changed

+61
-62
lines changed

3 files changed

+61
-62
lines changed

src/llama-vocab.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,7 @@ struct llm_tokenizer_wpm_session {
738738
std::vector<std::string> words(1, "");
739739

740740
for (const uint32_t cpt : cpts_nfd) {
741-
const auto flags = unicode_cpt_flags(cpt);
741+
const auto flags = unicode_cpt_flags_from_cpt(cpt);
742742

743743
if (flags.is_whitespace) {
744744
if (words.back().size()) { // finish previous word if any

src/unicode.cpp

+51-51
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
7171
throw std::invalid_argument("failed to convert utf8 to codepoint");
7272
}
7373

74-
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
74+
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
7575
// std::vector<uint16_t> result;
76-
// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
77-
// result.emplace_back(cp);
76+
// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
77+
// result.emplace_back(cpt);
7878
// return result;
7979
// }
80-
// if (0x10000 <= cp && cp <= 0x10ffff) {
81-
// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
82-
// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
80+
// if (0x10000 <= cpt && cpt <= 0x10ffff) {
81+
// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
82+
// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
8383
// return result;
8484
// }
8585
// throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
120120
// return result;
121121
//}
122122

123-
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
124-
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
123+
static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
124+
std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
125125

126126
assert (unicode_ranges_flags.begin()[0].first == 0);
127127
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253253
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254254
};
255255

256-
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
257-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
256+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
257+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
258258
};
259259

260260
size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371371
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372372
};
373373

374-
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
375-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
374+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
375+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
376376
};
377377

378378
size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
572572
// interface
573573
//
574574

575-
std::string unicode_cpt_to_utf8(uint32_t cp) {
575+
std::string unicode_cpt_to_utf8(uint32_t cpt) {
576576
std::string result;
577577

578-
if (/* 0x00 <= cp && */ cp <= 0x7f) {
579-
result.push_back(cp);
578+
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
579+
result.push_back(cpt);
580580
return result;
581581
}
582-
if (0x80 <= cp && cp <= 0x7ff) {
583-
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
584-
result.push_back(0x80 | (cp & 0x3f));
582+
if (0x80 <= cpt && cpt <= 0x7ff) {
583+
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
584+
result.push_back(0x80 | (cpt & 0x3f));
585585
return result;
586586
}
587-
if (0x800 <= cp && cp <= 0xffff) {
588-
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
589-
result.push_back(0x80 | ((cp >> 6) & 0x3f));
590-
result.push_back(0x80 | (cp & 0x3f));
587+
if (0x800 <= cpt && cpt <= 0xffff) {
588+
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
589+
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
590+
result.push_back(0x80 | (cpt & 0x3f));
591591
return result;
592592
}
593-
if (0x10000 <= cp && cp <= 0x10ffff) {
594-
result.push_back(0xf0 | ((cp >> 18) & 0x07));
595-
result.push_back(0x80 | ((cp >> 12) & 0x3f));
596-
result.push_back(0x80 | ((cp >> 6) & 0x3f));
597-
result.push_back(0x80 | (cp & 0x3f));
593+
if (0x10000 <= cpt && cpt <= 0x10ffff) {
594+
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
595+
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
596+
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
597+
result.push_back(0x80 | (cpt & 0x3f));
598598
return result;
599599
}
600600

@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
624624
return result;
625625
}
626626

627-
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
628-
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
627+
unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
628+
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
629629
static const auto cpt_flags = unicode_cpt_flags_array();
630-
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
630+
return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
631631
}
632632

633-
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
634-
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
633+
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
634+
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
635635
if (utf8.empty()) {
636636
return undef; // undefined
637637
}
638638
size_t offset = 0;
639-
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
639+
return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
640640
}
641641

642642
std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
649649
return map.at(utf8);
650650
}
651651

652-
uint32_t unicode_tolower(uint32_t cp) {
652+
uint32_t unicode_tolower(uint32_t cpt) {
653653
// binary search
654-
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
654+
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
655655
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
656656
return pair.first < value;
657657
});
658-
if (it != unicode_map_lowercase.end() && it->first == cp) {
658+
if (it != unicode_map_lowercase.end() && it->first == cpt) {
659659
return it->second;
660660
}
661-
return cp; // Return the original code point if no lowercase mapping is found
661+
return cpt; // Return the original code point if no lowercase mapping is found
662662
}
663663

664664
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
665665
// unicode categories
666666
static const std::map<std::string, int> k_ucat_enum = {
667-
{ "\\p{N}", codepoint_flags::NUMBER },
668-
{ "\\p{L}", codepoint_flags::LETTER },
669-
{ "\\p{P}", codepoint_flags::PUNCTUATION },
667+
{ "\\p{N}", unicode_cpt_flags::NUMBER },
668+
{ "\\p{L}", unicode_cpt_flags::LETTER },
669+
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
670670
};
671671

672672
static const std::map<int, int> k_ucat_cpt = {
673-
{ codepoint_flags::NUMBER, 0xD1 },
674-
{ codepoint_flags::LETTER, 0xD2 },
675-
{ codepoint_flags::PUNCTUATION, 0xD3 },
673+
{ unicode_cpt_flags::NUMBER, 0xD1 },
674+
{ unicode_cpt_flags::LETTER, 0xD2 },
675+
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
676676
};
677677

678678
static const std::map<int, std::string> k_ucat_map = {
679-
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
680-
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
681-
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
679+
{ unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
680+
{ unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
681+
{ unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
682682
};
683683

684684
// compute collapsed codepoints only if needed by at least one regex
685685
bool need_collapse = false;
686-
for (auto & regex_expr : regex_exprs) {
686+
for (const auto & regex_expr : regex_exprs) {
687687
// search for unicode categories
688688
for (const auto & ucat : k_ucat_enum) {
689689
if (std::string::npos != regex_expr.find(ucat.first)) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
709709
continue;
710710
}
711711

712-
const auto flags = unicode_cpt_flags(cpts[i]);
712+
const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
713713

714714
if (flags.is_whitespace) {
715715
//NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
725725

726726
std::vector<size_t> bpe_offsets = { cpts.size() };
727727

728-
for (auto & regex_expr : regex_exprs) {
728+
for (const auto & regex_expr : regex_exprs) {
729729
// first, see if we have an efficient custom regex implementation
730730
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
731731

@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
739739
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740740
// with the corresponding collapsed representation
741741
bool use_collapsed = false;
742-
for (auto & ucat : k_ucat_enum) {
742+
for (const auto & ucat : k_ucat_enum) {
743743
if (std::string::npos != regex_expr.find(ucat.first)) {
744744
use_collapsed = true;
745745
break;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
805805
// std::wregex \s does not match non-ASCII whitespace, using 0x0B as fallback
806806
std::wstring wtext(cpts.begin(), cpts.end());
807807
for (size_t i = 0; i < wtext.size(); ++i) {
808-
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
808+
if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
809809
wtext[i] = 0x0B;
810810
}
811811
}

src/unicode.h

+9-10
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
#include <string>
55
#include <vector>
66

7-
// TODO: prefix all symbols with "llama_"
8-
9-
struct codepoint_flags {
7+
struct unicode_cpt_flags {
108
enum {
119
UNDEFINED = 0x0001,
1210
NUMBER = 0x0002, // regex: \p{N}
@@ -35,7 +33,7 @@ struct codepoint_flags {
3533
uint16_t is_nfd : 1;
3634

3735
// decode from uint16
38-
inline codepoint_flags(const uint16_t flags=0) {
36+
inline unicode_cpt_flags(const uint16_t flags = 0) {
3937
*reinterpret_cast<uint16_t*>(this) = flags;
4038
}
4139

@@ -50,18 +48,19 @@ struct codepoint_flags {
5048

5149
size_t unicode_len_utf8(char src);
5250

53-
std::string unicode_cpt_to_utf8(uint32_t cp);
54-
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
51+
std::string unicode_cpt_to_utf8 (uint32_t cpt);
52+
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
53+
5554
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
5655

5756
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
5857

59-
codepoint_flags unicode_cpt_flags(const uint32_t cp);
60-
codepoint_flags unicode_cpt_flags(const std::string & utf8);
58+
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
59+
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
6160

6261
std::string unicode_byte_to_utf8(uint8_t byte);
63-
uint8_t unicode_utf8_to_byte(const std::string & utf8);
62+
uint8_t unicode_utf8_to_byte(const std::string & utf8);
6463

65-
uint32_t unicode_tolower(uint32_t cp);
64+
uint32_t unicode_tolower(uint32_t cpt);
6665

6766
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

0 commit comments

Comments
 (0)