@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
71
71
throw std::invalid_argument (" failed to convert utf8 to codepoint" );
72
72
}
73
73
74
- // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp ) {
74
+ // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt ) {
75
75
// std::vector<uint16_t> result;
76
- // if (/* 0x0000 <= cp && */ cp <= 0xffff) {
77
- // result.emplace_back(cp );
76
+ // if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
77
+ // result.emplace_back(cpt );
78
78
// return result;
79
79
// }
80
- // if (0x10000 <= cp && cp <= 0x10ffff) {
81
- // result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
82
- // result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
80
+ // if (0x10000 <= cpt && cpt <= 0x10ffff) {
81
+ // result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
82
+ // result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
83
83
// return result;
84
84
// }
85
85
// throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
120
120
// return result;
121
121
// }
122
122
123
- static std::vector<codepoint_flags > unicode_cpt_flags_array () {
124
- std::vector<codepoint_flags > cpt_flags (MAX_CODEPOINTS, codepoint_flags ::UNDEFINED);
123
+ static std::vector<unicode_cpt_flags > unicode_cpt_flags_array () {
124
+ std::vector<unicode_cpt_flags > cpt_flags (MAX_CODEPOINTS, unicode_cpt_flags ::UNDEFINED);
125
125
126
126
assert (unicode_ranges_flags.begin ()[0 ].first == 0 );
127
127
assert (unicode_ranges_flags.begin ()[unicode_ranges_flags.size ()-1 ].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253
253
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254
254
};
255
255
256
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
257
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
256
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
257
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
258
258
};
259
259
260
260
size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371
371
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372
372
};
373
373
374
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
375
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
374
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
375
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
376
376
};
377
377
378
378
size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
572
572
// interface
573
573
//
574
574
575
- std::string unicode_cpt_to_utf8 (uint32_t cp ) {
575
+ std::string unicode_cpt_to_utf8 (uint32_t cpt ) {
576
576
std::string result;
577
577
578
- if (/* 0x00 <= cp && */ cp <= 0x7f ) {
579
- result.push_back (cp );
578
+ if (/* 0x00 <= cpt && */ cpt <= 0x7f ) {
579
+ result.push_back (cpt );
580
580
return result;
581
581
}
582
- if (0x80 <= cp && cp <= 0x7ff ) {
583
- result.push_back (0xc0 | ((cp >> 6 ) & 0x1f ));
584
- result.push_back (0x80 | (cp & 0x3f ));
582
+ if (0x80 <= cpt && cpt <= 0x7ff ) {
583
+ result.push_back (0xc0 | ((cpt >> 6 ) & 0x1f ));
584
+ result.push_back (0x80 | (cpt & 0x3f ));
585
585
return result;
586
586
}
587
- if (0x800 <= cp && cp <= 0xffff ) {
588
- result.push_back (0xe0 | ((cp >> 12 ) & 0x0f ));
589
- result.push_back (0x80 | ((cp >> 6 ) & 0x3f ));
590
- result.push_back (0x80 | (cp & 0x3f ));
587
+ if (0x800 <= cpt && cpt <= 0xffff ) {
588
+ result.push_back (0xe0 | ((cpt >> 12 ) & 0x0f ));
589
+ result.push_back (0x80 | ((cpt >> 6 ) & 0x3f ));
590
+ result.push_back (0x80 | (cpt & 0x3f ));
591
591
return result;
592
592
}
593
- if (0x10000 <= cp && cp <= 0x10ffff ) {
594
- result.push_back (0xf0 | ((cp >> 18 ) & 0x07 ));
595
- result.push_back (0x80 | ((cp >> 12 ) & 0x3f ));
596
- result.push_back (0x80 | ((cp >> 6 ) & 0x3f ));
597
- result.push_back (0x80 | (cp & 0x3f ));
593
+ if (0x10000 <= cpt && cpt <= 0x10ffff ) {
594
+ result.push_back (0xf0 | ((cpt >> 18 ) & 0x07 ));
595
+ result.push_back (0x80 | ((cpt >> 12 ) & 0x3f ));
596
+ result.push_back (0x80 | ((cpt >> 6 ) & 0x3f ));
597
+ result.push_back (0x80 | (cpt & 0x3f ));
598
598
return result;
599
599
}
600
600
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
624
624
return result;
625
625
}
626
626
627
- codepoint_flags unicode_cpt_flags (const uint32_t cp ) {
628
- static const codepoint_flags undef (codepoint_flags ::UNDEFINED);
627
+ unicode_cpt_flags unicode_cpt_flags_from_cpt (const uint32_t cpt ) {
628
+ static const unicode_cpt_flags undef (unicode_cpt_flags ::UNDEFINED);
629
629
static const auto cpt_flags = unicode_cpt_flags_array ();
630
- return cp < cpt_flags.size () ? cpt_flags[cp ] : undef;
630
+ return cpt < cpt_flags.size () ? cpt_flags[cpt ] : undef;
631
631
}
632
632
633
- codepoint_flags unicode_cpt_flags (const std::string & utf8) {
634
- static const codepoint_flags undef (codepoint_flags ::UNDEFINED);
633
+ unicode_cpt_flags unicode_cpt_flags_from_utf8 (const std::string & utf8) {
634
+ static const unicode_cpt_flags undef (unicode_cpt_flags ::UNDEFINED);
635
635
if (utf8.empty ()) {
636
636
return undef; // undefined
637
637
}
638
638
size_t offset = 0 ;
639
- return unicode_cpt_flags (unicode_cpt_from_utf8 (utf8, offset));
639
+ return unicode_cpt_flags_from_cpt (unicode_cpt_from_utf8 (utf8, offset));
640
640
}
641
641
642
642
std::string unicode_byte_to_utf8 (uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
649
649
return map.at (utf8);
650
650
}
651
651
652
- uint32_t unicode_tolower (uint32_t cp ) {
652
+ uint32_t unicode_tolower (uint32_t cpt ) {
653
653
// binary search
654
- auto it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cp ,
654
+ auto it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cpt ,
655
655
[](const std::pair<uint32_t , uint32_t > & pair, uint32_t value) {
656
656
return pair.first < value;
657
657
});
658
- if (it != unicode_map_lowercase.end () && it->first == cp ) {
658
+ if (it != unicode_map_lowercase.end () && it->first == cpt ) {
659
659
return it->second ;
660
660
}
661
- return cp ; // Return the original code point if no lowercase mapping is found
661
+ return cpt ; // Return the original code point if no lowercase mapping is found
662
662
}
663
663
664
664
std::vector<std::string> unicode_regex_split (const std::string & text, const std::vector<std::string> & regex_exprs) {
665
665
// unicode categories
666
666
static const std::map<std::string, int > k_ucat_enum = {
667
- { " \\ p{N}" , codepoint_flags ::NUMBER },
668
- { " \\ p{L}" , codepoint_flags ::LETTER },
669
- { " \\ p{P}" , codepoint_flags ::PUNCTUATION },
667
+ { " \\ p{N}" , unicode_cpt_flags ::NUMBER },
668
+ { " \\ p{L}" , unicode_cpt_flags ::LETTER },
669
+ { " \\ p{P}" , unicode_cpt_flags ::PUNCTUATION },
670
670
};
671
671
672
672
static const std::map<int , int > k_ucat_cpt = {
673
- { codepoint_flags ::NUMBER, 0xD1 },
674
- { codepoint_flags ::LETTER, 0xD2 },
675
- { codepoint_flags ::PUNCTUATION, 0xD3 },
673
+ { unicode_cpt_flags ::NUMBER, 0xD1 },
674
+ { unicode_cpt_flags ::LETTER, 0xD2 },
675
+ { unicode_cpt_flags ::PUNCTUATION, 0xD3 },
676
676
};
677
677
678
678
static const std::map<int , std::string> k_ucat_map = {
679
- { codepoint_flags ::NUMBER, " \x30 -\x39 " }, // 0-9
680
- { codepoint_flags ::LETTER, " \x41 -\x5A\x61 -\x7A " }, // A-Za-z
681
- { codepoint_flags ::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " }, // !-#%-*,-/:-;?-@\[-\]_\{\}
679
+ { unicode_cpt_flags ::NUMBER, " \x30 -\x39 " }, // 0-9
680
+ { unicode_cpt_flags ::LETTER, " \x41 -\x5A\x61 -\x7A " }, // A-Za-z
681
+ { unicode_cpt_flags ::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " }, // !-#%-*,-/:-;?-@\[-\]_\{\}
682
682
};
683
683
684
684
// compute collapsed codepoints only if needed by at least one regex
685
685
bool need_collapse = false ;
686
- for (auto & regex_expr : regex_exprs) {
686
+ for (const auto & regex_expr : regex_exprs) {
687
687
// search for unicode categories
688
688
for (const auto & ucat : k_ucat_enum) {
689
689
if (std::string::npos != regex_expr.find (ucat.first )) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
709
709
continue ;
710
710
}
711
711
712
- const auto flags = unicode_cpt_flags (cpts[i]);
712
+ const auto flags = unicode_cpt_flags_from_cpt (cpts[i]);
713
713
714
714
if (flags.is_whitespace ) {
715
715
// NOTE: C++ std::regex \s does not match 0x85; Rust and Python regex do.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
725
725
726
726
std::vector<size_t > bpe_offsets = { cpts.size () };
727
727
728
- for (auto & regex_expr : regex_exprs) {
728
+ for (const auto & regex_expr : regex_exprs) {
729
729
// first, see if we have an efficient custom regex implementation
730
730
auto tmp = unicode_regex_split_custom (text, regex_expr, bpe_offsets);
731
731
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
739
739
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740
740
// with the corresponding collapsed representation
741
741
bool use_collapsed = false ;
742
- for (auto & ucat : k_ucat_enum) {
742
+ for (const auto & ucat : k_ucat_enum) {
743
743
if (std::string::npos != regex_expr.find (ucat.first )) {
744
744
use_collapsed = true ;
745
745
break ;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
805
805
// std::wregex \s does not match non-ASCII whitespace, so 0x0B is used as a fallback
806
806
std::wstring wtext (cpts.begin (), cpts.end ());
807
807
for (size_t i = 0 ; i < wtext.size (); ++i) {
808
- if (wtext[i] > 0x7F && unicode_cpt_flags (wtext[i]).is_whitespace ) {
808
+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt (wtext[i]).is_whitespace ) {
809
809
wtext[i] = 0x0B ;
810
810
}
811
811
}
0 commit comments