@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
7171    throw  std::invalid_argument (" failed to convert utf8 to codepoint" 
7272}
7373
74- // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp ) {
74+ // static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt ) {
7575//     std::vector<uint16_t> result;
76- //     if (/* 0x0000 <= cp  && */ cp  <= 0xffff) {
77- //         result.emplace_back(cp );
76+ //     if (/* 0x0000 <= cpt  && */ cpt  <= 0xffff) {
77+ //         result.emplace_back(cpt );
7878//         return result;
7979//     }
80- //     if (0x10000 <= cp  && cp  <= 0x10ffff) {
81- //         result.emplace_back(0xd800 | ((cp  - 0x10000) >> 10));
82- //         result.emplace_back(0xdc00 | ((cp  - 0x10000) & 0x03ff));
80+ //     if (0x10000 <= cpt  && cpt  <= 0x10ffff) {
81+ //         result.emplace_back(0xd800 | ((cpt  - 0x10000) >> 10));
82+ //         result.emplace_back(0xdc00 | ((cpt  - 0x10000) & 0x03ff));
8383//         return result;
8484//     }
8585//     throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
120120//     return result;
121121// }
122122
123- static  std::vector<codepoint_flags > unicode_cpt_flags_array () {
124-     std::vector<codepoint_flags > cpt_flags (MAX_CODEPOINTS, codepoint_flags ::UNDEFINED);
123+ static  std::vector<unicode_cpt_flags > unicode_cpt_flags_array () {
124+     std::vector<unicode_cpt_flags > cpt_flags (MAX_CODEPOINTS, unicode_cpt_flags ::UNDEFINED);
125125
126126    assert  (unicode_ranges_flags.begin ()[0 ].first  == 0 );
127127    assert  (unicode_ranges_flags.begin ()[unicode_ranges_flags.size ()-1 ].first  == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253253            return  (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254254        };
255255
256-         auto  _get_flags = [&] (const  size_t  pos) -> codepoint_flags  {
257-             return  (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
256+         auto  _get_flags = [&] (const  size_t  pos) -> unicode_cpt_flags  {
257+             return  (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
258258        };
259259
260260        size_t  _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371371            return  (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372372        };
373373
374-         auto  _get_flags = [&] (const  size_t  pos) -> codepoint_flags  {
375-             return  (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags {};
374+         auto  _get_flags = [&] (const  size_t  pos) -> unicode_cpt_flags  {
375+             return  (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt (cpts[pos]) : unicode_cpt_flags {};
376376        };
377377
378378        size_t  _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
572572//  interface
573573// 
574574
575- std::string unicode_cpt_to_utf8 (uint32_t  cp ) {
575+ std::string unicode_cpt_to_utf8 (uint32_t  cpt ) {
576576    std::string result;
577577
578-     if  (/*  0x00 <= cp  && */ cp  <= 0x7f ) {
579-         result.push_back (cp );
578+     if  (/*  0x00 <= cpt  && */ cpt  <= 0x7f ) {
579+         result.push_back (cpt );
580580        return  result;
581581    }
582-     if  (0x80  <= cp  && cp  <= 0x7ff ) {
583-         result.push_back (0xc0  | ((cp  >> 6 ) & 0x1f ));
584-         result.push_back (0x80  | (cp  & 0x3f ));
582+     if  (0x80  <= cpt  && cpt  <= 0x7ff ) {
583+         result.push_back (0xc0  | ((cpt  >> 6 ) & 0x1f ));
584+         result.push_back (0x80  | (cpt  & 0x3f ));
585585        return  result;
586586    }
587-     if  (0x800  <= cp  && cp  <= 0xffff ) {
588-         result.push_back (0xe0  | ((cp  >> 12 ) & 0x0f ));
589-         result.push_back (0x80  | ((cp  >> 6 ) & 0x3f ));
590-         result.push_back (0x80  | (cp  & 0x3f ));
587+     if  (0x800  <= cpt  && cpt  <= 0xffff ) {
588+         result.push_back (0xe0  | ((cpt  >> 12 ) & 0x0f ));
589+         result.push_back (0x80  | ((cpt  >> 6 ) & 0x3f ));
590+         result.push_back (0x80  | (cpt  & 0x3f ));
591591        return  result;
592592    }
593-     if  (0x10000  <= cp  && cp  <= 0x10ffff ) {
594-         result.push_back (0xf0  | ((cp  >> 18 ) & 0x07 ));
595-         result.push_back (0x80  | ((cp  >> 12 ) & 0x3f ));
596-         result.push_back (0x80  | ((cp  >> 6 ) & 0x3f ));
597-         result.push_back (0x80  | (cp  & 0x3f ));
593+     if  (0x10000  <= cpt  && cpt  <= 0x10ffff ) {
594+         result.push_back (0xf0  | ((cpt  >> 18 ) & 0x07 ));
595+         result.push_back (0x80  | ((cpt  >> 12 ) & 0x3f ));
596+         result.push_back (0x80  | ((cpt  >> 6 ) & 0x3f ));
597+         result.push_back (0x80  | (cpt  & 0x3f ));
598598        return  result;
599599    }
600600
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
624624    return  result;
625625}
626626
627- codepoint_flags  unicode_cpt_flags (const  uint32_t  cp ) {
628-     static  const  codepoint_flags  undef (codepoint_flags ::UNDEFINED);
627+ unicode_cpt_flags  unicode_cpt_flags_from_cpt (const  uint32_t  cpt ) {
628+     static  const  unicode_cpt_flags  undef (unicode_cpt_flags ::UNDEFINED);
629629    static  const  auto  cpt_flags = unicode_cpt_flags_array ();
630-     return  cp  < cpt_flags.size () ? cpt_flags[cp ] : undef;
630+     return  cpt  < cpt_flags.size () ? cpt_flags[cpt ] : undef;
631631}
632632
633- codepoint_flags  unicode_cpt_flags (const  std::string & utf8) {
634-     static  const  codepoint_flags  undef (codepoint_flags ::UNDEFINED);
633+ unicode_cpt_flags  unicode_cpt_flags_from_utf8 (const  std::string & utf8) {
634+     static  const  unicode_cpt_flags  undef (unicode_cpt_flags ::UNDEFINED);
635635    if  (utf8.empty ()) {
636636        return  undef;  //  undefined
637637    }
638638    size_t  offset = 0 ;
639-     return  unicode_cpt_flags (unicode_cpt_from_utf8 (utf8, offset));
639+     return  unicode_cpt_flags_from_cpt (unicode_cpt_from_utf8 (utf8, offset));
640640}
641641
642642std::string unicode_byte_to_utf8 (uint8_t  byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
649649    return  map.at (utf8);
650650}
651651
652- uint32_t  unicode_tolower (uint32_t  cp ) {
652+ uint32_t  unicode_tolower (uint32_t  cpt ) {
653653    //  binary search
654-     auto  it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cp ,
654+     auto  it = std::lower_bound (unicode_map_lowercase.begin (), unicode_map_lowercase.end (), cpt ,
655655        [](const  std::pair<uint32_t , uint32_t > & pair, uint32_t  value) {
656656            return  pair.first  < value;
657657        });
658-     if  (it != unicode_map_lowercase.end () && it->first  == cp ) {
658+     if  (it != unicode_map_lowercase.end () && it->first  == cpt ) {
659659        return  it->second ;
660660    }
661-     return  cp ;  //  Return the original code point if no lowercase mapping is found
661+     return  cpt ;  //  Return the original code point if no lowercase mapping is found
662662}
663663
664664std::vector<std::string> unicode_regex_split (const  std::string & text, const  std::vector<std::string> & regex_exprs) {
665665    //  unicode categories
666666    static  const  std::map<std::string, int > k_ucat_enum = {
667-         { " \\ p{N}" codepoint_flags ::NUMBER },
668-         { " \\ p{L}" codepoint_flags ::LETTER },
669-         { " \\ p{P}" codepoint_flags ::PUNCTUATION },
667+         { " \\ p{N}" unicode_cpt_flags ::NUMBER },
668+         { " \\ p{L}" unicode_cpt_flags ::LETTER },
669+         { " \\ p{P}" unicode_cpt_flags ::PUNCTUATION },
670670    };
671671
672672    static  const  std::map<int , int > k_ucat_cpt = {
673-         { codepoint_flags ::NUMBER,         0xD1  },
674-         { codepoint_flags ::LETTER,         0xD2  },
675-         { codepoint_flags ::PUNCTUATION,    0xD3  },
673+         { unicode_cpt_flags ::NUMBER,      0xD1  },
674+         { unicode_cpt_flags ::LETTER,      0xD2  },
675+         { unicode_cpt_flags ::PUNCTUATION, 0xD3  },
676676    };
677677
678678    static  const  std::map<int , std::string> k_ucat_map = {
679-         { codepoint_flags ::NUMBER,         " \x30 -\x39 " //  0-9
680-         { codepoint_flags ::LETTER,         " \x41 -\x5A\x61 -\x7A " //  A-Za-z
681-         { codepoint_flags ::PUNCTUATION,    " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " //  !-#%-*,-/:-;?-@\[-\]_\{\}
679+         { unicode_cpt_flags ::NUMBER,      " \x30 -\x39 " //  0-9
680+         { unicode_cpt_flags ::LETTER,      " \x41 -\x5A\x61 -\x7A " //  A-Za-z
681+         { unicode_cpt_flags ::PUNCTUATION, " \x21 -\x23\x25 -\x2A\x2C -\x2F\x3A -\x3B\x3F -\x40\\\x5B -\\\x5D\x5F\\\x7B\\\x7D " //  !-#%-*,-/:-;?-@\[-\]_\{\}
682682    };
683683
684684    //  compute collapsed codepoints only if needed by at least one regex
685685    bool  need_collapse = false ;
686-     for  (auto  & regex_expr : regex_exprs) {
686+     for  (const   auto  & regex_expr : regex_exprs) {
687687        //  search for unicode categories
688688        for  (const  auto  & ucat : k_ucat_enum) {
689689            if  (std::string::npos != regex_expr.find (ucat.first )) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
709709                continue ;
710710            }
711711
712-             const  auto  flags = unicode_cpt_flags (cpts[i]);
712+             const  auto  flags = unicode_cpt_flags_from_cpt (cpts[i]);
713713
714714            if  (flags.is_whitespace ) {
715715                // NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
725725
726726    std::vector<size_t > bpe_offsets = { cpts.size () };
727727
728-     for  (auto  & regex_expr : regex_exprs) {
728+     for  (const   auto  & regex_expr : regex_exprs) {
729729        //  first, see if we have an efficient custom regex implementation
730730        auto  tmp = unicode_regex_split_custom (text, regex_expr, bpe_offsets);
731731
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
739739            //  if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740740            //  with the corresponding collapsed representation
741741            bool  use_collapsed = false ;
742-             for  (auto  & ucat : k_ucat_enum) {
742+             for  (const   auto  & ucat : k_ucat_enum) {
743743                if  (std::string::npos != regex_expr.find (ucat.first )) {
744744                    use_collapsed = true ;
745745                    break ;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
805805                //  std::wregex \s does not match non-ASCII whitespace, using 0x0B as fallback
806806                std::wstring wtext (cpts.begin (), cpts.end ());
807807                for  (size_t  i = 0 ; i < wtext.size (); ++i) {
808-                     if  (wtext[i] > 0x7F  && unicode_cpt_flags (wtext[i]).is_whitespace ) {
808+                     if  (wtext[i] > 0x7F  && unicode_cpt_flags_from_cpt (wtext[i]).is_whitespace ) {
809809                        wtext[i] = 0x0B ;
810810                    }
811811                }
0 commit comments