Skip to content

Commit 32c8486

Browse files
authored
wpm : portable unicode tolower (ggml-org#6305)
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
1 parent 557410b commit 32c8486

9 files changed: +1699 -1425 lines changed

Diff for: CMakeLists.txt

+1
Original file line number | Diff line number | Diff line change
@@ -1170,6 +1170,7 @@ add_library(llama
11701170
llama.h
11711171
unicode.h
11721172
unicode.cpp
1173+
unicode-data.cpp
11731174
)
11741175

11751176
target_include_directories(llama PUBLIC .)

Diff for: Makefile

+4 -1
Original file line number | Diff line number | Diff line change
@@ -678,7 +678,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
678678
unicode.o: unicode.cpp unicode.h
679679
$(CXX) $(CXXFLAGS) -c $< -o $@
680680

681-
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
681+
unicode-data.o: unicode-data.cpp unicode-data.h
682+
$(CXX) $(CXXFLAGS) -c $< -o $@
683+
684+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
682685

683686
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
684687
$(CXX) $(CXXFLAGS) -c $< -o $@

Diff for: Package.swift

+1
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ let package = Package(
3232
"ggml.c",
3333
"llama.cpp",
3434
"unicode.cpp",
35+
"unicode-data.cpp",
3536
"ggml-alloc.c",
3637
"ggml-backend.c",
3738
"ggml-quants.c",

Diff for: build.zig

+8 -7
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
116116
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
117117
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
118118
const unicode = make.obj("unicode", "unicode.cpp");
119+
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
119120
const llama = make.obj("llama", "llama.cpp");
120121
const buildinfo = make.obj("common", "common/build-info.cpp");
121122
const common = make.obj("common", "common/common.cpp");
@@ -127,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
127128
const clip = make.obj("clip", "examples/llava/clip.cpp");
128129
const llava = make.obj("llava", "examples/llava/llava.cpp");
129130

130-
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
131-
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
132-
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
133-
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
134-
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
135-
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
131+
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
132+
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
133+
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
134+
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
135+
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
136+
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
136137

137-
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
138+
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
138139
if (server.target.isWindows()) {
139140
server.linkSystemLibrary("ws2_32");
140141
}

Diff for: llama.cpp

+8 -14
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@
6161
#include <algorithm>
6262
#include <array>
6363
#include <cassert>
64+
#include <cctype>
6465
#include <cfloat>
6566
#include <cinttypes>
6667
#include <climits>
@@ -71,7 +72,6 @@
7172
#include <cstdio>
7273
#include <cstring>
7374
#include <ctime>
74-
#include <cwctype>
7575
#include <forward_list>
7676
#include <fstream>
7777
#include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
1101011010
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
1101111011
continue;
1101211012
}
11013-
code = to_lower(code);
11013+
code = unicode_tolower(code);
1101411014
if (type == CODEPOINT_TYPE_WHITESPACE) {
1101511015
code = ' ';
1101611016
}
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
1103011030
std::vector<std::string> words;
1103111031
while (r < new_str.size()) {
1103211032
// if is whitespace
11033-
if (isspace(new_str[r])) {
11033+
if (isspace(new_str[r], std::locale::classic())) {
1103411034
if (r > l) words.push_back(new_str.substr(l, (r - l)));
1103511035
l = r + 1;
1103611036
r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
1104411044
return words;
1104511045
}
1104611046

11047-
uint32_t to_lower(uint32_t code) {
11048-
static const std::locale locale("en_US.UTF-8");
11049-
#if defined(_WIN32)
11050-
if (code > 0xFFFF) {
11051-
return code;
11052-
}
11053-
#endif
11054-
return std::tolower(wchar_t(code), locale);
11055-
}
11056-
1105711047
bool is_ascii_punct(uint32_t code) {
11058-
return code < 256 && ispunct(code);
11048+
if (code > 0xFF) {
11049+
return false;
11050+
}
11051+
auto c = char(static_cast<unsigned char>(code));
11052+
return ispunct(c, std::locale::classic());
1105911053
}
1106011054

1106111055
bool is_chinese_char(uint32_t cpt) {

0 commit comments

Comments
 (0)