
Commit 0c4d489

phymbert, ggerganov, and slaren authored
quantize: add imatrix and dataset metadata in GGUF (ggml-org#6658)
* imatrix: save the dataset file used in the output file
* llama: support kv overrides type string string
* common: factorize KV Overrides parsing between common and server
* quantize: add imatrix n entries and dataset KV metadata
  quantize: factorize KV Overrides parsing between common ggml-org#6656
* llama: remove kv override str_value initialization as it does not compile on some toolchain
* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`
* quantize: add imatrix filename in KV
* llama: add llama_model_kv_override_free
* common: add llama_model_kv_override_free
  common: free kv override if used after model loading
* llama: finally move the string KV override value to the stack
* llama : minor
* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.
  Co-authored-by: slaren <[email protected]>
* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: slaren <[email protected]>
1 parent 017e699 commit 0c4d489

File tree: 9 files changed, +186 −171 lines changed


Diff for: Makefile

+1 −1

@@ -768,7 +768,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 

Diff for: common/common.cpp

+49 −39

@@ -234,8 +234,54 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }
 
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;
 
     if (arg == "-s" || arg == "--seed") {
         if (++i >= argc) {
@@ -1244,47 +1290,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        char* sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.kv_overrides.push_back(kvo);
         return true;
     }
 #ifndef LOG_DISABLE_LOGS
@@ -1555,7 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -ptc N, --print-token-count N\n");
     printf("                        print token count every N tokens (default: %d)\n", params.n_print);
     printf("  --check-tensors       check model tensor data for invalid values\n");

Diff for: common/common.h

+2 −0

@@ -171,6 +171,8 @@ struct gpt_params {
     std::string image = ""; // path to an image file
 };
 
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
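The declaration above exposes the helper outside common.cpp so other tools can reuse it, per the commit message. For context, overrides built this way are ultimately handed to the model loader through `llama_model_params.kv_overrides`. The following is a hedged sketch, not taken from the commit, of applying a string override at load time; it assumes the array is terminated by an entry with an empty key (the convention used in common.cpp) and uses a placeholder `model.gguf` path.

```cpp
// Sketch only: applying a string KV override at model load time, which is
// what "llama: support kv overrides type string" enables.
#include "llama.h"

#include <cstring>
#include <vector>

int main() {
    std::vector<llama_model_kv_override> kv_overrides(2); // zero-initialized

    std::strcpy(kv_overrides[0].key, "general.name");        // illustrative key
    kv_overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strcpy(kv_overrides[0].val_str, "my-custom-model");  // value lives in the fixed-size buffer

    kv_overrides[1].key[0] = '\0'; // terminator entry (empty key)

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = kv_overrides.data();

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model != nullptr) {
        llama_free_model(model);
    }

    llama_backend_free();
    return model != nullptr ? 0 : 1;
}
```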

Diff for: examples/imatrix/imatrix.cpp

+44 −33

@@ -23,6 +23,7 @@ struct Stats {
 };
 
 struct StatParams {
+    std::string dataset;
     std::string ofile = "imatrix.dat";
     int         n_output_frequency = 10;
     int         verbosity = 1;
@@ -46,7 +47,7 @@ class IMatrixCollector {
     std::vector<float> m_src1_data;
     std::vector<char>  m_ids; // the expert ids from ggml_mul_mat_id
     //
-    void save_imatrix(const char * file_name) const;
+    void save_imatrix(const char * file_name, const char * dataset) const;
     void keep_imatrix(int ncall) const;
 };
 
@@ -199,32 +200,41 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }
 
 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
 }
 
 void IMatrixCollector::keep_imatrix(int ncall) const {
     auto file_name = m_params.ofile;
     if (file_name.empty()) file_name = "imatrix.dat";
     file_name += ".at_";
     file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
+    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
 }
 
-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & p : m_stats) {
         int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
+        out.write((const char *) &len, sizeof(len));
         out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
         int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
     }
+
+    // Write the number of call the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the dataset name at the end of the file to later on specify it in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);
+
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
     }
 }
 
@@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
         }
     }
 
+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
     g_collector.set_parameters(std::move(sparams));
 
     if (!combine_files.empty()) {
@@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
         }
     }
 
-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
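With this change the imatrix file carries two extra fields after the per-tensor entries: the number of chunks the data was collected over (`m_last_call`) and the dataset name, taken from `params.prompt_file` (which is why the argument parsing in `main` is moved before `set_parameters` above). The following reader is a sketch, not part of the commit, with the layout inferred from `save_imatrix()` and assuming a well-formed file written by this version.

```cpp
// Sketch only: skip over the per-tensor entries of an imatrix file, then read
// the two fields this commit appends (chunk count and dataset name).
#include <cstdio>
#include <fstream>
#include <string>

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s imatrix.dat\n", argv[0]); return 1; }
    std::ifstream in(argv[1], std::ios::binary);

    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int i = 0; i < n_entries; ++i) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        std::string name(len, 0);
        in.read(&name[0], len);
        int ncall = 0, nval = 0;
        in.read((char *) &ncall, sizeof(ncall));
        in.read((char *) &nval, sizeof(nval));
        in.seekg(nval * sizeof(float), std::ios::cur); // skip the values
    }

    int last_call = 0, n_dataset = 0;
    in.read((char *) &last_call, sizeof(last_call));
    in.read((char *) &n_dataset, sizeof(n_dataset));
    std::string dataset(n_dataset, 0);
    in.read(&dataset[0], n_dataset);

    printf("chunks: %d, dataset: %s\n", last_call, dataset.c_str());
    return 0;
}
```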

Diff for: examples/quantize/CMakeLists.txt

+1 −1

@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
