@@ -3,11 +3,9 @@
 
 import argparse
 import concurrent.futures
-import copy
 import enum
 import faulthandler
 import functools
-import io
 import itertools
 import json
 import math
@@ -23,14 +21,14 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 
 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
 if TYPE_CHECKING:
@@ -851,7 +849,7 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
         elif isinstance(vocab, BpeVocab):
             self.gguf.add_tokenizer_model("gpt2")
         else:
-            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
@@ -905,7 +903,7 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
         return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
@@ -1114,11 +1112,15 @@ def do_dump_model(model_plus: ModelPlus) -> None:
 
 
 def main(args_in: list[str] | None = None) -> None:
+    output_choices = ["f32", "f16"]
+    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+        # We currently only support Q8_0 output on little endian systems.
+        output_choices.append("q8_0")
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
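For context on the byte-order probe added to main(): a native-order np.uint32 value of 1 compares equal to its little-endian reinterpretation only on little-endian hosts, so q8_0 is offered there and omitted on big-endian machines. Below is a minimal, standalone sketch of the same check; the helper name and the sys.byteorder cross-check are illustrative and not part of this patch, and the scalar .newbyteorder() call assumes the NumPy 1.x API this script targets.

import sys
import numpy as np

def host_is_little_endian() -> bool:
    # A native-order 1 equals its little-endian reinterpretation only when
    # the host itself stores integers little-endian.
    return bool(np.uint32(1) == np.uint32(1).newbyteorder("<"))

# The standard library exposes the same fact directly.
assert host_is_little_endian() == (sys.byteorder == "little")

output_choices = ["f32", "f16"]
if host_is_little_endian():
    output_choices.append("q8_0")  # q8_0 output only offered on little-endian hosts
print(output_choices)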