-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathoga.py
More file actions
549 lines (461 loc) · 20.7 KB
/
oga.py
File metadata and controls
549 lines (461 loc) · 20.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
# onnxruntime_genai is not lint-friendly yet and PyLint can't
# find any of the class methods
# pylint: disable=no-member
#
# Model builder constraints:
# 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source)
# (transformers v4.45 changes the format of the tokenizer.json file which will be supported in
# onnxruntime-genai 0.5)
#
import argparse
import os
import time
import json
import shutil
from fnmatch import fnmatch
from queue import Queue
from packaging.version import Version
from huggingface_hub import snapshot_download
import onnxruntime_genai as og
import onnxruntime_genai.models.builder as model_builder
from turnkeyml.state import State
from turnkeyml.tools import FirstTool
import turnkeyml.common.status as status
import turnkeyml.common.printing as printing
from lemonade.tools.adapter import (
ModelAdapter,
TokenizerAdapter,
PassthroughTokenizerResult,
)
from lemonade.cache import Keys
# ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder
oga_models_path = "oga_models"
# ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache
oga_model_builder_cache_path = "model_builder"
# Mapping from processor to execution provider, used in pathnames and by model_builder
execution_providers = {
    "cpu": "cpu",
    "npu": "npu",
    "igpu": "dml",
    "hybrid": "hybrid",
    "cuda": "cuda",
}
class OrtGenaiTokenizer(TokenizerAdapter):
    """Adapter that exposes an onnxruntime-genai tokenizer through the
    TokenizerAdapter interface used by the rest of lemonade."""

    def __init__(self, model: og.Model):
        # Build the OGA tokenizer for this model.
        self.tokenizer = og.Tokenizer(model)
        # Placeholder value since some callers query this attribute.
        # If a proper value is ever required,
        # og.GeneratorParams.eos_token_id has it.
        self.eos_token_id = None

    def __call__(self, prompt: str, return_tensors="np"):
        # Encode the prompt and hand the raw token ids back wrapped in the
        # passthrough result type the callers expect.
        return PassthroughTokenizerResult(self.tokenizer.encode(prompt))

    # onnxruntime_genai's tokenizer does not accept any options yet, so
    # skip_special_tokens is accepted for interface compatibility and
    # ignored, hopefully without a major negative effect.
    # pylint: disable=unused-argument
    def decode(self, response, skip_special_tokens=True) -> str:
        decoded_text = self.tokenizer.decode(response)
        return decoded_text
class OrtGenaiStreamer:
    """Queue-backed streamer of decoded text fragments.

    A producer thread pushes fragments with add_text() and marks the end of
    the stream with done(); a consumer iterates over the instance and the
    iteration stops once the stop sentinel is observed.
    """

    def __init__(self, tokenizer: OrtGenaiTokenizer, timeout=None):
        self.tokenizer = tokenizer
        self.text_queue = Queue()
        # Sentinel placed on the queue to signal end-of-stream.
        self.stop_signal = None
        self.timeout = timeout

    def add_text(self, text: str):
        # Producer side: enqueue one decoded text fragment.
        self.text_queue.put(text, timeout=self.timeout)

    def done(self):
        # Producer side: enqueue the sentinel so iteration terminates.
        self.text_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        item = self.text_queue.get(timeout=self.timeout)
        if item == self.stop_signal:
            raise StopIteration()
        return item
class OrtGenaiModel(ModelAdapter):
    """ModelAdapter wrapper around an onnxruntime-genai (OGA) model.

    Loads the OGA model from a local folder and, if present, the
    genai_config.json in that folder so that generate() can honor any
    "search" options baked into the model package.
    """

    def __init__(self, input_folder):
        # input_folder: local directory containing the OGA model files.
        super().__init__()
        self.model = og.Model(input_folder)
        # Identifies this adapter type to downstream tools.
        self.type = "ort-genai"
        # Parsed genai_config.json contents, or None if the file is absent.
        self.config = self.load_config(input_folder)

    def load_config(self, input_folder):
        """Return the parsed genai_config.json from input_folder, or None
        if the file does not exist."""
        config_path = os.path.join(input_folder, "genai_config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                return json.load(f)
        return None

    def generate(
        self,
        input_ids,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        top_p=1.0,
        temperature=0.7,
        streamer: OrtGenaiStreamer = None,
        pad_token_id=None,
        stopping_criteria=None,
    ):
        """Generate tokens from input_ids with the OGA generator.

        When streamer is None, runs to completion, records
        time_to_first_token and tokens_per_second on self, and returns the
        full sequence as a single-element list. When a streamer is given,
        decoded text fragments are pushed to it as they are produced and
        nothing is returned; stopping_criteria[0].stop_event may be set by
        another thread to end generation early.
        """
        params = og.GeneratorParams(self.model)
        # There is a breaking API change in OGA 0.6.0
        # Determine whether we should use the old or new APIs
        # This also supports 0.6.0.dev0, which evaluates to less than 0.6.0 in Version
        use_oga_post_6_api = (
            Version(og.__version__) >= Version("0.6.0") or "0.6.0" in og.__version__
        )
        use_oga_pre_6_api = not use_oga_post_6_api
        # NOTE(review): truthiness test means pad_token_id=0 is ignored —
        # presumably intentional, but confirm if token id 0 is ever a valid pad.
        if pad_token_id:
            params.pad_token_id = pad_token_id
        max_length = len(input_ids) + max_new_tokens
        if use_oga_pre_6_api:
            # Pre-0.6.0 OGA takes the prompt via GeneratorParams.
            params.input_ids = input_ids
        if self.config and "search" in self.config:
            # Model package ships its own search options; they take
            # precedence over this call's arguments.
            search_config = self.config["search"]
            params.set_search_options(
                do_sample=search_config.get("do_sample", do_sample),
                top_k=search_config.get("top_k", top_k),
                top_p=search_config.get("top_p", top_p),
                temperature=search_config.get("temperature", temperature),
                max_length=max_length,
                min_length=0,
                early_stopping=search_config.get("early_stopping", False),
                length_penalty=search_config.get("length_penalty", 1.0),
                num_beams=search_config.get("num_beams", 1),
                num_return_sequences=search_config.get("num_return_sequences", 1),
                repetition_penalty=search_config.get("repetition_penalty", 1.0),
                past_present_share_buffer=search_config.get(
                    "past_present_share_buffer", True
                ),
                # Not currently supported by OGA
                # diversity_penalty=search_config.get('diversity_penalty', 0.0),
                # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
            )
        else:
            params.set_search_options(
                do_sample=do_sample,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                max_length=max_length,
                min_length=max_length,
            )
        params.try_graph_capture_with_max_batch_size(1)
        generator = og.Generator(self.model, params)
        if use_oga_post_6_api:
            # Post-0.6.0 OGA takes the prompt via append_tokens.
            generator.append_tokens(input_ids)
        if streamer is None:
            # Non-streaming path: time the prompt processing (first token)
            # separately from subsequent token generation.
            prompt_start_time = time.perf_counter()
            if use_oga_pre_6_api:
                generator.compute_logits()
            generator.generate_next_token()
            prompt_end_time = time.perf_counter()
            self.time_to_first_token = prompt_end_time - prompt_start_time
            if max_new_tokens > 1:
                token_gen_times = []
                while not generator.is_done():
                    token_gen_start_time = time.perf_counter()
                    if use_oga_pre_6_api:
                        generator.compute_logits()
                    generator.generate_next_token()
                    token_gen_end_time = time.perf_counter()
                    token_gen_times.append(token_gen_end_time - token_gen_start_time)
                if token_gen_times:
                    # List will be empty if we generated 1 or 0 tokens, and we don't
                    # want a divide-by-zero error in those cases
                    avg_token_gen_latency_s = sum(token_gen_times) / len(
                        token_gen_times
                    )
                    self.tokens_per_second = 1 / avg_token_gen_latency_s
            return [generator.get_sequence(0)]
        else:
            # Streaming path: decode each token as it is produced and push
            # the text fragment to the streamer's queue.
            tokenizer_stream = streamer.tokenizer.tokenizer.create_stream()
            stop_early = False
            while not generator.is_done() and not stop_early:
                if use_oga_pre_6_api:
                    generator.compute_logits()
                generator.generate_next_token()
                new_token = generator.get_next_tokens()[0]
                new_text = tokenizer_stream.decode(new_token)
                streamer.add_text(new_text)
                if stopping_criteria is not None:
                    # Caller-controlled early stop (e.g. a cancel button).
                    if stopping_criteria[0].stop_event.is_set():
                        stop_early = True
            # Always signal end-of-stream so the consumer's iteration ends.
            streamer.done()
class OgaLoad(FirstTool):
    """
    Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers.
    Input: path to a checkpoint.
    Supported choices for cpu and igpu from HF model repository:
        LLM models on Huggingface supported by model_builder. See documentation
        (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported
        models.
    Supported choices for npu from HF model repository:
        Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
    Local models for cpu, igpu, or npu:
        The specified checkpoint is converted to a local path, via mapping to lower case
        and replacing '/' with '_'. If this model already exists in the 'models' folder
        of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
        will be used. If the --force flag is used and the model is built with model_builder,
        then it will be rebuilt.
    Output:
        state.model: handle to a Huggingface-style LLM loaded on DirectML device
        state.tokenizer = Huggingface-style LLM tokenizer instance
        state.dtype = data type of the model on DirectML device
        state.checkpoint = name of the checkpoint used to load state.model
    Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
            If that library is not installed, this tool will not load.
    """

    unique_name = "oga-load"

    def __init__(self):
        super().__init__(monitor_message="Loading OnnxRuntime-GenAI model")
        # Stats keys that this tool reports for status display.
        self.status_stats = [Keys.DTYPE, Keys.DEVICE, Keys.OGA_MODELS_SUBFOLDER]

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        """Build the argparse parser for the oga-load tool's CLI options."""
        parser = __class__.helpful_parser(
            short_description="Load model in onnxruntime-genai (OGA)",
            add_help=add_help,
        )
        parser.add_argument(
            "-ip",
            "--input_path",
            default="",
            help="the local huggingface model in your disk",
        )
        parser.add_argument(
            "-d",
            "--device",
            choices=["igpu", "npu", "cpu", "hybrid", "cuda"],
            default="igpu",
            help="Which device to load the model on to (default: igpu)",
        )
        parser.add_argument(
            "--dtype",
            choices=["int4", "fp16", "fp32"],
            required=True,
            help="Data type to load the model in",
        )
        parser.add_argument(
            "--int4-block-size",
            default=None,
            help="Specify the block_size for int4 quantization.",
            choices=[16, 32, 64, 128, 256],
            type=int,
        )
        parser.add_argument(
            "--force",
            action="store_true",
            help="Forces downloading of Hugging-Face model again (if changed). Additionally for"
            " cpu and igpu devices only, forces model_builder to run again on the HF model"
            " (changed or not).",
        )
        parser.add_argument(
            "--download",
            action="store_true",
            help="Download the model if needed, but don't load it",
        )
        parser.add_argument(
            "--subfolder",
            default=None,
            help="Subfolder where model is located <LEMONADE CACHE>/oga_models/<MODELNAME>"
            "/<SUBFOLDER>, default is <EP for device>-<dtype>. The EPs are: "
            f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.',
        )
        return parser

    def run(
        self,
        state: State,
        input: str,
        input_path: str = "",
        device: str = "igpu",
        dtype: str = "int4",
        int4_block_size: int = None,
        force: bool = False,
        download: bool = False,
        subfolder: str = None,
    ) -> State:
        """Download/build the requested OGA model if needed and (unless
        download-only) load it, storing the model and tokenizer on state.

        Raises ValueError for unsupported (device, dtype, checkpoint)
        combinations and RuntimeError for missing NPU/hybrid environment
        setup.
        """
        checkpoint = input
        state.checkpoint = checkpoint
        # See whether the device;dtype;checkpoint combination is supported for download from HF
        hf_supported_models = {
            "cpu": {"int4": "*/*", "fp32": "*/*"},
            "igpu": {"int4": "*/*", "fp16": "*/*"},
            "npu": {"int4": "amd/**-onnx-ryzen-strix"},
            "hybrid": {"int4": "amd/**-hybrid"},
            "cuda": {"int4": "*/*", "fp16": "*/*"},
        }
        hf_supported = (
            device in hf_supported_models
            and dtype in hf_supported_models[device]
            and fnmatch(checkpoint, hf_supported_models[device][dtype])
        )
        # Check to see if the model already exists locally
        if subfolder is None:
            subfolder = f"{execution_providers[device]}-{dtype}"
            # int4 builds with a custom block size get their own subfolder
            # so different block sizes don't collide in the cache.
            subfolder += (
                f"-block-{int4_block_size}"
                if dtype == "int4" and int4_block_size is not None
                else ""
            )
        oga_models_subfolder = os.path.join(
            checkpoint.replace("/", "_").lower(), subfolder
        )
        full_model_path = os.path.join(
            state.cache_dir, oga_models_path, oga_models_subfolder
        )
        # Treat an empty directory the same as a missing one.
        model_exists_locally = os.path.isdir(full_model_path) and os.listdir(
            full_model_path
        )
        # Check if model needs to be downloaded and/or built or rebuilt
        if not model_exists_locally or force:
            if not hf_supported:
                # Download/build can't be done
                raise ValueError(
                    "The (device, dtype, checkpoint) combination is not supported: "
                    f"({device}, {dtype}, {checkpoint}). The supported combinations "
                    f"for Hugging Face models are "
                    + ", ".join(
                        [
                            f"({dev}, {dt}, {hf_supported_models[dev][dt]})"
                            for dev in hf_supported_models.keys()
                            for dt in hf_supported_models[dev]
                        ]
                    )
                    + "."
                )
            # Download the model from HF
            if device == "npu" or device == "hybrid":
                # NPU models on HF are ready to go and HF does its own caching
                full_model_path = snapshot_download(
                    repo_id=checkpoint,
                    ignore_patterns=["*.md", "*.txt"],
                )
                # Model lives in the HF cache, not the lemonade cache, so
                # there is no subfolder stat to report.
                oga_models_subfolder = None
                if device == "hybrid":
                    # Locate the directory containing hybrid-llm-artifacts_1.3.0 in the system PATH
                    hybrid_artifacts_path = None
                    hybrid_artifacts_path = os.environ.get("AMD_OGA_HYBRID")
                    if hybrid_artifacts_path is None:
                        raise RuntimeError(
                            "Could not find hybrid-llm-artifacts_1.3.0 in system PATH. "
                            "Please ensure it is added to your PATH environment variable."
                        )
                    if hybrid_artifacts_path:
                        # Construct the path to onnx_custom_ops.dll
                        custom_ops_path = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnx_utils",
                            "bin",
                            "onnx_custom_ops.dll",
                        )
                        config_path = os.path.join(full_model_path, "genai_config.json")
                        # Check if the config file exists
                        if os.path.exists(config_path):
                            with open(config_path, "r", encoding="utf-8") as f:
                                config = json.load(f)
                            # Modify the custom_ops_library under decoder -> session_options
                            if (
                                "model" in config
                                and "decoder" in config["model"]
                                and "session_options" in config["model"]["decoder"]
                            ):
                                config["model"]["decoder"]["session_options"][
                                    "custom_ops_library"
                                ] = custom_ops_path
                            # Write the changes back to the file
                            with open(config_path, "w", encoding="utf-8") as f:
                                json.dump(config, f, indent=4)
                        # Copy DirectML.dll from lib to bin folder
                        src_dll = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnxruntime_genai",
                            "lib",
                            "DirectML.dll",
                        )
                        dst_dll = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnx_utils",
                            "bin",
                            "DirectML.dll",
                        )
                        # Create destination directory if it doesn't exist
                        os.makedirs(os.path.dirname(dst_dll), exist_ok=True)
                        shutil.copy2(src_dll, dst_dll)
            else:
                # device is 'cpu' or 'igpu'
                # Use model_builder to download model and convert to ONNX
                printing.log_info(f"Building {checkpoint} for {device} using {dtype}")
                extra_options = {}
                if int4_block_size is not None:
                    extra_options["int4-block-size"] = int4_block_size
                try:
                    model_builder.create_model(
                        checkpoint,  # model_name
                        input_path,  # input_path
                        full_model_path,  # output_path
                        dtype,  # precision
                        execution_providers[device],  # execution_provider
                        os.path.join(
                            state.cache_dir, oga_model_builder_cache_path
                        ),  # cache_dir
                        **extra_options,
                    )
                except NotImplementedError as e:
                    # Model architecture is not supported by model builder
                    raise NotImplementedError("[Model builder] " + str(e)) from e
                except OSError as e:
                    # Model is not found either locally nor in HF repository
                    raise ValueError("[Model builder] " + str(e)) from e
        if not download:
            # The download only flag is not set, so load model
            if device == "npu":
                # NPU execution needs the AMD OGA runtime files; validate the
                # environment before mutating cwd/PATH.
                if "AMD_OGA" not in os.environ:
                    raise RuntimeError(
                        "Please set environment variable AMD_OGA to the path of the amd_oga files"
                    )
                # Check AMD_OGA points to oga library files
                oga_path = os.environ["AMD_OGA"]
                if not os.path.exists(
                    os.path.join(oga_path, "libs", "onnxruntime.dll")
                ):
                    raise RuntimeError(
                        f"Cannot find libs/onnxruntime.dll in AMD_OGA folder: {oga_path}"
                    )
                # Save current directory and PATH
                saved_cwd = os.getcwd()
                saved_path = os.environ["PATH"]
                # Change to the AMD_OGA distribution directory
                os.chdir(oga_path)
                os.environ["PATH"] += os.pathsep + os.path.join(
                    os.environ["AMD_OGA"], "libs"
                )
                # Common environment variables for all NPU models
                os.environ["DD_ROOT"] = ".\\bins"
                os.environ["DEVICE"] = "stx"
                os.environ["XLNX_ENABLE_CACHE"] = "0"
                # Phi models require USE_AIE_RoPE=0
                if "phi-" in checkpoint.lower():
                    os.environ["USE_AIE_RoPE"] = "0"
                else:
                    os.environ["USE_AIE_RoPE"] = "1"
            state.model = OrtGenaiModel(full_model_path)
            state.tokenizer = OrtGenaiTokenizer(state.model.model)
            state.dtype = dtype
            state.save_stat(Keys.CHECKPOINT, checkpoint)
            state.save_stat(Keys.DTYPE, dtype)
            state.save_stat(Keys.DEVICE, device)
            if oga_models_subfolder is not None:
                state.save_stat(Keys.OGA_MODELS_SUBFOLDER, oga_models_subfolder)
            # Create a UniqueInvocationInfo and ModelInfo so that we can display status
            # at the end of the sequence
            status.add_to_state(state=state, name=input, model=input)
            if device == "npu":
                # Restore cwd and PATH
                os.chdir(saved_cwd)
                os.environ["PATH"] = saved_path
        return state