-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathoga.py
More file actions
549 lines (461 loc) · 20.7 KB
/
oga.py
File metadata and controls
549 lines (461 loc) · 20.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
# onnxruntime_genai is not lint-friendly yet and PyLint can't
# find any of the class methods
# pylint: disable=no-member
#
# Model builder constraints:
# 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source)
# (transformers v4.45 changes the format of the tokenizer.json file which will be supported in
# onnxruntime-genai 0.5)
#
import argparse
import os
import time
import json
import shutil
from fnmatch import fnmatch
from queue import Queue
from packaging.version import Version
from huggingface_hub import snapshot_download
import onnxruntime_genai as og
import onnxruntime_genai.models.builder as model_builder
from turnkeyml.state import State
from turnkeyml.tools import FirstTool
import turnkeyml.common.status as status
import turnkeyml.common.printing as printing
from lemonade.tools.adapter import (
ModelAdapter,
TokenizerAdapter,
PassthroughTokenizerResult,
)
from lemonade.cache import Keys
# ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder
oga_models_path = "oga_models"
# ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache
oga_model_builder_cache_path = "model_builder"
# Mapping from processor to execution provider, used in pathnames and by model_builder
execution_providers = {
    "cpu": "cpu",
    "npu": "npu",
    "igpu": "dml",
    "hybrid": "hybrid",
    "cuda": "cuda",
}
class OrtGenaiTokenizer(TokenizerAdapter):
    """Adapter that exposes an onnxruntime-genai tokenizer through the
    TokenizerAdapter interface used by the rest of lemonade."""

    def __init__(self, model: og.Model):
        # Build the OGA tokenizer for this model.
        self.tokenizer = og.Tokenizer(model)
        # Placeholder value since some callers query this attribute.
        # If a proper value is ever required,
        # og.GeneratorParams.eos_token_id has it.
        self.eos_token_id = None

    def __call__(self, prompt: str, return_tensors="np"):
        # Encode the prompt and hand the raw token ids back wrapped in the
        # passthrough result type the callers expect.
        return PassthroughTokenizerResult(self.tokenizer.encode(prompt))

    # onnxruntime_genai's tokenizer does not accept any options yet, so
    # skip_special_tokens is accepted for interface compatibility and
    # ignored, hopefully without a major negative effect.
    # pylint: disable=unused-argument
    def decode(self, response, skip_special_tokens=True) -> str:
        decoded_text = self.tokenizer.decode(response)
        return decoded_text
class OrtGenaiStreamer:
    """Queue-backed streamer of decoded text fragments.

    A producer thread pushes fragments with add_text() and marks the end of
    the stream with done(); a consumer iterates over the instance and the
    iteration stops once the stop sentinel is observed.
    """

    def __init__(self, tokenizer: OrtGenaiTokenizer, timeout=None):
        self.tokenizer = tokenizer
        self.text_queue = Queue()
        # Sentinel placed on the queue to signal end-of-stream.
        self.stop_signal = None
        self.timeout = timeout

    def add_text(self, text: str):
        # Producer side: enqueue one decoded text fragment.
        self.text_queue.put(text, timeout=self.timeout)

    def done(self):
        # Producer side: enqueue the sentinel so iteration terminates.
        self.text_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        item = self.text_queue.get(timeout=self.timeout)
        if item == self.stop_signal:
            raise StopIteration()
        return item
class OrtGenaiModel(ModelAdapter):
    """ModelAdapter wrapper around an onnxruntime-genai (OGA) model.

    Loads the OGA model from a local folder and, if present, the
    genai_config.json in that folder so that generate() can honor any
    "search" options baked into the model package.
    """

    def __init__(self, input_folder):
        # input_folder: local directory containing the OGA model files.
        super().__init__()
        self.model = og.Model(input_folder)
        # Identifies this adapter type to downstream tools.
        self.type = "ort-genai"
        # Parsed genai_config.json contents, or None if the file is absent.
        self.config = self.load_config(input_folder)

    def load_config(self, input_folder):
        """Return the parsed genai_config.json from input_folder, or None
        if the file does not exist."""
        config_path = os.path.join(input_folder, "genai_config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                return json.load(f)
        return None

    def generate(
        self,
        input_ids,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        top_p=1.0,
        temperature=0.7,
        streamer: OrtGenaiStreamer = None,
        pad_token_id=None,
        stopping_criteria=None,
    ):
        """Generate tokens from input_ids with the OGA generator.

        When streamer is None, runs to completion, records
        time_to_first_token and tokens_per_second on self, and returns the
        full sequence as a single-element list. When a streamer is given,
        decoded text fragments are pushed to it as they are produced and
        nothing is returned; stopping_criteria[0].stop_event may be set by
        another thread to end generation early.
        """
        params = og.GeneratorParams(self.model)
        # There is a breaking API change in OGA 0.6.0
        # Determine whether we should use the old or new APIs
        # This also supports 0.6.0.dev0, which evaluates to less than 0.6.0 in Version
        use_oga_post_6_api = (
            Version(og.__version__) >= Version("0.6.0") or "0.6.0" in og.__version__
        )
        use_oga_pre_6_api = not use_oga_post_6_api
        # NOTE(review): truthiness test means pad_token_id=0 is ignored —
        # presumably intentional, but confirm if token id 0 is ever a valid pad.
        if pad_token_id:
            params.pad_token_id = pad_token_id
        max_length = len(input_ids) + max_new_tokens
        if use_oga_pre_6_api:
            # Pre-0.6.0 OGA takes the prompt via GeneratorParams.
            params.input_ids = input_ids
        if self.config and "search" in self.config:
            # Model package ships its own search options; they take
            # precedence over this call's arguments.
            search_config = self.config["search"]
            params.set_search_options(
                do_sample=search_config.get("do_sample", do_sample),
                top_k=search_config.get("top_k", top_k),
                top_p=search_config.get("top_p", top_p),
                temperature=search_config.get("temperature", temperature),
                max_length=max_length,
                min_length=0,
                early_stopping=search_config.get("early_stopping", False),
                length_penalty=search_config.get("length_penalty", 1.0),
                num_beams=search_config.get("num_beams", 1),
                num_return_sequences=search_config.get("num_return_sequences", 1),
                repetition_penalty=search_config.get("repetition_penalty", 1.0),
                past_present_share_buffer=search_config.get(
                    "past_present_share_buffer", True
                ),
                # Not currently supported by OGA
                # diversity_penalty=search_config.get('diversity_penalty', 0.0),
                # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
            )
        else:
            params.set_search_options(
                do_sample=do_sample,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                max_length=max_length,
                min_length=max_length,
            )
        params.try_graph_capture_with_max_batch_size(1)
        generator = og.Generator(self.model, params)
        if use_oga_post_6_api:
            # Post-0.6.0 OGA takes the prompt via append_tokens.
            generator.append_tokens(input_ids)
        if streamer is None:
            # Non-streaming path: time the prompt processing (first token)
            # separately from subsequent token generation.
            prompt_start_time = time.perf_counter()
            if use_oga_pre_6_api:
                generator.compute_logits()
            generator.generate_next_token()
            prompt_end_time = time.perf_counter()
            self.time_to_first_token = prompt_end_time - prompt_start_time
            if max_new_tokens > 1:
                token_gen_times = []
                while not generator.is_done():
                    token_gen_start_time = time.perf_counter()
                    if use_oga_pre_6_api:
                        generator.compute_logits()
                    generator.generate_next_token()
                    token_gen_end_time = time.perf_counter()
                    token_gen_times.append(token_gen_end_time - token_gen_start_time)
                if token_gen_times:
                    # List will be empty if we generated 1 or 0 tokens, and we don't
                    # want a divide-by-zero error in those cases
                    avg_token_gen_latency_s = sum(token_gen_times) / len(
                        token_gen_times
                    )
                    self.tokens_per_second = 1 / avg_token_gen_latency_s
            return [generator.get_sequence(0)]
        else:
            # Streaming path: decode each token as it is produced and push
            # the text fragment to the streamer's queue.
            tokenizer_stream = streamer.tokenizer.tokenizer.create_stream()
            stop_early = False
            while not generator.is_done() and not stop_early:
                if use_oga_pre_6_api:
                    generator.compute_logits()
                generator.generate_next_token()
                new_token = generator.get_next_tokens()[0]
                new_text = tokenizer_stream.decode(new_token)
                streamer.add_text(new_text)
                if stopping_criteria is not None:
                    # Caller-controlled early stop (e.g. a cancel button).
                    if stopping_criteria[0].stop_event.is_set():
                        stop_early = True
            # Always signal end-of-stream so the consumer's iteration ends.
            streamer.done()
class OgaLoad(FirstTool):
    """
    Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers.
    Input: path to a checkpoint.
    Supported choices for cpu and igpu from HF model repository:
        LLM models on Huggingface supported by model_builder. See documentation
        (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported
        models.
    Supported choices for npu from HF model repository:
        Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
    Local models for cpu, igpu, or npu:
        The specified checkpoint is converted to a local path, via mapping to lower case
        and replacing '/' with '_'. If this model already exists in the 'models' folder
        of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
        will be used. If the --force flag is used and the model is built with model_builder,
        then it will be rebuilt.
    Output:
        state.model: handle to a Huggingface-style LLM loaded on DirectML device
        state.tokenizer = Huggingface-style LLM tokenizer instance
        state.dtype = data type of the model on DirectML device
        state.checkpoint = name of the checkpoint used to load state.model
    Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
            If that library is not installed, this tool will not load.
    """

    unique_name = "oga-load"

    def __init__(self):
        super().__init__(monitor_message="Loading OnnxRuntime-GenAI model")
        # Stats keys that this tool reports for status display.
        self.status_stats = [Keys.DTYPE, Keys.DEVICE, Keys.OGA_MODELS_SUBFOLDER]

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        """Build the argparse parser for the oga-load tool's CLI options."""
        parser = __class__.helpful_parser(
            short_description="Load model in onnxruntime-genai (OGA)",
            add_help=add_help,
        )
        parser.add_argument(
            "-ip",
            "--input_path",
            default="",
            help="the local huggingface model in your disk",
        )
        parser.add_argument(
            "-d",
            "--device",
            choices=["igpu", "npu", "cpu", "hybrid", "cuda"],
            default="igpu",
            help="Which device to load the model on to (default: igpu)",
        )
        parser.add_argument(
            "--dtype",
            choices=["int4", "fp16", "fp32"],
            required=True,
            help="Data type to load the model in",
        )
        parser.add_argument(
            "--int4-block-size",
            default=None,
            help="Specify the block_size for int4 quantization.",
            choices=[16, 32, 64, 128, 256],
            type=int,
        )
        parser.add_argument(
            "--force",
            action="store_true",
            help="Forces downloading of Hugging-Face model again (if changed). Additionally for"
            " cpu and igpu devices only, forces model_builder to run again on the HF model"
            " (changed or not).",
        )
        parser.add_argument(
            "--download",
            action="store_true",
            help="Download the model if needed, but don't load it",
        )
        parser.add_argument(
            "--subfolder",
            default=None,
            help="Subfolder where model is located <LEMONADE CACHE>/oga_models/<MODELNAME>"
            "/<SUBFOLDER>, default is <EP for device>-<dtype>. The EPs are: "
            f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.',
        )
        return parser

    def run(
        self,
        state: State,
        input: str,
        input_path: str = "",
        device: str = "igpu",
        dtype: str = "int4",
        int4_block_size: int = None,
        force: bool = False,
        download: bool = False,
        subfolder: str = None,
    ) -> State:
        """Download/build the requested OGA model if needed and (unless
        download-only) load it, storing the model and tokenizer on state.

        Raises ValueError for unsupported (device, dtype, checkpoint)
        combinations and RuntimeError for missing NPU/hybrid environment
        setup.
        """
        checkpoint = input
        state.checkpoint = checkpoint
        # See whether the device;dtype;checkpoint combination is supported for download from HF
        hf_supported_models = {
            "cpu": {"int4": "*/*", "fp32": "*/*"},
            "igpu": {"int4": "*/*", "fp16": "*/*"},
            "npu": {"int4": "amd/**-onnx-ryzen-strix"},
            "hybrid": {"int4": "amd/**-hybrid"},
            "cuda": {"int4": "*/*", "fp16": "*/*"},
        }
        hf_supported = (
            device in hf_supported_models
            and dtype in hf_supported_models[device]
            and fnmatch(checkpoint, hf_supported_models[device][dtype])
        )
        # Check to see if the model already exists locally
        if subfolder is None:
            subfolder = f"{execution_providers[device]}-{dtype}"
            # int4 builds with a custom block size get their own subfolder
            # so different block sizes don't collide in the cache.
            subfolder += (
                f"-block-{int4_block_size}"
                if dtype == "int4" and int4_block_size is not None
                else ""
            )
        oga_models_subfolder = os.path.join(
            checkpoint.replace("/", "_").lower(), subfolder
        )
        full_model_path = os.path.join(
            state.cache_dir, oga_models_path, oga_models_subfolder
        )
        # Treat an empty directory the same as a missing one.
        model_exists_locally = os.path.isdir(full_model_path) and os.listdir(
            full_model_path
        )
        # Check if model needs to be downloaded and/or built or rebuilt
        if not model_exists_locally or force:
            if not hf_supported:
                # Download/build can't be done
                raise ValueError(
                    "The (device, dtype, checkpoint) combination is not supported: "
                    f"({device}, {dtype}, {checkpoint}). The supported combinations "
                    f"for Hugging Face models are "
                    + ", ".join(
                        [
                            f"({dev}, {dt}, {hf_supported_models[dev][dt]})"
                            for dev in hf_supported_models.keys()
                            for dt in hf_supported_models[dev]
                        ]
                    )
                    + "."
                )
            # Download the model from HF
            if device == "npu" or device == "hybrid":
                # NPU models on HF are ready to go and HF does its own caching
                full_model_path = snapshot_download(
                    repo_id=checkpoint,
                    ignore_patterns=["*.md", "*.txt"],
                )
                # Model lives in the HF cache, not the lemonade cache, so
                # there is no subfolder stat to report.
                oga_models_subfolder = None
                if device == "hybrid":
                    # Locate the directory containing hybrid-llm-artifacts_1.3.0 in the system PATH
                    hybrid_artifacts_path = None
                    hybrid_artifacts_path = os.environ.get("AMD_OGA_HYBRID")
                    if hybrid_artifacts_path is None:
                        raise RuntimeError(
                            "Could not find hybrid-llm-artifacts_1.3.0 in system PATH. "
                            "Please ensure it is added to your PATH environment variable."
                        )
                    if hybrid_artifacts_path:
                        # Construct the path to onnx_custom_ops.dll
                        custom_ops_path = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnx_utils",
                            "bin",
                            "onnx_custom_ops.dll",
                        )
                        config_path = os.path.join(full_model_path, "genai_config.json")
                        # Check if the config file exists
                        if os.path.exists(config_path):
                            with open(config_path, "r", encoding="utf-8") as f:
                                config = json.load(f)
                            # Modify the custom_ops_library under decoder -> session_options
                            if (
                                "model" in config
                                and "decoder" in config["model"]
                                and "session_options" in config["model"]["decoder"]
                            ):
                                config["model"]["decoder"]["session_options"][
                                    "custom_ops_library"
                                ] = custom_ops_path
                            # Write the changes back to the file
                            with open(config_path, "w", encoding="utf-8") as f:
                                json.dump(config, f, indent=4)
                        # Copy DirectML.dll from lib to bin folder
                        src_dll = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnxruntime_genai",
                            "lib",
                            "DirectML.dll",
                        )
                        dst_dll = os.path.join(
                            hybrid_artifacts_path,
                            "hybrid-llm-artifacts",
                            "onnx_utils",
                            "bin",
                            "DirectML.dll",
                        )
                        # Create destination directory if it doesn't exist
                        os.makedirs(os.path.dirname(dst_dll), exist_ok=True)
                        shutil.copy2(src_dll, dst_dll)
            else:
                # device is 'cpu' or 'igpu'
                # Use model_builder to download model and convert to ONNX
                printing.log_info(f"Building {checkpoint} for {device} using {dtype}")
                extra_options = {}
                if int4_block_size is not None:
                    extra_options["int4-block-size"] = int4_block_size
                try:
                    model_builder.create_model(
                        checkpoint,  # model_name
                        input_path,  # input_path
                        full_model_path,  # output_path
                        dtype,  # precision
                        execution_providers[device],  # execution_provider
                        os.path.join(
                            state.cache_dir, oga_model_builder_cache_path
                        ),  # cache_dir
                        **extra_options,
                    )
                except NotImplementedError as e:
                    # Model architecture is not supported by model builder
                    raise NotImplementedError("[Model builder] " + str(e)) from e
                except OSError as e:
                    # Model is not found either locally nor in HF repository
                    raise ValueError("[Model builder] " + str(e)) from e
        if not download:
            # The download only flag is not set, so load model
            if device == "npu":
                # NPU execution needs the AMD OGA runtime files; validate the
                # environment before mutating cwd/PATH.
                if "AMD_OGA" not in os.environ:
                    raise RuntimeError(
                        "Please set environment variable AMD_OGA to the path of the amd_oga files"
                    )
                # Check AMD_OGA points to oga library files
                oga_path = os.environ["AMD_OGA"]
                if not os.path.exists(
                    os.path.join(oga_path, "libs", "onnxruntime.dll")
                ):
                    raise RuntimeError(
                        f"Cannot find libs/onnxruntime.dll in AMD_OGA folder: {oga_path}"
                    )
                # Save current directory and PATH
                saved_cwd = os.getcwd()
                saved_path = os.environ["PATH"]
                # Change to the AMD_OGA distribution directory
                os.chdir(oga_path)
                os.environ["PATH"] += os.pathsep + os.path.join(
                    os.environ["AMD_OGA"], "libs"
                )
                # Common environment variables for all NPU models
                os.environ["DD_ROOT"] = ".\\bins"
                os.environ["DEVICE"] = "stx"
                os.environ["XLNX_ENABLE_CACHE"] = "0"
                # Phi models require USE_AIE_RoPE=0
                if "phi-" in checkpoint.lower():
                    os.environ["USE_AIE_RoPE"] = "0"
                else:
                    os.environ["USE_AIE_RoPE"] = "1"
            state.model = OrtGenaiModel(full_model_path)
            state.tokenizer = OrtGenaiTokenizer(state.model.model)
            state.dtype = dtype
            state.save_stat(Keys.CHECKPOINT, checkpoint)
            state.save_stat(Keys.DTYPE, dtype)
            state.save_stat(Keys.DEVICE, device)
            if oga_models_subfolder is not None:
                state.save_stat(Keys.OGA_MODELS_SUBFOLDER, oga_models_subfolder)
            # Create a UniqueInvocationInfo and ModelInfo so that we can display status
            # at the end of the sequence
            status.add_to_state(state=state, name=input, model=input)
            if device == "npu":
                # Restore cwd and PATH
                os.chdir(saved_cwd)
                os.environ["PATH"] = saved_path
        return state