Add support for using a Llama.cpp binary and model from TurnkeyML (#234)
Showing 4 changed files with 254 additions and 2 deletions.
@@ -0,0 +1,48 @@
# LLAMA.CPP

Run transformer models using a Llama.cpp binary and checkpoint. The model can then be used for chatting or with benchmarks such as MMLU.

## Prerequisites

This flow has been verified with a generic Llama.cpp model.

These instructions are only for Linux or Windows with WSL. It may be necessary to run WSL from an Administrator command prompt.

These instructions also assume that TurnkeyML's LLM extensions have been installed (for example, with `pip install -e .[llm]`).
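For reference, that install step looks like the following. The checkout path is an assumption about your local layout, and quoting the extra keeps shells such as zsh from expanding the brackets.

```bash
# Run from the root of a TurnkeyML checkout (path assumed);
# quoting the extra avoids bracket expansion in shells such as zsh.
cd ~/turnkeyml
pip install -e ".[llm]"
```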
### Set up Environment (Assumes TurnkeyML is already installed)

Build or obtain the Llama.cpp binary and a desired checkpoint.
For example (see the [llama.cpp build documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) for more details):
1. cd ~
1. git clone https://github.com/ggerganov/llama.cpp
1. cd llama.cpp
1. make
1. cd models
1. wget https://huggingface.co/TheBloke/Dolphin-Llama2-7B-GGUF/resolve/main/dolphin-llama2-7b.Q5_K_M.gguf
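After these steps, the llama-cli binary is typically at the top of the llama.cpp checkout (see the Windows note further below for an alternative location) and the checkpoint is under models/. As an optional sanity check, the binary can be run directly before wiring it into lemonade; the paths assume the steps above and the flags are standard llama.cpp options.

```bash
# Optional sanity check (paths assume the steps above):
# run the llama.cpp CLI directly on the downloaded checkpoint.
~/llama.cpp/llama-cli -m ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf -p "Hello" -n 16
```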
## Usage

The Llama.cpp tool currently supports the following parameters:

| Parameter | Definition | Default |
| --------- | ---------------------------------------------------- | ------- |
| executable | Path to the Llama.cpp-generated application binary | None |
| model-binary | Model checkpoint (do not use if --input is passed to lemonade) | None |
| threads | Number of threads to use for computation | 1 |
| context-size | Maximum context length | 512 |
| temp | Temperature to use for inference (leave out to use the application default) | None |
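Under the hood, the adapter code later in this commit translates these parameters into a llama.cpp CLI invocation. The sketch below is illustrative only: the prompt text and the --n-predict value are supplied by the calling tool at runtime, and the executable and model paths are taken from the example above.

```bash
# Illustrative command roughly as constructed by LlamaCppAdapter.generate;
# the prompt and --n-predict are filled in by the calling tool at runtime.
~/llama.cpp/llama-cli -e \
  --ctx-size 512 \
  --threads 1 \
  --model ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf \
  --n-predict 5 \
  --prompt "Question: What is the capital of France?"
```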
### Example (assuming Llama.cpp is built and a checkpoint downloaded as above)

```bash
lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~/llama.cpp/llama-cli accuracy-mmlu --ntrain 5
```

On Windows, the llama.cpp binary might be in a different location (such as llama.cpp\build\bin\Release\), in which case the command might be something like:
```bash
lemonade --input ~\llama.cpp\models\dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~\llama.cpp\build\bin\Release\llama-cli accuracy-mmlu --ntrain 5
```
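For chatting rather than benchmarking, the same load step can be chained into a prompt tool instead of accuracy-mmlu. The sketch below assumes lemonade provides an llm-prompt tool with a -p flag; that tool is not shown in this change, so check `lemonade -h` for the exact tool names available in your install.

```bash
# Assumed sketch: llm-prompt and -p are not shown in this change; verify with `lemonade -h`.
lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~/llama.cpp/llama-cli llm-prompt -p "Hello, my name is"
```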
@@ -0,0 +1,183 @@
import argparse
import os
import subprocess
from typing import Optional

from turnkeyml.state import State
from turnkeyml.tools import FirstTool

import turnkeyml.common.build as build
from .adapter import PassthroughTokenizer, ModelAdapter


def llamacpp_dir(state: State):
    return os.path.join(build.output_dir(state.cache_dir, state.build_name), "llamacpp")


class LlamaCppAdapter(ModelAdapter):
    unique_name = "llama-cpp-adapter"

    def __init__(self, executable, model, tool_dir, context_size, threads, temp):
        super().__init__()

        self.executable = executable
        self.model = model
        self.tool_dir = tool_dir
        self.context_size = context_size
        self.threads = threads
        self.temp = temp

    def generate(self, input_ids: str, max_new_tokens: Optional[int] = None):
        """
        Pass a text prompt into the llamacpp inference CLI.
        The input_ids arg here should receive the original text that
        would normally be encoded by a tokenizer.
        """

        cmd = [
            self.executable,
            "-e",
        ]

        optional_params = {
            "ctx-size": self.context_size,
            "n-predict": max_new_tokens,
            "threads": self.threads,
            "model": self.model,
            "prompt": input_ids,
            "temp": self.temp,
        }

        # Pass each flag and its value as separate argv entries so the
        # llama.cpp CLI can parse them.
        for flag, value in optional_params.items():
            if value is not None:
                cmd.extend([f"--{flag}", value])

        cmd = [str(m) for m in cmd]

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )

        raw_output, raw_err = process.communicate()

        if process.returncode != 0:
            raise subprocess.CalledProcessError(
                process.returncode, process.args, raw_output, raw_err)

        # The CLI echoes the prompt before the completion: skip output until
        # the first line of the prompt appears, then collect everything after it.
        prompt_found = False
        output_text = ""
        prompt_first_line = input_ids.split("\n")[0]
        for line in raw_output.splitlines():
            if prompt_first_line in line:
                prompt_found = True
            if prompt_found:
                line = line.replace("</s> [end of text]", "")
                output_text = output_text + line

        if not prompt_found:
            raise Exception("Prompt not found in result, this is a bug in lemonade.")

        return [output_text]


class LoadLlamaCpp(FirstTool):
    unique_name = "load-llama-cpp"

    def __init__(self):
        super().__init__(monitor_message="Running llama.cpp model")

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        parser = __class__.helpful_parser(
            short_description="Wrap Llamacpp models with an API",
            add_help=add_help,
        )

        parser.add_argument(
            "--executable",
            required=True,
            type=str,
            help="Executable name",
        )

        default_threads = 1
        parser.add_argument(
            "--threads",
            required=False,
            type=int,
            default=default_threads,
            help=f"Number of threads to use for generation (default: {default_threads})",
        )

        context_size = 512
        parser.add_argument(
            "--context-size",
            required=False,
            type=int,
            default=context_size,
            help=f"Context size of the prompt (default: {context_size})",
        )

        parser.add_argument(
            "--model-binary",
            required=False,
            help="Path to a .gguf model to use with benchmarking.",
        )

        parser.add_argument(
            "--temp",
            type=float,
            required=False,
            help="Temperature",
        )

        return parser

    def run(
        self,
        state: State,
        input: str = None,
        context_size: int = None,
        threads: int = None,
        executable: str = None,
        model_binary: str = None,
        temp: float = None,
    ) -> State:
        """
        Create a tokenizer instance and model instance in `state` that support:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        response = model.generate(input_ids, max_new_tokens=1)
        response_text = tokenizer.decode(response[0], skip_special_tokens=True).strip()
        """

        if executable is None:
            raise Exception(f"{self.__class__.unique_name} requires an executable")

        # A model passed via lemonade's --input takes precedence over --model-binary
        if input is not None and input != "":
            model_binary = input

        # Save execution parameters
        state.save_stat("context_size", context_size)
        state.save_stat("threads", threads)

        if model_binary is None:
            raise Exception(
                f"{self.__class__.unique_name} requires the preceding tool to pass a "
                "Llamacpp model, "
                "or for the user to supply a model with `--model-binary`"
            )

        state.model = LlamaCppAdapter(
            executable=executable,
            model=model_binary,
            tool_dir=llamacpp_dir(state),
            context_size=context_size,
            threads=threads,
            temp=temp,
        )
        state.tokenizer = PassthroughTokenizer()

        return state
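To show how these pieces fit together, here is a rough, hypothetical sketch of driving LoadLlamaCpp programmatically, following the interface promised by the run() docstring. In practice lemonade's sequence runner creates the State and invokes the tool, so the pre-existing `state` object and the example paths below are assumptions rather than part of this change.

```python
import os

# Hypothetical usage sketch; `state` is assumed to be a turnkeyml State created
# by lemonade's sequence runner (its construction is not shown in this change).
state = LoadLlamaCpp().run(
    state,
    input=os.path.expanduser("~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf"),
    executable=os.path.expanduser("~/llama.cpp/llama-cli"),
    context_size=512,
    threads=1,
)

prompt = "Hello, my name is"
# PassthroughTokenizer presumably passes the raw prompt text straight through.
input_ids = state.tokenizer(prompt, return_tensors="pt").input_ids
response = state.model.generate(input_ids, max_new_tokens=16)
print(state.tokenizer.decode(response[0], skip_special_tokens=True).strip())
```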