Merge from upstream #2

Open

wants to merge 12 commits into base: main

4 changes: 1 addition & 3 deletions .github/workflows/build-wheels-cuda.yaml
@@ -61,11 +61,9 @@ jobs:
- name: Setup Mamba
uses: conda-incubator/[email protected]
with:
activate-environment: "build"
activate-environment: "llamacpp"
python-version: ${{ matrix.pyver }}
miniforge-variant: Mambaforge
miniforge-version: latest
use-mamba: true
add-pip-as-python-dependency: true
auto-activate-base: false

14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.9]

- feat: Update llama.cpp to ggerganov/llama.cpp@8733e0cf6eefc7c7752297cc22d0836706f4222c

## [0.3.8]

- feat: Update llama.cpp to ggerganov/llama.cpp@7841fc723e059d1fd9640e5c0ef19050fcc7c698

## [0.3.7]

- feat: Update llama.cpp to ggerganov/llama.cpp@794fe23f29fb40104975c91fe19f23798f7c726e
- fix(ci): Fix the CUDA workflow by @oobabooga in #1894
- fix: error showing time spent in llama perf context print, adds `no_perf` flag to `Llama` class by @shakalaca in #1898

## [0.3.6]

- feat: Update llama.cpp to ggerganov/llama.cpp@f7cd13301c2a88f97073fd119072b4cc92c08df1
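The 0.3.7 entry above adds a `no_perf` flag to the `Llama` class to suppress the perf-context timing output. A minimal usage sketch, assuming the standard `Llama(model_path=...)` constructor and a placeholder GGUF path:

```python
from llama_cpp import Llama

# "./models/example.gguf" is a placeholder path; no_perf=True disables the
# llama perf context timing print described in the 0.3.7 changelog entry.
llm = Llama(model_path="./models/example.gguf", no_perf=True)
out = llm("Q: What is 2 + 2? A:", max_tokens=8)
print(out["choices"][0]["text"])
```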
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -62,6 +62,9 @@ if (LLAMA_BUILD)
# Enable building of the common library
set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)

# Disable building curl support
set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: enable curl" FORCE)

# Architecture detection and settings for Apple platforms
if (APPLE)
# Get the target architecture
@@ -143,7 +146,7 @@ if (LLAMA_BUILD)
endif()

# Building llava
add_subdirectory(vendor/llama.cpp/examples/llava)
add_subdirectory(vendor/llama.cpp/tools/mtmd)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")

if (WIN32)
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.6"
__version__ = "0.3.9"
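A quick sanity check that the version bump in this file took effect after installing the branch (a trivial sketch, no assumptions beyond the string above):

```python
import llama_cpp

# The diff bumps the package version from 0.3.6 to 0.3.9.
print(llama_cpp.__version__)  # expected: 0.3.9
```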
181 changes: 101 additions & 80 deletions llama_cpp/_internals.py
@@ -55,7 +55,13 @@ def __init__(
if model is None:
raise ValueError(f"Failed to load model from file: {path_model}")

vocab = llama_cpp.llama_model_get_vocab(model)

if vocab is None:
raise ValueError(f"Failed to get vocab from model: {path_model}")

self.model = model
self.vocab = vocab

def free_model():
if self.model is None:
@@ -75,7 +81,7 @@ def vocab_type(self) -> int:
return llama_cpp.llama_vocab_type(self.model)

def n_vocab(self) -> int:
return llama_cpp.llama_n_vocab(self.model)
return llama_cpp.llama_n_vocab(self.vocab)

def n_ctx_train(self) -> int:
return llama_cpp.llama_n_ctx_train(self.model)
@@ -84,7 +90,7 @@ def n_embd(self) -> int:
return llama_cpp.llama_n_embd(self.model)

def rope_freq_scale_train(self) -> float:
return llama_cpp.llama_rope_freq_scale_train(self.model)
return llama_cpp.llama_model_rope_freq_scale_train(self.model)

def desc(self) -> str:
buf = ctypes.create_string_buffer(1024)
@@ -98,67 +104,67 @@ def n_params(self) -> int:
return llama_cpp.llama_model_n_params(self.model)

def get_tensor(self, name: str) -> ctypes.c_void_p:
return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))
raise NotImplementedError("get_tensor is not implemented in llama.cpp")

# Vocab

def token_get_text(self, token: int) -> str:
return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
return llama_cpp.llama_token_get_text(self.vocab, token).decode("utf-8")

def token_get_score(self, token: int) -> float:
return llama_cpp.llama_token_get_score(self.model, token)
return llama_cpp.llama_token_get_score(self.vocab, token)

def token_get_attr(self, token: int) -> int:
return llama_cpp.llama_token_get_attr(self.model, token)
return llama_cpp.llama_token_get_attr(self.vocab, token)

# Special tokens

def token_bos(self) -> int:
return llama_cpp.llama_token_bos(self.model)
return llama_cpp.llama_token_bos(self.vocab)

def token_eos(self) -> int:
return llama_cpp.llama_token_eos(self.model)
return llama_cpp.llama_token_eos(self.vocab)

def token_cls(self) -> int:
return llama_cpp.llama_token_cls(self.model)
return llama_cpp.llama_token_cls(self.vocab)

def token_sep(self) -> int:
return llama_cpp.llama_token_sep(self.model)
return llama_cpp.llama_token_sep(self.vocab)

def token_nl(self) -> int:
return llama_cpp.llama_token_nl(self.model)
return llama_cpp.llama_token_nl(self.vocab)

def token_prefix(self) -> int:
return llama_cpp.llama_token_prefix(self.model)
raise NotImplementedError("token_prefix is not implemented in llama.cpp")

def token_middle(self) -> int:
return llama_cpp.llama_token_middle(self.model)
raise NotImplementedError("token_middle is not implemented in llama.cpp")

def token_suffix(self) -> int:
return llama_cpp.llama_token_suffix(self.model)
raise NotImplementedError("token_suffix is not implemented in llama.cpp")

def token_eot(self) -> int:
return llama_cpp.llama_token_eot(self.model)
return llama_cpp.llama_token_eot(self.vocab)

def add_bos_token(self) -> bool:
return llama_cpp.llama_add_bos_token(self.model)
return llama_cpp.llama_add_bos_token(self.vocab)

def add_eos_token(self) -> bool:
return llama_cpp.llama_add_eos_token(self.model)
return llama_cpp.llama_add_eos_token(self.vocab)

# Tokenization

def tokenize(self, text: bytes, add_bos: bool, special: bool):
n_ctx = self.n_ctx_train()
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_ctx, add_bos, special
self.vocab, text, len(text), tokens, n_ctx, add_bos, special
)
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_tokens, add_bos, special
self.vocab, text, len(text), tokens, n_tokens, add_bos, special
)
if n_tokens < 0:
raise RuntimeError(
@@ -168,7 +174,7 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool):

def token_to_piece(self, token: int, special: bool = False) -> bytes:
buf = ctypes.create_string_buffer(32)
llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special)
llama_cpp.llama_token_to_piece(self.vocab, token, buf, 32, 0, special)
return bytes(buf)

def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
@@ -177,7 +183,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
buffer = (ctypes.c_char * size)()
for token in tokens:
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token), buffer, size, 0, special
self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special
)
assert n <= size
output += bytes(buffer[:n])
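The hunks above move tokenizer-facing calls from the model handle to the vocab handle returned by `llama_model_get_vocab`. A minimal sketch of the resulting wrapper behaviour, assuming the internal `LlamaModel` class accepts `path_model` plus default model params, and with a placeholder GGUF path:

```python
import llama_cpp
from llama_cpp._internals import LlamaModel

model = LlamaModel(
    path_model="./models/example.gguf",  # placeholder path
    params=llama_cpp.llama_model_default_params(),
)

# tokenize()/detokenize() now route through model.vocab internally, so
# llama_tokenize and llama_token_to_piece receive the vocab handle.
tokens = model.tokenize(b"Hello, world!", add_bos=True, special=False)
print(tokens)
print(model.detokenize(tokens))
print("n_vocab:", model.n_vocab(), "bos:", model.token_bos())
```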
@@ -320,7 +326,8 @@ def get_embeddings(self):

def set_rng_seed(self, seed: int):
# TODO: Fix
llama_cpp.llama_set_rng_seed(self.ctx, seed)
# llama_cpp.llama_set_rng_seed(self.ctx, seed)
raise NotImplementedError("set_rng_seed is not implemented in llama.cpp")

def sample_repetition_penalties(
self,
@@ -331,55 +338,63 @@ def sample_repetition_penalties(
penalty_freq: float,
penalty_present: float,
):
llama_cpp.llama_sample_repetition_penalties(
self.ctx,
llama_cpp.byref(candidates.candidates),
last_tokens_data,
penalty_last_n,
penalty_repeat,
penalty_freq,
penalty_present,
)
# llama_cpp.llama_sample_repetition_penalties(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# last_tokens_data,
# penalty_last_n,
# penalty_repeat,
# penalty_freq,
# penalty_present,
# )
raise NotImplementedError("sample_repetition_penalties is not implemented in llama.cpp")

def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
llama_cpp.llama_sample_softmax(
self.ctx,
llama_cpp.byref(candidates.candidates),
)
# llama_cpp.llama_sample_softmax(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# )
raise NotImplementedError("sample_softmax is not implemented in llama.cpp")

def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
llama_cpp.llama_sample_top_k(
self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep
)
# llama_cpp.llama_sample_top_k(
# self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep
# )
raise NotImplementedError("sample_top_k is not implemented in llama.cpp")

def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
llama_cpp.llama_sample_top_p(
self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
)
# llama_cpp.llama_sample_top_p(
# self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
# )
raise NotImplementedError("sample_top_p is not implemented in llama.cpp")

def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
llama_cpp.llama_sample_min_p(
self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
)
# llama_cpp.llama_sample_min_p(
# self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
# )
raise NotImplementedError("sample_min_p is not implemented in llama.cpp")

def sample_typical(
self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
):
llama_cpp.llama_sample_typical(
self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
)
# llama_cpp.llama_sample_typical(
# self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
# )
raise NotImplementedError("sample_typical is not implemented in llama.cpp")

def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
llama_cpp.llama_sample_temp(
self.ctx, llama_cpp.byref(candidates.candidates), temp
)
# llama_cpp.llama_sample_temp(
# self.ctx, llama_cpp.byref(candidates.candidates), temp
# )
raise NotImplementedError("sample_temp is not implemented in llama.cpp")

def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
llama_cpp.llama_sample_grammar(
self.ctx,
llama_cpp.byref(candidates.candidates),
grammar.grammar,
)
# llama_cpp.llama_sample_grammar(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# grammar.grammar,
# )
raise NotImplementedError("sample_grammar is not implemented in llama.cpp")

def sample_token_mirostat(
self,
@@ -389,14 +404,15 @@ def sample_token_mirostat(
m: int,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
return llama_cpp.llama_sample_token_mirostat(
self.ctx,
llama_cpp.byref(candidates.candidates),
tau,
eta,
m,
mu,
)
raise NotImplementedError("sample_token_mirostat is not implemented in llama.cpp")
# return llama_cpp.llama_sample_token_mirostat(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# tau,
# eta,
# m,
# mu,
# )

def sample_token_mirostat_v2(
self,
@@ -405,29 +421,33 @@ def sample_token_mirostat_v2(
eta: float,
mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
) -> int:
return llama_cpp.llama_sample_token_mirostat_v2(
self.ctx,
llama_cpp.byref(candidates.candidates),
tau,
eta,
mu,
)
raise NotImplementedError("sample_token_mirostat_v2 is not implemented in llama.cpp")
# return llama_cpp.llama_sample_token_mirostat_v2(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# tau,
# eta,
# mu,
# )

def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
return llama_cpp.llama_sample_token_greedy(
self.ctx,
llama_cpp.byref(candidates.candidates),
)
raise NotImplementedError("sample_token_greedy is not implemented in llama.cpp")
# return llama_cpp.llama_sample_token_greedy(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# )

def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
return llama_cpp.llama_sample_token(
self.ctx,
llama_cpp.byref(candidates.candidates),
)
raise NotImplementedError("sample_token is not implemented in llama.cpp")
# return llama_cpp.llama_sample_token(
# self.ctx,
# llama_cpp.byref(candidates.candidates),
# )

# Grammar
def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token)
raise NotImplementedError("grammar_accept_token is not implemented in llama.cpp")
# llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token)

def reset_timings(self):
llama_cpp.llama_perf_context_reset(self.ctx)
@@ -788,7 +808,7 @@ def add_mirostat_v2(self, seed: int, tau: float, eta: float):

def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
sampler = llama_cpp.llama_sampler_init_grammar(
model.model, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8")
model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8")
)
self._add_sampler(sampler)

@@ -842,6 +862,7 @@ def get_seed(self) -> int:

def sample(self, ctx: LlamaContext, idx: int) -> int:
assert self.sampler is not None
assert ctx.ctx is not None
return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx)

def close(self):
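With the legacy `llama_sample_*` wrappers stubbed out to raise `NotImplementedError`, sampling now flows through the llama.cpp sampler-chain API (`llama_sampler_init_grammar` taking `model.vocab`, `llama_sampler_sample` taking the context). A rough sketch of how the wrapper classes in this file fit together: `LlamaModel`, `LlamaContext`, `add_mirostat_v2`, and `sample` appear in the hunks above, while the sampler class name (`LlamaSampler` here), the constructor arguments, and the model path are assumptions:

```python
import llama_cpp
from llama_cpp._internals import LlamaModel, LlamaContext, LlamaSampler

model = LlamaModel(
    path_model="./models/example.gguf",  # placeholder path
    params=llama_cpp.llama_model_default_params(),
)
ctx = LlamaContext(model=model, params=llama_cpp.llama_context_default_params())

# Build a sampler chain instead of calling the removed llama_sample_* helpers.
sampler = LlamaSampler()
sampler.add_mirostat_v2(seed=1234, tau=5.0, eta=0.1)

# In the real flow a batch is decoded into ctx first; llama_sampler_sample then
# picks the next token from the logits at position idx (-1 = last position).
# next_token = sampler.sample(ctx, idx=-1)
```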