Skip to content

Commit 2881a0a

Browse files
Merge pull request #35 from menloresearch/update-dev-from-master-2025-03-29-00-08
Sync master with upstream release b4988
2 parents 5ddd731 + 3714c3e commit 2881a0a

File tree

23 files changed

+1332
-758
lines changed

23 files changed

+1332
-758
lines changed

common/sampling.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
208208
trigger_patterns_c.data(), trigger_patterns_c.size(),
209209
trigger_tokens.data(), trigger_tokens.size())
210210
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
211+
if (!grmr) {
212+
return nullptr;
213+
}
211214
}
212215

213216
auto * result = new common_sampler {

examples/rpc/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1-
add_executable(rpc-server rpc-server.cpp)
2-
target_link_libraries(rpc-server PRIVATE ggml llama)
1+
set(TARGET rpc-server)
2+
add_executable(${TARGET} rpc-server.cpp)
3+
target_link_libraries(${TARGET} PRIVATE ggml)
4+
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/rpc/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,14 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name
7272

7373
This way you can offload model layers to both local and remote devices.
7474

75+
### Local cache
76+
77+
The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
78+
This can speed up model loading significantly, especially when using large models.
79+
To enable the cache, use the `-c` option:
80+
81+
```bash
82+
$ bin/rpc-server -c
83+
```
84+
85+
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.

examples/rpc/rpc-server.cpp

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
#if defined(_MSC_VER)
2+
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3+
#endif
4+
15
#include "ggml-cpu.h"
26

37
#ifdef GGML_USE_CUDA
@@ -18,26 +22,142 @@
1822

1923
#include "ggml-rpc.h"
2024
#ifdef _WIN32
25+
# define DIRECTORY_SEPARATOR '\\'
26+
# include <locale>
2127
# include <windows.h>
28+
# include <fcntl.h>
29+
# include <io.h>
2230
#else
31+
# define DIRECTORY_SEPARATOR '/'
2332
# include <unistd.h>
33+
# include <sys/stat.h>
2434
#endif
35+
#include <codecvt>
2536
#include <string>
2637
#include <stdio.h>
38+
#include <vector>
39+
#include <filesystem>
40+
41+
namespace fs = std::filesystem;
42+
43+
// NOTE: this is copied from common.cpp to avoid linking with libcommon
44+
// returns true if successful, false otherwise
45+
static bool fs_create_directory_with_parents(const std::string & path) {
46+
#ifdef _WIN32
47+
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
48+
std::wstring wpath = converter.from_bytes(path);
49+
50+
// if the path already exists, check whether it's a directory
51+
const DWORD attributes = GetFileAttributesW(wpath.c_str());
52+
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
53+
return true;
54+
}
55+
56+
size_t pos_slash = 0;
57+
58+
// process path from front to back, procedurally creating directories
59+
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
60+
const std::wstring subpath = wpath.substr(0, pos_slash);
61+
const wchar_t * test = subpath.c_str();
62+
63+
const bool success = CreateDirectoryW(test, NULL);
64+
if (!success) {
65+
const DWORD error = GetLastError();
66+
67+
// if the path already exists, ensure that it's a directory
68+
if (error == ERROR_ALREADY_EXISTS) {
69+
const DWORD attributes = GetFileAttributesW(subpath.c_str());
70+
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
71+
return false;
72+
}
73+
} else {
74+
return false;
75+
}
76+
}
77+
78+
pos_slash += 1;
79+
}
80+
81+
return true;
82+
#else
83+
// if the path already exists, check whether it's a directory
84+
struct stat info;
85+
if (stat(path.c_str(), &info) == 0) {
86+
return S_ISDIR(info.st_mode);
87+
}
88+
89+
size_t pos_slash = 1; // skip leading slashes for directory creation
90+
91+
// process path from front to back, procedurally creating directories
92+
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
93+
const std::string subpath = path.substr(0, pos_slash);
94+
struct stat info;
95+
96+
// if the path already exists, ensure that it's a directory
97+
if (stat(subpath.c_str(), &info) == 0) {
98+
if (!S_ISDIR(info.st_mode)) {
99+
return false;
100+
}
101+
} else {
102+
// create parent directories
103+
const int ret = mkdir(subpath.c_str(), 0755);
104+
if (ret != 0) {
105+
return false;
106+
}
107+
}
108+
109+
pos_slash += 1;
110+
}
111+
112+
return true;
113+
#endif // _WIN32
114+
}
115+
116+
// NOTE: this is copied from common.cpp to avoid linking with libcommon
117+
static std::string fs_get_cache_directory() {
118+
std::string cache_directory = "";
119+
auto ensure_trailing_slash = [](std::string p) {
120+
// Make sure to add trailing slash
121+
if (p.back() != DIRECTORY_SEPARATOR) {
122+
p += DIRECTORY_SEPARATOR;
123+
}
124+
return p;
125+
};
126+
if (getenv("LLAMA_CACHE")) {
127+
cache_directory = std::getenv("LLAMA_CACHE");
128+
} else {
129+
#ifdef __linux__
130+
if (std::getenv("XDG_CACHE_HOME")) {
131+
cache_directory = std::getenv("XDG_CACHE_HOME");
132+
} else {
133+
cache_directory = std::getenv("HOME") + std::string("/.cache/");
134+
}
135+
#elif defined(__APPLE__)
136+
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
137+
#elif defined(_WIN32)
138+
cache_directory = std::getenv("LOCALAPPDATA");
139+
#endif // __linux__
140+
cache_directory = ensure_trailing_slash(cache_directory);
141+
cache_directory += "llama.cpp";
142+
}
143+
return ensure_trailing_slash(cache_directory);
144+
}
27145

28146
struct rpc_server_params {
29147
std::string host = "127.0.0.1";
30148
int port = 50052;
31149
size_t backend_mem = 0;
150+
bool use_cache = false;
32151
};
33152

34153
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
35154
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
36155
fprintf(stderr, "options:\n");
37-
fprintf(stderr, " -h, --help show this help message and exit\n");
38-
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
39-
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
40-
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
156+
fprintf(stderr, " -h, --help show this help message and exit\n");
157+
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
158+
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
159+
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
160+
fprintf(stderr, " -c, --cache enable local file cache\n");
41161
fprintf(stderr, "\n");
42162
}
43163

@@ -58,6 +178,8 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
58178
if (params.port <= 0 || params.port > 65535) {
59179
return false;
60180
}
181+
} else if (arg == "-c" || arg == "--cache") {
182+
params.use_cache = true;
61183
} else if (arg == "-m" || arg == "--mem") {
62184
if (++i >= argc) {
63185
return false;
@@ -164,8 +286,20 @@ int main(int argc, char * argv[]) {
164286
} else {
165287
get_backend_memory(&free_mem, &total_mem);
166288
}
167-
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
168-
ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
289+
const char * cache_dir = nullptr;
290+
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
291+
if (params.use_cache) {
292+
if (!fs_create_directory_with_parents(cache_dir_str)) {
293+
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
294+
return 1;
295+
}
296+
cache_dir = cache_dir_str.c_str();
297+
}
298+
printf("Starting RPC server\n");
299+
printf(" endpoint : %s\n", endpoint.c_str());
300+
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
301+
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
302+
ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
169303
ggml_backend_free(backend);
170304
return 0;
171305
}

examples/server/server.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,8 +489,12 @@ struct result_timings {
489489
double predicted_per_token_ms;
490490
double predicted_per_second;
491491

492+
// Optional speculative metrics - only included when > 0
493+
int32_t draft_n = 0;
494+
int32_t draft_n_accepted = 0;
495+
492496
json to_json() const {
493-
return {
497+
json base = {
494498
{"prompt_n", prompt_n},
495499
{"prompt_ms", prompt_ms},
496500
{"prompt_per_token_ms", prompt_per_token_ms},
@@ -501,6 +505,13 @@ struct result_timings {
501505
{"predicted_per_token_ms", predicted_per_token_ms},
502506
{"predicted_per_second", predicted_per_second},
503507
};
508+
509+
if (draft_n > 0) {
510+
base["draft_n"] = draft_n;
511+
base["draft_n_accepted"] = draft_n_accepted;
512+
}
513+
514+
return base;
504515
}
505516
};
506517

@@ -1299,6 +1310,10 @@ struct server_slot {
12991310

13001311
std::function<void(int)> callback_on_release;
13011312

1313+
// Speculative decoding stats
1314+
int32_t n_draft_total = 0; // Total draft tokens generated
1315+
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
1316+
13021317
void reset() {
13031318
SLT_DBG(*this, "%s", "\n");
13041319

@@ -1315,6 +1330,10 @@ struct server_slot {
13151330

13161331
generated_tokens.clear();
13171332
generated_token_probs.clear();
1333+
1334+
// clear speculative decoding stats
1335+
n_draft_total = 0;
1336+
n_draft_accepted = 0;
13181337
}
13191338

13201339
bool is_non_causal() const {
@@ -1381,6 +1400,12 @@ struct server_slot {
13811400
timings.predicted_per_token_ms = t_token_generation / n_decoded;
13821401
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
13831402

1403+
// Add speculative metrics
1404+
if (n_draft_total > 0) {
1405+
timings.draft_n = n_draft_total;
1406+
timings.draft_n_accepted = n_draft_accepted;
1407+
}
1408+
13841409
return timings;
13851410
}
13861411

@@ -1428,6 +1453,15 @@ struct server_slot {
14281453
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
14291454
t_token_generation, n_decoded, t_gen, n_gen_second,
14301455
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
1456+
1457+
if (n_draft_total > 0) {
1458+
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
1459+
SLT_INF(*this,
1460+
"\n"
1461+
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
1462+
draft_ratio, n_draft_accepted, n_draft_total
1463+
);
1464+
}
14311465
}
14321466

14331467
json to_json() const {
@@ -3290,6 +3324,9 @@ struct server_context {
32903324

32913325
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
32923326

3327+
// keep track of total number of tokens generated in the draft
3328+
slot.n_draft_total += draft.size();
3329+
32933330
// ignore small drafts
32943331
if (slot.params.speculative.n_min > (int) draft.size()) {
32953332
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3315,6 +3352,9 @@ struct server_context {
33153352
slot.n_past += ids.size();
33163353
slot.n_decoded += ids.size();
33173354

3355+
// update how many tokens out of draft was accepted
3356+
slot.n_draft_accepted += ids.size() - 1;
3357+
33183358
slot.cache_tokens.push_back(id);
33193359
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
33203360

ggml/include/ggml-rpc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
1717

1818
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
1919

20-
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
20+
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
21+
const char * cache_dir,
22+
size_t free_mem, size_t total_mem);
2123

2224
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
2325

ggml/include/ggml.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,11 +1791,11 @@ extern "C" {
17911791

17921792
#define GGML_KQ_MASK_PAD 64
17931793

1794-
// q: [n_embd, n_batch, n_head, 1]
1795-
// k: [n_embd, n_kv, n_head_kv, 1]
1796-
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1797-
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1798-
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
1794+
// q: [n_embd_k, n_batch, n_head, 1]
1795+
// k: [n_embd_k, n_kv, n_head_kv, 1]
1796+
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
1797+
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1798+
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
17991799
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
18001800
struct ggml_context * ctx,
18011801
struct ggml_tensor * q,

0 commit comments

Comments (0)