
Commit c50af8e

Merge pull request #30 from menloresearch/update-dev-from-master-2025-03-26-00-08
Sync master with upstream release b4958
2 parents: 1646396 + ef19c71

10 files changed: +112 additions, -47 deletions

ci/README.md

Lines changed: 39 additions & 0 deletions
````diff
@@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with MUSA support
+GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
+
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc3.1.1-devel-ubuntu22.04
 ```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y cmake git python3.10-venv wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
````
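The three steps above can also be collapsed into a single non-interactive invocation. The following is only a sketch that recombines the commands already shown (same image tag, mounts and paths as in the diff):

```bash
# Sketch: non-interactive variant of steps 1-3 above, reusing the same mounts and image.
mkdir -p $HOME/llama.cpp/ci-cache $HOME/llama.cpp/ci-results
docker run --privileged -v $HOME/llama.cpp/ci-cache:/ci-cache -v $HOME/llama.cpp/ci-results:/ci-results -v $PWD:/ws -w /ws \
    mthreads/musa:rc3.1.1-devel-ubuntu22.04 \
    bash -c 'apt update -y && apt install -y cmake git python3.10-venv wget && git config --global --add safe.directory /ws && GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache'
```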

ci/run.sh

Lines changed: 14 additions & 2 deletions
````diff
@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with MUSA support
+# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -52,13 +55,22 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         echo "source /opt/intel/oneapi/setvars.sh"
         exit 1
     fi
-
+    # Use only main GPU
+    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+    # Enable sysman for correct memory reporting
+    export ZES_ENABLE_SYSMAN=1
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
+
+if [ ! -z ${GG_BUILD_MUSA} ]; then
+    # Use qy1 by default (MTT S80)
+    MUSA_ARCH=${MUSA_ARCH:-21}
+    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
 
 ## helpers
 
 # download a file if it does not exist or if it is outdated
@@ -808,7 +820,7 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
     rm -rf ${SRC}/models-mnt
     mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
````
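Because the script only assigns a default (`MUSA_ARCH=${MUSA_ARCH:-21}`), the target architecture can be overridden from the environment when launching the CI; the value below is purely illustrative:

```bash
# Hypothetical: build for a different MUSA architecture than the default qy1 (21).
MUSA_ARCH=22 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```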

docs/build.md

Lines changed: 20 additions & 0 deletions
````diff
@@ -435,6 +435,26 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
 
 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
 
+## Arm® KleidiAI™
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the output will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
+
 ## Android
 
 To read documentation for how to build on Android, [click here](./android.md)
````
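The two run-time switches mentioned in the added documentation can be combined; a sketch (PATH_TO_MODEL is a placeholder, and the SME variable only has an effect on SME-capable hardware):

```bash
# Sketch: force the CPU backend and opt in to the SME microkernels described above.
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```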

examples/run/run.cpp

Lines changed: 10 additions & 26 deletions
````diff
@@ -38,24 +38,6 @@
 }
 #endif
 
-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    const int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::string buf;
-    buf.resize(size);
-    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-
-    return buf;
-}
-
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static int printe(const char * fmt, ...) {
     va_list args;
@@ -525,11 +507,11 @@ class HttpClient {
         int secs = static_cast<int>(seconds) % 60;
 
         if (hrs > 0) {
-            return fmt("%dh %02dm %02ds", hrs, mins, secs);
+            return string_format("%dh %02dm %02ds", hrs, mins, secs);
         } else if (mins > 0) {
-            return fmt("%dm %02ds", mins, secs);
+            return string_format("%dm %02ds", mins, secs);
         } else {
-            return fmt("%ds", secs);
+            return string_format("%ds", secs);
         }
     }
 
@@ -544,7 +526,7 @@ class HttpClient {
             }
         }
 
-        return fmt("%.2f %s", dbl_size, suffix[i]);
+        return string_format("%.2f %s", dbl_size, suffix[i]);
     }
 
     static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +560,9 @@ class HttpClient {
         return (now_downloaded_plus_file_size * 100) / total_to_download;
     }
 
-    static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+    static std::string generate_progress_prefix(curl_off_t percentage) {
+        return string_format("%3ld%% |", static_cast<long int>(percentage));
+    }
 
     static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
         const auto now = std::chrono::steady_clock::now();
@@ -589,9 +573,9 @@ class HttpClient {
     static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
                                                 double speed, double estimated_time) {
         const int width = 10;
-        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
-                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
-                   human_readable_time(estimated_time).c_str());
+        return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+                             width, human_readable_size(total_to_download).c_str(), width,
+                             human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
     }
 
     static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
````

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
````diff
@@ -359,9 +359,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
+        set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
````

ggml/src/ggml-cpu/kleidiai/kernels.cpp

Lines changed: 2 additions & 7 deletions
````diff
@@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
     },
     /* .lhs_info = */ {
-        /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-        /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
+        /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
+        /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
         /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
-        /* .require_aligned_m_idx = */ true,
     },
     /* .rhs_info = */ {
         /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
         /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        /* .require_aligned_m_idx = */ false,
     },
     /* .rhs_info = */ {
         /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
         /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        /* .require_aligned_m_idx = */ false,
     },
     /* .rhs_info = */ {
         /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
         /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        /* .require_aligned_m_idx = */ false,
     },
     /* .rhs_info = */ {
         /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
         /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-        /* .require_aligned_m_idx = */ false,
     },
     /* .rhs_info = */ {
         /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
````

ggml/src/ggml-cpu/kleidiai/kernels.h

Lines changed: 0 additions & 1 deletion
````diff
@@ -40,7 +40,6 @@ struct lhs_packing_info {
     size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
     void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
                       size_t lhs_stride, void* lhs_packed);
-    bool require_aligned_m_idx;
 };
 
 struct rhs_packing_info {
````

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

Lines changed: 3 additions & 4 deletions
````diff
@@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         size_t sr = kernel->get_sr();
 
         // Calculate number of columns to be processed per thread
-        const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
-        const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
+        const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
         const size_t m_start = ith * num_m_per_thread;
         size_t m_to_process = num_m_per_thread;
         if ((m_start + m_to_process) > m) {
@@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         if(m_start < m) {
             // Transform LHS
             const size_t src_stride = src1->nb[1];
-            const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
+            const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
             const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
             void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
 
-            lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
+            lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
         }
 
         ggml_barrier(params->threadpool);
````
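A rough worked example of the new split (assuming `kai_roundup(a, b)` rounds `a` up to the next multiple of `b`): with m = 7 rows, mr = 4 and nth = 4 threads, `num_m_per_thread` becomes kai_roundup(7, 16) / 4 = 4, so thread 0 packs rows 0-3, thread 1 packs the remaining rows 4-6, and threads 2-3 fall outside `m` and do nothing. Because every thread's `m_start` is now a multiple of `mr`, its packed offset lands on a block boundary, which is presumably why `pack_func` can always be called with an `m_idx_start` of 0 and why the `require_aligned_m_idx` flag could be dropped.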

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 1 addition & 1 deletion
````diff
@@ -191,7 +191,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
````
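This only flips the default: the SYCL optimization path is now off unless the variable is set explicitly. A hypothetical invocation restoring the previous behaviour (model path and prompt are placeholders):

```bash
# Hypothetical: explicitly re-enable the SYCL optimization path that this commit disables by default.
GGML_SYCL_DISABLE_OPT=0 ./build/bin/llama-cli -m PATH_TO_MODEL -p "Hello"
```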

src/llama-context.cpp

Lines changed: 21 additions & 4 deletions
````diff
@@ -294,10 +294,7 @@ llama_context::llama_context(
         // TODO: something cleaner
         const auto n_outputs_save = n_outputs;
 
-        // max number of outputs
-        n_outputs = n_tokens;
-
-        LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
 
         int n_splits_pp = -1;
         int n_nodes_pp = -1;
@@ -313,8 +310,15 @@ llama_context::llama_context(
         // reserve pp graph first so that buffers are only allocated once
         {
             llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            // max number of outputs
+            n_outputs = ubatch_pp.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -326,20 +330,33 @@ llama_context::llama_context(
         // reserve with tg graph to get the number of splits and nodes
         {
             llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            n_outputs = ubatch_tg.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute tg buffers");
             }
+
             n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
             n_nodes_tg = ggml_graph_n_nodes(gf);
         }
 
         // reserve again with pp graph to avoid ggml-alloc reallocations during inference
         {
             llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+
+            n_outputs = ubatch_pp.n_tokens;
+
+            LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
+
             auto * gf = graph_init();
             graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
+
             if (!ggml_backend_sched_reserve(sched.get(), gf)) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
````
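In short, instead of setting `n_outputs` once up front, each reservation now sets it from its own ubatch (`n_tokens` for the prompt-processing graphs, 1 for the token-generation graph) immediately before building, so every graph is reserved against its own worst-case output count and the debug log reports the values actually used for that reservation.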
