Skip to content

Commit 6006f68

Browse files
ericcurtin
authored and p1-0tr committed
Set ngl to 999 by default to align with upstream
Upstream now defaults to 999, which means "use the maximum available number of layers". We could try to rely on the upstream defaults, but there is one case where we manually set the value to 0, which makes that tricky, so aligning is better for now. Signed-off-by: Eric Curtin <[email protected]>
1 parent 620fba8 commit 6006f68

File tree

4 files changed

+12
-12
lines changed

4 files changed

+12
-12
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ help:
7575
@echo " help - Show this help message"
7676
@echo ""
7777
@echo "Backend configuration options:"
78-
@echo " LLAMA_ARGS - Arguments for llama.cpp (e.g., \"--verbose --jinja -ngl 100 --ctx-size 2048\")"
78+
@echo " LLAMA_ARGS - Arguments for llama.cpp (e.g., \"--verbose --jinja -ngl 999 --ctx-size 2048\")"
7979
@echo ""
8080
@echo "Example usage:"
81-
@echo " make run LLAMA_ARGS=\"--verbose --jinja -ngl 100 --ctx-size 2048\""
82-
@echo " make docker-run LLAMA_ARGS=\"--verbose --jinja -ngl 100 --threads 4 --ctx-size 2048\""
81+
@echo " make run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --ctx-size 2048\""
82+
@echo " make docker-run LLAMA_ARGS=\"--verbose --jinja -ngl 999 --threads 4 --ctx-size 2048\""

pkg/inference/backends/llamacpp/llamacpp.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string,
256256
if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" && mdlConfig.Quantization != "Q4_0" {
257257
ngl = 0 // only Q4_0 models can be accelerated on Adreno
258258
}
259-
ngl = 100
259+
ngl = 999
260260
}
261261

262262
// TODO(p1-0tr): for now assume we are running on GPU (single one) - Devices[1];

pkg/inference/backends/llamacpp/llamacpp_config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ type Config struct {
1818

1919
// NewDefaultLlamaCppConfig creates a new LlamaCppConfig with default values.
2020
func NewDefaultLlamaCppConfig() *Config {
21-
args := append([]string{"--jinja", "-ngl", "100", "--metrics"})
21+
args := append([]string{"--jinja", "-ngl", "999", "--metrics"})
2222

2323
// Special case for Windows ARM64
2424
if runtime.GOOS == "windows" && runtime.GOARCH == "arm64" {

pkg/inference/backends/llamacpp/llamacpp_config_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ func TestNewDefaultLlamaCppConfig(t *testing.T) {
3232
if nglIndex+1 >= len(config.Args) {
3333
t.Error("No value found after -ngl argument")
3434
}
35-
if config.Args[nglIndex+1] != "100" {
36-
t.Errorf("Expected -ngl value to be 100, got %s", config.Args[nglIndex+1])
35+
if config.Args[nglIndex+1] != "999" {
36+
t.Errorf("Expected -ngl value to be 999, got %s", config.Args[nglIndex+1])
3737
}
3838

3939
// Test Windows ARM64 specific case
@@ -87,7 +87,7 @@ func TestGetArgs(t *testing.T) {
8787
},
8888
expected: []string{
8989
"--jinja",
90-
"-ngl", "100",
90+
"-ngl", "999",
9191
"--metrics",
9292
"--model", modelPath,
9393
"--host", socket,
@@ -102,7 +102,7 @@ func TestGetArgs(t *testing.T) {
102102
},
103103
expected: []string{
104104
"--jinja",
105-
"-ngl", "100",
105+
"-ngl", "999",
106106
"--metrics",
107107
"--model", modelPath,
108108
"--host", socket,
@@ -121,7 +121,7 @@ func TestGetArgs(t *testing.T) {
121121
},
122122
expected: []string{
123123
"--jinja",
124-
"-ngl", "100",
124+
"-ngl", "999",
125125
"--metrics",
126126
"--model", modelPath,
127127
"--host", socket,
@@ -143,7 +143,7 @@ func TestGetArgs(t *testing.T) {
143143
},
144144
expected: []string{
145145
"--jinja",
146-
"-ngl", "100",
146+
"-ngl", "999",
147147
"--metrics",
148148
"--model", modelPath,
149149
"--host", socket,
@@ -162,7 +162,7 @@ func TestGetArgs(t *testing.T) {
162162
},
163163
expected: []string{
164164
"--jinja",
165-
"-ngl", "100",
165+
"-ngl", "999",
166166
"--metrics",
167167
"--model", modelPath,
168168
"--host", socket,

0 commit comments

Comments
 (0)