kvcache-ai · RaQiu · Feb 12, 2026 · Feb 12, 2026 · Feb 11, 2026 · Feb 11, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__
+/.cache/
 build
 .vscode
 *.so
@@ -30,4 +31,5 @@ build*
 CMakeFiles/
 kvc2/
 sched/
-*.png
+*.png
+paper_figures_work/
diff --git a/.tmp_arxiv_mcp_probe.js b/.tmp_arxiv_mcp_probe.js
@@ -0,0 +1,110 @@
+const { spawn } = require('child_process');
+
+const env = {
+  ...process.env,
+  WORK_DIR: '/Users/qr/Documents/papers/arxiv',
+};
+
+const child = spawn('node', ['/Users/qr/.codex/mcp-arxiv/run-arxiv-mcp-wrapper.mjs'], {
+  env,
+  stdio: ['pipe', 'pipe', 'pipe'],
+});
+
+let nextId = 1;
+let buf = Buffer.alloc(0);
+const pending = new Map();
+
+function send(msg) {
+  const s = JSON.stringify(msg);
+  const payload = `Content-Length: ${Buffer.byteLength(s)}\r\n\r\n${s}`;
+  child.stdin.write(payload);
+}
+
+function req(method, params) {
+  const id = nextId++;
+  send({ jsonrpc: '2.0', id, method, params });
+  return new Promise((resolve, reject) => {
+    pending.set(id, { resolve, reject });
+    setTimeout(() => {
+      if (pending.has(id)) {
+        pending.delete(id);
+        reject(new Error(`timeout ${method}`));
+      }
+    }, 15000);
+  });
+}
+
+function parse() {
+  while (true) {
+    const s = buf.toString('utf8');
+    const i = s.indexOf('\r\n\r\n');
+    if (i === -1) return;
+    const header = s.slice(0, i);
+    const m = header.match(/Content-Length:\s*(\d+)/i);
+    if (!m) {
+      const nl = s.indexOf('\n');
+      if (nl === -1) return;
+      console.error('NOISE:', s.slice(0, nl).trim());
+      buf = Buffer.from(s.slice(nl + 1), 'utf8');
+      continue;
+    }
+    const len = Number(m[1]);
+    const total = i + 4 + len;
+    if (buf.length < total) return;
+    const body = buf.slice(i + 4, total).toString('utf8');
+    buf = buf.slice(total);
+    let msg;
+    try {
+      msg = JSON.parse(body);
+    } catch (e) {
+      console.error('JSONERR:', body);
+      continue;
+    }
+    if (msg.id && pending.has(msg.id)) {
+      const p = pending.get(msg.id);
+      pending.delete(msg.id);
+      if (msg.error) {
+        p.reject(new Error(JSON.stringify(msg.error)));
+      } else {
+        p.resolve(msg.result);
+      }
+    } else {
+      console.log('UNSOLICITED', JSON.stringify(msg));
+    }
+  }
+}
+
+child.stdout.on('data', (c) => {
+  buf = Buffer.concat([buf, c]);
+  parse();
+});
+
+child.stderr.on('data', (c) => process.stderr.write(`STDERR:${c.toString()}`));
+
+child.on('exit', (code) => {
+  console.error('EXIT', code);
+});
+
+(async () => {
+  try {
+    const init = await req('initialize', {
+      protocolVersion: '2024-11-05',
+      capabilities: {},
+      clientInfo: { name: 'probe', version: '0.1' },
+    });
+    console.log('INIT_OK');
+    console.log(JSON.stringify(init, null, 2));
+
+    send({ jsonrpc: '2.0', method: 'notifications/initialized', params: {} });
+
+    const tools = await req('tools/list', {});
+    console.log('TOOLS_OK');
+    console.log(JSON.stringify(tools, null, 2));
+  } catch (e) {
+    console.error('ERR', e.message);
+    process.exitCode = 1;
+  } finally {
+    child.kill('SIGTERM');
+    setTimeout(() => child.kill('SIGKILL'), 1000);
+  }
+})();
diff --git a/archive/kt-sft/pyproject.toml b/archive/kt-sft/pyproject.toml
@@ -35,6 +35,8 @@ dependencies = [
   "protobuf",
   "datasets",
   "torchviz",
+  "triton>=2.0.0; sys_platform != 'win32'",
+  "triton-windows>=3.1.0; sys_platform == 'win32'",
 ]
 
 requires-python = ">=3.10"
@@ -61,7 +63,8 @@ classifiers = [
   "Development Status :: 4 - Beta",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
-  "Programming Language :: Python :: 3.12"
+  "Programming Language :: Python :: 3.12",
+  "Operating System :: Microsoft :: Windows",
 ]
 
 [project.urls]

diff --git a/archive/kt-sft/setup.py b/archive/kt-sft/setup.py
@@ -58,6 +58,8 @@ def _load_pyproject_deps():
     triton_dep = [
         "pytorch-triton-xpu==3.3.0"
     ]
+elif sys.platform == "win32":
+    triton_dep = ["triton-windows>=3.1.0"]
 else:
     triton_dep = []
 

diff --git a/doc/en/DeepSeek-V4-Flash.md b/doc/en/DeepSeek-V4-Flash.md
@@ -184,5 +184,3 @@ The `kt` CLI ships with an OpenAI-compatible chat client that talks to the SGLan
 ```bash
 kt chat --host 127.0.0.1 --port 30000 --temperature 0.7 --max-tokens 2048
 ```
-
-
diff --git a/doc/weekly_reports/2026-04-02-amxint4-24g-gpu32-report.md b/doc/weekly_reports/2026-04-02-amxint4-24g-gpu32-report.md
@@ -0,0 +1,176 @@
+# AMXINT4 Tiered Report: 24G + GPU Experts 32
+
+## Selected Configuration
+
+- Model path: `/mnt/fr0/qwen_copy_test`
+- CPU weight path: `/mnt/fr0/qwen_copy_test-AMXINT4-rerun-aftertpfix-20260402-143950`
+- Method: `AMXINT4`
+- Weight strategy: `tiered`
+- CPU threads: `64`
+- Threadpool count: `2`
+- GPU experts: `32`
+- Resident experts: `256`
+- CPU memory budget: `24G`
+- GPU experts update: enabled
+
+## Why This Config Was Selected
+
+With the 24G CPU memory budget, `gpu_experts=32` was the highest stable GPU-expert configuration; GPU-memory failures (status `fail_ready` below) started at `gpu_experts=64` and above.
+
+| gpu_experts | status | avg tok/s | median | min | max |
+| --- | --- | ---: | ---: | ---: | ---: |
+| 2 | success | 33.068 | 41.084 | 6.354 | 44.734 |
+| 4 | success | 39.962 | 48.575 | 8.264 | 50.230 |
+| 8 | success | 41.531 | 50.381 | 10.663 | 51.681 |
+| 16 | success | 43.174 | 50.378 | 14.550 | 52.698 |
+| 32 | success | 45.061 | 52.478 | 14.837 | 54.014 |
+| 64 | fail_ready | - | - | - | - |
+| 128 | fail_ready | - | - | - | - |
+| 256 | fail_ready | - | - | - | - |
+
+## Five Responses Captured
+
+The service was queried with these 5 prompts:
+
+1. `Count from 1 to 40 separated by commas only.`
+2. `Explain mmap in 5 short sentences.`
+3. `Give 8 concise bullet points about NUMA optimization.`
+4. `Summarize BF16 inference in about 60 tokens.`
+5. `Compare tiered and legacy loading in concise prose.`
+
+Observed response heads:
+
+1. `count`
+   Raw head:
+   `Thinking Process: ... Task: Count from 1 to 40 ... Sequence: 1, 2, 3, ...`
+
+2. `mmap`
+   Raw head:
+   `Thinking Process: ... Topic: mmap ... Maps a file or device into memory ...`
+
+3. `numa`
+   Raw head:
+   `Thinking Process: ... Topic: NUMA optimization ... Memory placement / local vs remote ...`
+
+4. `bf16`
+   Raw head:
+   `Thinking Process: ... Topic: BF16 inference ... Similar FP32 range ...`
+
+5. `compare`
+   Raw head:
+   `Thinking Process: ... Compare tiered loading and legacy loading ...`
+
+Note: the outputs are now semantically correct, but the model/template still emits a `Thinking Process` preamble before the final answer.
+
+## Single-Prompt Split
+
+This table is for the 5 standard prompts above under `24G + gpu_experts=32`.
+
+Definitions:
+
+- `ttft_s`: time to first token
+- `prefill_tok_s`: `prompt_tokens / ttft`
+- `decode_tok_s`: `completion_tokens / decode_window`
+- `e2e_tok_s`: `completion_tokens / total_request_time`
+
+| prompt | prompt_tokens | completion_tokens | total_s | ttft_s | decode_s | prefill tok/s | decode tok/s | e2e tok/s |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| count | 23 | 64 | 3.465 | 0.157 | 1.074 | 146.676 | 59.586 | 18.469 |
+| mmap | 19 | 64 | 1.250 | 0.144 | 1.009 | 132.032 | 63.434 | 51.188 |
+| numa | 21 | 64 | 1.198 | 0.146 | 1.011 | 144.223 | 63.312 | 53.442 |
+| bf16 | 24 | 64 | 1.196 | 0.146 | 1.006 | 164.711 | 63.637 | 53.519 |
+| compare | 20 | 64 | 1.185 | 0.145 | 1.007 | 138.394 | 63.548 | 54.028 |
+
+## Aggregate Split
+
+This is the aggregate view across the 5 prompts above.
+
+| metric | value |
+| --- | ---: |
+| avg e2e tok/s | 46.129 |
+| avg prefill tok/s | 145.207 |
+| avg decode tok/s | 62.703 |
+| total prompt tokens | 107 |
+| total completion tokens | 320 |
+| overall prefill tok/s | 144.986 |
+| overall decode tok/s | 62.659 |
+| overall e2e tok/s | 38.582 |
+
+Interpretation:
+
+- The first request is still much slower than the other 4.
+- Once in steady decode, the service stays around `~63 tok/s`.
+- End-to-end average is lower because TTFT and the first request dominate.
+
+## Split Tables
+
+The split metrics below use two views:
+
+- **Single 5-prompt average**: average of per-prompt `prefill_tok_s` / `decode_tok_s`
+- **Overall aggregate**: total prompt tokens divided by total TTFT, and total completion tokens divided by total decode window
+
+### A. Budget Sweep (`gpu_experts=0`)
+
+| budget | avg prefill tok/s | avg decode tok/s | overall prefill tok/s | overall decode tok/s |
+| --- | ---: | ---: | ---: | ---: |
+| 40G | 144.502 | 60.307 | 144.399 | 60.309 |
+| 32G | 146.111 | 60.274 | 146.175 | 60.264 |
+| 24G | 144.142 | 60.790 | 144.205 | 60.779 |
+
+### B. GPU Experts Sweep (fixed `24G`)
+
+| gpu_experts | avg prefill tok/s | avg decode tok/s | overall prefill tok/s | overall decode tok/s |
+| --- | ---: | ---: | ---: | ---: |
+| 2 | 145.816 | 61.173 | 145.975 | 61.150 |
+| 4 | 146.119 | 61.526 | 145.975 | 61.503 |
+| 8 | 143.841 | 60.810 | 143.817 | 60.802 |
+| 16 | 144.580 | 62.779 | 144.790 | 62.782 |
+| 32 | 145.434 | 62.723 | 145.380 | 62.684 |
+
+### How to read these two tables
+
+- Budget changes (`40G -> 24G`) barely affect steady-state decode. The decode line stays around `~60-61 tok/s`.
+- The bigger effect of lower budget is not steady decode itself, but slower first-request / TTFT behavior.
+- Adding GPU experts under `24G` does not materially change prefill throughput.
+- Decode throughput does improve a bit with more GPU experts, from `~61.2` to `~62.7 tok/s`, but the gain is incremental rather than dramatic.
+
+## Cold-State SSD and GPU Metrics
+
+Cold-state procedure:
+
+1. stop service
+2. `drop_caches`
+3. start service
+4. measure:
+   - startup -> ready
+   - ready -> 5 hard prompts
+
+Hard-prompt result under `24G` (this cold-state probe was run with `gpu_experts=0`, not `gpu_experts=32`):
+
+| metric | value |
+| --- | ---: |
+| avg tok/s | 43.157 |
+| startup SSD read avg (MiB/s) | 349.069 |
+| prompt-window SSD read avg (MiB/s) | 353.288 |
+| prompt-window SSD read peak (MiB/s) | 1043.578 |
+| prompt-window SSD write avg/peak (MiB/s) | 0 / 0 |
+| GPU experts | 0 |
+| GPU util avg/max (%) | 25.517 / 52 |
+| GPU RX avg/max (MB/s) | 112.931 / 196 |
+| GPU TX avg/max (MB/s) | 56.310 / 96 |
+| memory page read-in avg/peak (MiB/s) | 757.988 / 2086.702 |
+| memory page write-out avg/peak (MiB/s) | 0.010 / 0.082 |
+
+Important:
+
+- The standard-prompt results in the earlier sections use `gpu_experts=32`.
+- The cold-state hard-prompt SSD probe in this section was completed earlier and was run with `gpu_experts=0`.
+- It still answers the question “does this scheme consume SSD throughput in the cold state?”, and the answer is yes.
+
+## Bottom Line
+
+- `24G + gpu_experts=32` is the strongest stable GPU-expert point tested so far.
+- Standard-prompt steady decode is about `~63 tok/s`.
+- End-to-end average is lower because the first request is much slower.
+- Cold-state startup and prompt window both consume significant SSD bandwidth.
+- Hot-state work windows can show near-zero incremental block IO, but cold-state windows clearly do not.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -184,5 +184,3 @@ The `kt` CLI ships with an OpenAI-compatible chat client that talks to the SGLan
		```bash
		kt chat --host 127.0.0.1 --port 30000 --temperature 0.7 --max-tokens 2048
		```