/*
 * bench.c — Ternary MatVec / Transformer benchmark (472 lines, 17.6 KB).
 * Note: web-page chrome and a scraped line-number gutter were removed here
 * during review; the code below is the file's actual content.
 */
#include "ternary.h"
#include "transformer.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#if defined(__APPLE__)
#ifndef ACCELERATE_NEW_LAPACK
#define ACCELERATE_NEW_LAPACK
#endif
#include <Accelerate/Accelerate.h>
#define HAS_BLAS 1
#endif
// ===================================================================
// Timing helpers
// ===================================================================

/* Current monotonic-clock timestamp in nanoseconds, as a double. */
static double now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return 1e9 * (double)ts.tv_sec + (double)ts.tv_nsec;
}
// ===================================================================
// Random data generation
// ===================================================================

/* Fill buf[0..n) with pseudo-random values in [-0.5, 0.5], deterministic
 * for a given seed (reseeds rand() on every call). */
static void fill_random(float *buf, int n, unsigned seed) {
    srand(seed);
    for (int idx = 0; idx < n; idx++) {
        float u = (float)rand() / (float)RAND_MAX; // uniform in [0, 1]
        buf[idx] = u - 0.5f;                       // center around zero
    }
}
// ===================================================================
// Correctness check
// ===================================================================

/* Largest element-wise absolute difference between a[] and b[] over n
 * entries; 0.0 for n <= 0. Computed in double for accuracy. */
static double max_abs_diff(const float *a, const float *b, int n) {
    double worst = 0.0;
    for (int i = 0; i < n; i++) {
        double delta = fabs((double)a[i] - (double)b[i]);
        worst = (delta > worst) ? delta : worst;
    }
    return worst;
}
// ===================================================================
// Benchmarking
// ===================================================================
// Summary statistics for one benchmarked kernel (produced by bench_fn).
typedef struct {
double total_ns;   // sum of all timed iterations, in nanoseconds
int iterations;    // number of timed iterations (warmup excluded)
double mean_us;    // mean time per iteration, in microseconds
double median_us;  // median time per iteration, in microseconds
} TimingResult;
/* qsort comparator for double values, ascending order. */
static int cmp_double(const void *a, const void *b) {
    const double x = *(const double *)a;
    const double y = *(const double *)b;
    if (x < y) return -1;
    if (x > y) return 1;
    return 0;
}
/*
 * Time `fn(x, W, out)`: run `warmup` untimed iterations, then `iterations`
 * timed ones, and return total/mean/median statistics (times in µs, total
 * in ns). Fix vs. original: the malloc result was unchecked (NULL deref on
 * OOM) and iterations <= 0 caused division by zero plus an out-of-bounds
 * read of times[0]; both now abort with a diagnostic instead.
 */
static TimingResult bench_fn(void (*fn)(const float *, const TernaryMatrix *,
                                        float *),
                             const float *x, const TernaryMatrix *W,
                             float *out, int warmup, int iterations) {
    if (iterations <= 0) {
        fprintf(stderr, "bench_fn: iterations must be positive\n");
        exit(EXIT_FAILURE);
    }
    // Warmup (untimed)
    for (int i = 0; i < warmup; i++) fn(x, W, out);
    double *times = malloc((size_t)iterations * sizeof *times);
    if (!times) {
        fprintf(stderr, "bench_fn: out of memory\n");
        exit(EXIT_FAILURE);
    }
    double total = 0.0;
    for (int i = 0; i < iterations; i++) {
        double t0 = now_ns();
        fn(x, W, out);
        double t1 = now_ns();
        times[i] = (t1 - t0) / 1e3; // ns → µs
        total += times[i];
    }
    // Sort per-iteration times to extract the median.
    qsort(times, (size_t)iterations, sizeof(double), cmp_double);
    TimingResult r = {
        .total_ns = total * 1e3,
        .iterations = iterations,
        .mean_us = total / iterations,
        .median_us = times[iterations / 2],
    };
    free(times);
    return r;
}
// Wrapper for FP32 matvec (different signature)
// bench_fn only passes (x, W, out), so the FP32 weight pointer and the
// matrix dimensions travel through this file-scope global instead.
typedef struct {
const float *W;      // FP32 weight matrix (not owned by this struct)
int in_dim, out_dim; // matvec dimensions: out[out_dim] = W · x[in_dim]
} FP32Ctx;
static FP32Ctx g_fp32_ctx; // set by run_benchmark before each timing run
static void fp32_wrapper(const float *x, const TernaryMatrix *unused,
float *out) {
(void)unused;
matmul_fp32(out, x, g_fp32_ctx.W, g_fp32_ctx.in_dim, g_fp32_ctx.out_dim);
}
#if HAS_BLAS
/* BLAS baseline adapter for bench_fn: out = 1.0 · W · x + 0.0 · out via
 * Accelerate's cblas_sgemv, with W and dimensions taken from g_fp32_ctx.
 * The TernaryMatrix argument is ignored. */
static void blas_wrapper(const float *x, const TernaryMatrix *unused,
                         float *out) {
    (void)unused;
    const FP32Ctx *ctx = &g_fp32_ctx;
    cblas_sgemv(CblasRowMajor, CblasNoTrans,
                ctx->out_dim, ctx->in_dim,
                1.0f, ctx->W, ctx->in_dim,
                x, 1, 0.0f, out, 1);
}
#endif
// Wrapper for naive ternary (different signature)
// Context for naive_wrapper: unpacked INT8 ternary weights plus the shared
// dequantization scale, passed through a global for bench_fn's signature.
typedef struct {
const int8_t *w;     // unpacked ternary weights, one int8 each (not owned)
float scale;         // dequantization scale from quantize_absmean
int in_dim, out_dim; // matvec dimensions
} NaiveCtx;
static NaiveCtx g_naive_ctx; // set by run_benchmark before each timing run
static void naive_wrapper(const float *x, const TernaryMatrix *unused,
float *out) {
(void)unused;
ternary_matvec_naive(x, g_naive_ctx.w, out, g_naive_ctx.in_dim,
g_naive_ctx.out_dim, g_naive_ctx.scale);
}
// ===================================================================
// Main benchmark
// ===================================================================

// Abort-on-OOM allocation wrappers. Fix vs. original: every malloc/calloc
// in run_benchmark was unchecked, so an allocation failure would crash
// inside the kernels instead of failing with a diagnostic.
static void *xmalloc(size_t nbytes) {
    void *p = malloc(nbytes);
    if (!p) {
        fprintf(stderr, "bench: out of memory (%zu bytes)\n", nbytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

static void *xcalloc(size_t count, size_t size) {
    void *p = calloc(count, size);
    if (!p) {
        fprintf(stderr, "bench: out of memory (%zu x %zu bytes)\n",
                count, size);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Run the full matvec benchmark for one out_dim × in_dim weight matrix:
 *   1. quantize random FP32 weights to ternary; report sparsity and memory,
 *   2. verify the naive/packed/SIMD ternary kernels against an FP32 matvec
 *      over the dequantized ternary weights (identical effective weights),
 *   3. time FP32, naive, packed, and SIMD kernels (plus BLAS on Apple), and
 *   4. print throughput and, for the 2048×2048 case, METRIC lines for an
 *      external harness.
 * Output format is unchanged from the original.
 */
static void run_benchmark(int in_dim, int out_dim, int warmup, int iters) {
    printf("\n=== Benchmark: %d × %d ===\n", out_dim, in_dim);
    // Allocate (aborts on OOM)
    float *x = xmalloc((size_t)in_dim * sizeof(float));
    float *W = xmalloc((size_t)in_dim * out_dim * sizeof(float));
    float *out_fp = xcalloc((size_t)out_dim, sizeof(float));
    float *out_nv = xcalloc((size_t)out_dim, sizeof(float));
    float *out_pk = xcalloc((size_t)out_dim, sizeof(float));
    float *out_sm = xcalloc((size_t)out_dim, sizeof(float));
    fill_random(x, in_dim, 42);
    fill_random(W, in_dim * out_dim, 123);
    // Quantise weights to ternary
    int8_t *tw = xmalloc((size_t)in_dim * out_dim);
    float scale = quantize_absmean(W, tw, in_dim * out_dim);
    TernaryMatrix TW = ternary_matrix_pack(tw, scale, in_dim, out_dim);
    // Count sparsity (fraction of weights quantized to zero)
    int zeros = 0;
    for (int i = 0; i < in_dim * out_dim; i++)
        if (tw[i] == 0) zeros++;
    double sparsity = 100.0 * zeros / (in_dim * out_dim);
    // Memory comparison
    size_t fp32_bytes = (size_t)in_dim * out_dim * sizeof(float);
    size_t ternary_bits = (size_t)in_dim * out_dim * 2; // 2 bits per weight
    size_t ternary_bytes = (ternary_bits + 7) / 8;
    double compression = (double)fp32_bytes / ternary_bytes;
    printf(" Weight sparsity: %.1f%% zeros\n", sparsity);
    printf(" FP32 weight mem: %.2f KB\n", fp32_bytes / 1024.0);
    printf(" Ternary weight mem: %.2f KB (%.1fx compression)\n",
           ternary_bytes / 1024.0, compression);
    // ---- Correctness ----
    // FP32 reference over the dequantized ternary weights, so all kernels
    // are compared against identical effective weights.
    float *W_from_ternary = xmalloc((size_t)in_dim * out_dim * sizeof(float));
    for (int i = 0; i < in_dim * out_dim; i++)
        W_from_ternary[i] = (float)tw[i] * scale;
    matmul_fp32(out_fp, x, W_from_ternary, in_dim, out_dim);
    ternary_matvec_naive(x, tw, out_nv, in_dim, out_dim, scale);
    ternary_matvec_packed(x, &TW, out_pk);
    ternary_matvec_simd(x, &TW, out_sm);
    double err_naive = max_abs_diff(out_fp, out_nv, out_dim);
    double err_packed = max_abs_diff(out_fp, out_pk, out_dim);
    double err_simd = max_abs_diff(out_fp, out_sm, out_dim);
    printf(" Correctness (vs FP32-of-ternary):\n");
    printf(" naive: max_err = %.2e\n", err_naive);
    printf(" packed: max_err = %.2e\n", err_packed);
    printf(" simd: max_err = %.2e\n", err_simd);
    // ---- Timing ----
    // FP32 baseline times the ORIGINAL FP32 weights (not the dequantized
    // copy), matching the original benchmark's behavior.
    g_fp32_ctx = (FP32Ctx){.W = W, .in_dim = in_dim, .out_dim = out_dim};
    TimingResult t_fp32 = bench_fn(fp32_wrapper, x, &TW, out_fp, warmup, iters);
    // Naive ternary
    g_naive_ctx = (NaiveCtx){
        .w = tw, .scale = scale, .in_dim = in_dim, .out_dim = out_dim};
    TimingResult t_naive = bench_fn(naive_wrapper, x, &TW, out_nv, warmup, iters);
    // Packed ternary
    TimingResult t_packed = bench_fn(ternary_matvec_packed, x, &TW, out_pk,
                                     warmup, iters);
    // SIMD ternary
    TimingResult t_simd = bench_fn(ternary_matvec_simd, x, &TW, out_sm,
                                   warmup, iters);
    // BLAS baseline (Apple Accelerate, uses AMX)
#if HAS_BLAS
    TimingResult t_blas = bench_fn(blas_wrapper, x, &TW, out_fp, warmup, iters);
#endif
    printf(" Timing (median of %d iters, %d warmup):\n", iters, warmup);
#if HAS_BLAS
    printf(" Accelerate: %10.1f µs\n", t_blas.median_us);
#endif
    printf(" FP32 NEON: %10.1f µs\n", t_fp32.median_us);
    printf(" Ternary naive: %10.1f µs (%.2fx vs FP32)\n",
           t_naive.median_us, t_fp32.median_us / t_naive.median_us);
    printf(" Ternary packed:%10.1f µs (%.2fx vs FP32)\n",
           t_packed.median_us, t_fp32.median_us / t_packed.median_us);
    printf(" Ternary SIMD: %10.1f µs (%.2fx vs FP32)\n",
           t_simd.median_us, t_fp32.median_us / t_simd.median_us);
#if HAS_BLAS
    printf(" SIMD vs BLAS: %.2fx\n",
           t_blas.median_us / t_simd.median_us);
#endif
    // Compute GOPS (giga operations per second).
    // Each output element requires in_dim multiply-accumulates.
    double total_ops = (double)in_dim * out_dim;
    double simd_gops = total_ops / (t_simd.median_us * 1e3); // µs → ns → GOPS
    double fp32_gops = total_ops / (t_fp32.median_us * 1e3);
    printf(" Throughput:\n");
    printf(" FP32: %.1f GOPS\n", fp32_gops);
    printf(" Ternary SIMD: %.1f GOPS\n", simd_gops);
    // Structured output for autoresearch.
    // Primary metric: ternary SIMD median µs at 2048×2048.
    if (in_dim == 2048 && out_dim == 2048) {
        printf("\nMETRIC total_us=%.1f\n", t_simd.median_us);
        printf("METRIC fp32_us=%.1f\n", t_fp32.median_us);
        printf("METRIC naive_us=%.1f\n", t_naive.median_us);
        printf("METRIC packed_us=%.1f\n", t_packed.median_us);
        printf("METRIC simd_us=%.1f\n", t_simd.median_us);
        printf("METRIC speedup_vs_fp32=%.2f\n",
               t_fp32.median_us / t_simd.median_us);
        printf("METRIC sparsity_pct=%.1f\n", sparsity);
        printf("METRIC compression_ratio=%.1f\n", compression);
        printf("METRIC simd_gops=%.1f\n", simd_gops);
        printf("METRIC fp32_gops=%.1f\n", fp32_gops);
#if HAS_BLAS
        printf("METRIC blas_us=%.1f\n", t_blas.median_us);
        printf("METRIC speedup_vs_blas=%.2f\n",
               t_blas.median_us / t_simd.median_us);
#endif
    }
    free(x); free(W); free(W_from_ternary);
    free(out_fp); free(out_nv); free(out_pk); free(out_sm);
    free(tw); ternary_matrix_free(&TW);
}
/*
 * Benchmark driver: runs the matvec benchmark at four sizes, then a batched
 * ternary GEMM sweep, then two end-to-end transformer forward-pass
 * benchmarks (a small and a larger config). METRIC lines in the output are
 * structured for an external harness to parse.
 */
int main(void) {
printf("1-Bit Inference Engine — Ternary MatVec Benchmark\n");
printf("==================================================\n");
// Primary benchmark: 2048×2048 (typical transformer hidden dim)
run_benchmark(2048, 2048, 5, 50);
// Additional sizes for scaling analysis (fewer iterations at larger sizes)
run_benchmark(768, 768, 5, 50);
run_benchmark(4096, 4096, 3, 20);
run_benchmark(8192, 8192, 2, 10);
// ============================================================
// GEMM benchmark (batch matmul)
// ============================================================
printf("\n\n=== Ternary GEMM (Batch MatMul) ===\n");
{
int in_d = 2048, out_d = 2048;
int batches[] = {1, 2, 4, 8, 16};
int nbatches = 5;
// Set up ternary weights.
// NOTE(review): these malloc results are unchecked — consider the same
// OOM handling used elsewhere in this file.
float *W_fp = (float *)malloc((size_t)in_d * out_d * sizeof(float));
fill_random(W_fp, in_d * out_d, 77);
int8_t *tw_g = (int8_t *)malloc((size_t)in_d * out_d);
float sc = quantize_absmean(W_fp, tw_g, in_d * out_d);
TernaryMatrix TW_g = ternary_matrix_pack(tw_g, sc, in_d, out_d);
for (int bi = 0; bi < nbatches; bi++) {
int batch = batches[bi];
float *X = (float *)malloc((size_t)batch * in_d * sizeof(float));
float *C = (float *)calloc((size_t)batch * out_d, sizeof(float));
fill_random(X, batch * in_d, 42 + batch);
// Warmup (untimed)
for (int w = 0; w < 3; w++) ternary_gemm(X, &TW_g, C, batch, in_d, out_d);
// Benchmark: mean over `iters` runs (no median here, unlike bench_fn)
int iters = 20;
double t0g = now_ns();
for (int it = 0; it < iters; it++)
ternary_gemm(X, &TW_g, C, batch, in_d, out_d);
double t1g = now_ns();
double gemm_us = (t1g - t0g) / iters / 1e3; // ns → µs, per call
double per_row_us = gemm_us / batch;
double total_ops = (double)batch * in_d * out_d;
double gops = total_ops / (gemm_us * 1e3);
printf(" batch=%2d: %7.0f µs total, %6.1f µs/row, %5.1f GOPS",
batch, gemm_us, per_row_us, gops);
if (batch >= 2)
// NOTE(review): 45.0 looks like a hard-coded batch=1 SDOT baseline in
// µs/row — confirm it still matches the measured batch=1 result.
printf(" (%.1fx vs batch=1 SDOT)", 45.0 / per_row_us);
printf("\n");
// Emit METRIC lines for the batch=8 point only.
if (batch == 8) {
printf("\nMETRIC gemm_b8_us=%.0f\n", gemm_us);
printf("METRIC gemm_b8_per_row_us=%.1f\n", per_row_us);
printf("METRIC gemm_b8_gops=%.1f\n", gops);
}
free(X); free(C);
}
free(W_fp); free(tw_g); ternary_matrix_free(&TW_g);
}
// ============================================================
// Transformer forward pass benchmark
// ============================================================
printf("\n\n=== Transformer Forward Pass ===\n");
{
// Small model: 6 layers, dim=512, 8 heads, vocab=1000
TransformerConfig cfg = {
.dim = 512, .hidden_dim = 1376, // ≈ 8/3 × 512
.n_heads = 8, .head_dim = 64,
.n_layers = 6, .vocab_size = 1000,
.max_seq_len = 128,
};
printf(" Config: %d layers, dim=%d, %d heads, vocab=%d\n",
cfg.n_layers, cfg.dim, cfg.n_heads, cfg.vocab_size);
TransformerWeights tw = alloc_random_weights(&cfg, 42);
RunState rs = alloc_run_state(&cfg);
// Count ternary parameters: per layer, the four attention projections
// plus the FFN up/down matrices.
long ternary_params = 0;
for (int l = 0; l < cfg.n_layers; l++) {
ternary_params += (long)cfg.dim * cfg.dim * 4; // Q,K,V,O
ternary_params += (long)cfg.dim * cfg.hidden_dim; // up
ternary_params += (long)cfg.hidden_dim * cfg.dim; // down
}
long fp32_params = (long)cfg.vocab_size * cfg.dim * 2; // emb + output
printf(" Ternary params: %.1fM FP32 params: %.1fM\n",
ternary_params / 1e6, fp32_params / 1e6);
// 1 byte per INT8 weight, so the param count doubles as a byte count.
printf(" Ternary weight mem: %.1f MB (INT8 expanded)\n",
ternary_params / 1e6);
// Warmup: fills KV-cache positions 0..2
for (int i = 0; i < 3; i++) forward(&cfg, &tw, &rs, 42, i);
// Benchmark: generate 10 tokens autoregressively with argmax sampling
int gen_tokens = 10;
int token = 42; // seed token
printf(" Generating %d tokens: [%d", gen_tokens, token);
double t0 = now_ns();
for (int t = 0; t < gen_tokens; t++) {
// Position 3 + t continues after the 3 warmup positions.
float *logits = forward(&cfg, &tw, &rs, token, 3 + t);
token = argmax(logits, cfg.vocab_size);
printf(", %d", token);
}
double t1 = now_ns();
printf("]\n");
double total_ms = (t1 - t0) / 1e6;
double per_token_ms = total_ms / gen_tokens;
double tok_per_sec = 1000.0 / per_token_ms;
printf(" Generated %d tokens in %.1f ms\n", gen_tokens, total_ms);
printf(" Per token: %.2f ms (%.0f tokens/sec)\n",
per_token_ms, tok_per_sec);
// Sanity check: final logits must contain at least one non-zero,
// non-NaN entry.
float *logits = rs.logits;
int ok = 0;
for (int i = 0; i < cfg.vocab_size; i++) {
if (logits[i] != 0.0f && !isnan(logits[i])) { ok = 1; break; }
}
printf(" Logits sanity: %s\n", ok ? "OK (non-zero, non-NaN)" : "FAIL");
printf("\nMETRIC fwd_token_ms=%.2f\n", per_token_ms);
printf("METRIC fwd_tok_per_sec=%.0f\n", tok_per_sec);
free_run_state(&rs);
free_weights(&tw, &cfg);
}
// Larger model: closer to real LLM scale
printf("\n=== Transformer Forward Pass (Larger) ===\n");
{
TransformerConfig cfg = {
.dim = 2048, .hidden_dim = 5504, // ≈ 8/3 × 2048
.n_heads = 16, .head_dim = 128,
.n_layers = 12, .vocab_size = 1000,
.max_seq_len = 64,
};
printf(" Config: %d layers, dim=%d, %d heads, vocab=%d\n",
cfg.n_layers, cfg.dim, cfg.n_heads, cfg.vocab_size);
TransformerWeights tw = alloc_random_weights(&cfg, 99);
RunState rs = alloc_run_state(&cfg);
// Same per-layer parameter count as the small-model section above.
long ternary_params = 0;
for (int l = 0; l < cfg.n_layers; l++) {
ternary_params += (long)cfg.dim * cfg.dim * 4;
ternary_params += (long)cfg.dim * cfg.hidden_dim;
ternary_params += (long)cfg.hidden_dim * cfg.dim;
}
long i8_bytes = ternary_params; // 1 byte per INT8 weight
long packed_bytes = ternary_params / 4; // 2 bits per weight
long fp32_eq_bytes = ternary_params * 4; // what FP32 would use
printf(" Ternary params: %.0fM\n", ternary_params / 1e6);
printf(" Weight memory: %.1f MB (INT8) / %.1f MB (2-bit packed) / %.1f MB (if FP32)\n",
i8_bytes / 1e6, packed_bytes / 1e6, fp32_eq_bytes / 1e6);
// Warmup: fills KV-cache positions 0..1
for (int i = 0; i < 2; i++) forward(&cfg, &tw, &rs, 42, i);
// Generate tokens with profiling enabled.
// NOTE(review): these externs presumably belong in transformer.h —
// declaring them inside main() bypasses header checking; confirm.
extern void forward_enable_profile(void);
extern void forward_get_profile(double *, double *, double *);
forward_enable_profile();
int gen_tokens = 20;
double t0 = now_ns();
for (int t = 0; t < gen_tokens; t++) {
forward(&cfg, &tw, &rs, 42, 2 + t);
}
double t1 = now_ns();
double total_ms = (t1 - t0) / 1e6;
double per_token_ms = total_ms / gen_tokens;
double tok_per_sec = 1000.0 / per_token_ms;
// Per-phase timings reported by the profiling hooks (in µs).
double matmul_us, attn_us, other_us;
forward_get_profile(&matmul_us, &attn_us, &other_us);
double prof_total = matmul_us + attn_us + other_us;
printf(" Per token: %.2f ms (%.0f tokens/sec)\n",
per_token_ms, tok_per_sec);
printf(" Profile (last token, %d layers):\n", cfg.n_layers);
printf(" Ternary matmul: %7.0f µs (%.0f%%)\n",
matmul_us, 100.0 * matmul_us / prof_total);
printf(" Attention (RoPE+MHA): %5.0f µs (%.0f%%)\n",
attn_us, 100.0 * attn_us / prof_total);
printf(" Other (norms+residuals): %5.0f µs (%.0f%%)\n",
other_us, 100.0 * other_us / prof_total);
// Weaker sanity check than the small-model section: only logits[0].
printf(" Logits sanity: %s\n",
(rs.logits[0] != 0.0f && !isnan(rs.logits[0])) ? "OK" : "FAIL");
printf("\nMETRIC fwd_large_ms=%.2f\n", per_token_ms);
printf("METRIC fwd_large_tps=%.0f\n", tok_per_sec);
free_run_state(&rs);
free_weights(&tw, &cfg);
}
return 0;
}