PolyhedraZK · slzk · Jun 28, 2025 · Jun 28, 2025 · Jun 28, 2025 · Jun 28, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,6 +12,7 @@ members = [
     "config_macros", # proc macros used to declare a new config, this has to a separate crate due to rust compilation issues
     "gkr",
     "gkr_engine", # definitions of GKR engine and associated types
+    "gpu", # GPU support and circuit serialization
     "hasher", # definitions of FiatShamirFieldHasher, FiatShamirBytesHash, and associated types
     "poly_commit",
     "serdes", # serialization and deserialization of various data structures

diff --git a/bin/src/main.rs b/bin/src/main.rs
@@ -8,9 +8,9 @@ use circuit::Circuit;
 use clap::Parser;
 use gkr::{
     BN254ConfigMIMC5KZG, BN254ConfigSha2Hyrax, BN254ConfigSha2Raw, GF2ExtConfigSha2Orion,
-    GF2ExtConfigSha2Raw, Goldilocksx8ConfigSha2Orion, Goldilocksx8ConfigSha2Raw,
-    M31x1ConfigSha2RawVanilla, M31x16ConfigSha2OrionSquare, M31x16ConfigSha2OrionVanilla,
-    M31x16ConfigSha2RawSquare, M31x16ConfigSha2RawVanilla, Prover,
+    GF2ExtConfigSha2Raw, Goldilocksx1ConfigSha2Raw, Goldilocksx8ConfigSha2Orion,
+    Goldilocksx8ConfigSha2Raw, M31x1ConfigSha2RawVanilla, M31x16ConfigSha2OrionSquare,
+    M31x16ConfigSha2OrionVanilla, M31x16ConfigSha2RawSquare, M31x16ConfigSha2RawVanilla, Prover,
     utils::{
         KECCAK_BABYBEAR_CIRCUIT, KECCAK_BABYBEAR_WITNESS, KECCAK_BN254_CIRCUIT,
         KECCAK_BN254_WITNESS, KECCAK_GF2_CIRCUIT, KECCAK_GF2_WITNESS, KECCAK_GOLDILOCKS_CIRCUIT,
@@ -69,7 +69,13 @@ fn main() {
 
         "m31ext3" => match pcs_type {
             PolynomialCommitmentType::Raw => match args.circuit.as_str() {
-                "keccak" => run_benchmark::<M31x16ConfigSha2RawVanilla>(&args, mpi_config.clone()),
+                "keccak" => {
+                    if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
+                        run_benchmark::<M31x1ConfigSha2RawVanilla>(&args, mpi_config.clone())
+                    } else {
+                        run_benchmark::<M31x16ConfigSha2RawVanilla>(&args, mpi_config.clone())
+                    }
+                }
                 "poseidon" => run_benchmark::<M31x16ConfigSha2RawSquare>(&args, mpi_config.clone()),
                 _ => unreachable!(),
             },
@@ -112,7 +118,13 @@ fn main() {
         },
         "goldilocks" => match pcs_type {
             PolynomialCommitmentType::Raw => match args.circuit.as_str() {
-                "keccak" => run_benchmark::<Goldilocksx8ConfigSha2Raw>(&args, mpi_config.clone()),
+                "keccak" => {
+                    if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
+                        run_benchmark::<Goldilocksx1ConfigSha2Raw>(&args, mpi_config.clone())
+                    } else {
+                        run_benchmark::<Goldilocksx8ConfigSha2Raw>(&args, mpi_config.clone())
+                    }
+                }
                 _ => unreachable!(),
             },
             PolynomialCommitmentType::Orion => match args.circuit.as_str() {
@@ -206,6 +218,7 @@ where
         (FieldType::M31x1, "keccak") => 2,
         (FieldType::M31x16, "keccak") => 2,
         (FieldType::M31x16, "poseidon") => 120,
+        (FieldType::Goldilocksx1, "keccak") => 2,
         (FieldType::Goldilocksx8, "keccak") => 2,
         (FieldType::BabyBearx16, "keccak") => 2,
         (FieldType::BN254, "keccak") => 2,

diff --git a/gkr/Cargo.toml b/gkr/Cargo.toml
@@ -13,6 +13,7 @@ gf2_128 = { path = "../arith/gf2_128" }
 gkr_engine = { path = "../gkr_engine" }
 gkr_hashers = { path = "../hasher" }
 goldilocks = { path = "../arith/goldilocks" }
+gpu = { path = "../gpu" }
 mersenne31 = { path = "../arith/mersenne31" }
 poly_commit = { path = "../poly_commit" }
 polynomials = { path = "../arith/polynomials" }

diff --git a/gkr/src/prover/gkr_vanilla.rs b/gkr/src/prover/gkr_vanilla.rs
@@ -14,7 +14,11 @@ pub fn gkr_prove<F: FieldEngine>(
     sp: &mut ProverScratchPad<F>,
     transcript: &mut impl Transcript,
     mpi_config: &MPIConfig,
-) -> (F::ChallengeField, ExpanderDualVarChallenge<F>) {
+) -> (F::ChallengeField, ExpanderDualVarChallenge<F>)
+where
+    F::CircuitField: std::fmt::Debug,
+    F::SimdCircuitField: std::fmt::Debug,
+{
     let layer_num = circuit.layers.len();
 
     let mut challenge: ExpanderDualVarChallenge<F> =
@@ -36,6 +40,21 @@ pub fn gkr_prove<F: FieldEngine>(
         mpi_config,
     );
 
+    // Serialize circuit to file if EXPANDER_GPU environment variable is set to 1
+    if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
+        // Only let rank 0 process handle serialization
+        if mpi_config.is_root() {
+            if let Err(e) =
+                gpu::serdes::serial_circuit_witness_as_plaintext(circuit, transcript, &challenge)
+            {
+                println!("Failed to serialize circuit: {e}");
+            }
+        }
+    }
+
+    let mut final_vx_claim = None;
+    let mut final_vy_claim = None;
+
     for i in (0..layer_num).rev() {
         let timer = Timer::new(
             &format!(
@@ -47,7 +66,7 @@ pub fn gkr_prove<F: FieldEngine>(
             mpi_config.is_root(),
         );
 
-        (_, _) = sumcheck_prove_gkr_layer(
+        let (vx_claim, vy_claim) = sumcheck_prove_gkr_layer(
             &circuit.layers[i],
             &mut challenge,
             alpha,
@@ -57,6 +76,12 @@ pub fn gkr_prove<F: FieldEngine>(
             i == layer_num - 1,
         );
 
+        // Store the final layer claims for later use
+        if i == 0 {
+            final_vx_claim = Some(vx_claim);
+            final_vy_claim = vy_claim;
+        }
+
         if challenge.rz_1.is_some() {
             // TODO: try broadcast beta.unwrap directly
             let mut tmp = transcript.generate_field_element::<F::ChallengeField>();
@@ -68,5 +93,18 @@ pub fn gkr_prove<F: FieldEngine>(
         timer.stop();
     }
 
+    // Print final claims if EXPANDER_GPU environment variable is set to 1
+    if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
+        // Only let rank 0 process handle printing final claims
+        if mpi_config.is_root() {
+            if let Some(vx) = final_vx_claim {
+                gpu::serdes::print_final_claims::<F>(&vx, &final_vy_claim);
+                println!("GKR final proof claims as shown above.");
+            }
+        }
+        // For GPU mode, we'll let the program continue and exit naturally
+        // This allows MPI to properly clean up
+    }
+
     (claimed_v, challenge)
 }
diff --git a/gpu/.gitignore b/gpu/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/gpu/Cargo.toml b/gpu/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "gpu"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+circuit = { path = "../circuit" }
+gkr_engine = { path = "../gkr_engine" }
+gkr_hashers = { path = "../hasher" }
+transcript = { path = "../transcript" }
+thiserror.workspace = true 
diff --git a/gpu/Makefile b/gpu/Makefile
@@ -0,0 +1,63 @@
+# Makefile for Expander-GPU project
+#
+# Overridable variables:
+#   PROFILE_LEVEL  value passed to --profile by the profile targets
+#   DO_ALL         1 = run every supported field; any other value = run only FIELD_TYPE
+#   FIELD_TYPE     field to run when DO_ALL is not 1; must be one of FIELDS
+PROFILE_LEVEL ?= 0
+DO_ALL ?= 1
+FIELD_TYPE ?= m31ext3
+
+# Supported field types
+FIELDS := m31ext3 goldilocksext2 bn254
+
+# Generate circuit file names for all supported field types
+CIRCUIT_FILES := $(addprefix data/keccak_,$(addsuffix .gpu.circuit,$(FIELDS)))
+
+# Base command template
+# Usage: $(call run_field,FIELD,MPI_LEN,RUN_FLAGS)
+define run_field
+./expander-gpu --field $(1) --circuit data/keccak_$(1).gpu.circuit --log_level 0 --mpi_len $(2) $(3)
+endef
+
+# Run expander for every supported field (DO_ALL=1) or only for FIELD_TYPE
+# (otherwise); in the latter mode an unsupported FIELD_TYPE aborts make with an error.
+# Usage: $(call run_expander,MPI_LEN,RUN_FLAGS)
+define run_expander
+	$(if $(filter 1,$(DO_ALL)), \
+		$(call run_field,m31ext3,$(1),$(2)); \
+		$(call run_field,goldilocksext2,$(1),$(2)); \
+		$(call run_field,bn254,$(1),$(2)), \
+		$(if $(filter $(FIELD_TYPE),$(FIELDS)), \
+			$(call run_field,$(FIELD_TYPE),$(1),$(2)), \
+			$(error Invalid FIELD_TYPE '$(FIELD_TYPE)'. Must be one of: $(FIELDS))))
+endef
+
+# `data` is declared phony because a directory with that name exists once the
+# circuit files have been generated.
+.PHONY: clean data test profile mpi-profile mpi-test
+
+# When circuit files are missing, guide user to run the prepare script.
+data/keccak_%.gpu.circuit:
+	@echo "Error: Circuit file '$@' is missing."
+	@echo "Please run './prepare-data.sh' to generate necessary circuit files."
+	@exit 1
+
+# Check that every circuit data file exists (generation is done by prepare-data.sh)
+data: $(CIRCUIT_FILES)
+	@echo "All circuit data files are ready."
+
+test: data
+	$(call run_expander,128,)
+
+profile: data
+	$(call run_expander,8192,--enable-same-input --profile $(PROFILE_LEVEL))
+
+mpi-test: data
+	$(call run_expander,128,--enable-mpi-merge)
+
+mpi-profile: data
+	$(call run_expander,16384,--enable-same-input --profile $(PROFILE_LEVEL) --enable-mpi-merge)
+
+clean:
+	rm -rf data
diff --git a/gpu/expander-gpu b/gpu/expander-gpu
diff --git a/gpu/prepare-data.sh b/gpu/prepare-data.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Generate the keccak circuit and witness files for Expander-GPU.
+# Everything is created under ./data relative to the current directory.
+
+# Stop at the first failing command so that a failed `cd`, clone, or build
+# cannot make later steps (including `rm -rf`) run in the wrong directory.
+set -euo pipefail
+
+# Create data folder (reusing it if it already exists)
+mkdir -p data
+cd data
+
+# Download the two repos needed to generate the circuit and witness for GPU
+git clone git@github.com:PolyhedraZK/ExpanderCompilerCollection.git
+git clone git@github.com:PolyhedraZK/Expander.git
+
+# Use Expander Compiler to generate Circuit and Witness for Expander
+cd ExpanderCompilerCollection
+cargo test --release keccak
+
+# Move data to Expander
+mkdir -p ../Expander/data
+cp expander_compiler/*.txt ../Expander/data
+cd ../Expander
+
+# Use Expander's GPU serialization to produce circuit and witness for GPU usage
+git checkout gpu-expander
+EXPANDER_GPU=1 RUSTFLAGS="-C target-cpu=native -C target-feature=+avx512f" cargo run --release --bin=gkr -- --circuit keccak --pcs Raw --threads 1 --field m31ext3
+EXPANDER_GPU=1 RUSTFLAGS="-C target-cpu=native -C target-feature=+avx512f" cargo run --release --bin=gkr -- --circuit keccak --pcs Raw --threads 1 --field goldilocks
+EXPANDER_GPU=1 RUSTFLAGS="-C target-cpu=native -C target-feature=+avx512f" cargo run --release --bin=gkr -- --circuit keccak --pcs Raw --threads 1 --field fr
+mv data/*.gpu.* ..
+cd ..
+
+# Remove both cloned repos; only the generated *.gpu.* files stay in data/
+rm -rf ExpanderCompilerCollection Expander
+cd ..
diff --git a/gpu/readme.md b/gpu/readme.md
@@ -0,0 +1,131 @@
+<div align="center" style="width: 100%;">
+  <img 
+    src="https://expander.polyhedra.network/assets/static/logo-with-text.16d5af29.svg" 
+    alt="Expander Logo"
+    style="width: 400px; height: auto;"
+  />
+</div>
+
+
+# Expander GPU Acceleration
+
+Expander is a proof generation backend for Polyhedra Network. It aims to support fast proof generation.
+
+Expander now includes a high-performance GPU backend powered by CUDA, designed to dramatically accelerate proof generation. This backend is optimized for NVIDIA GPUs and offers significant speedups, especially for complex circuits and large-scale computations.
+
+### Key Features
+
+- **Massive Parallelism**: Leverages the full power of modern GPUs to process thousands of proofs in parallel.
+- **MPI Merge**: Introduces an innovative "MPI Merge" feature that can compress proofs from thousands of independent computations into a single, compact proof. In our tests, we've achieved a compression ratio of up to `16384:1`. This is particularly useful in scenarios with large batches of similar computations.
+- **Broad Field Support**: The GPU backend supports multiple field types, including `BN254`, `Goldilocks`, and `M31`.
+
+### System Requirements
+
+- **NVIDIA GPU**: A CUDA-enabled NVIDIA GPU with compute capability 7.0+ is recommended.
+- **CUDA Toolkit**: Version 12.5 or newer.
+- **Compiler**: `clang` and `clang++`.
+- **Build Tools**: `cmake` (version 3.18+) and `ninja`.
+
+### Build Instructions
+
+The current release of Expander-GPU is in binary form. Please contact us if you are interested in source code access.
+
+## GPU Benchmarks
+
+The GPU backend delivers substantial performance improvements over the CPU implementation. The following benchmarks were run on an NVIDIA GPU, showcasing the throughput for various configurations.
+
+### Performance Results
+
+| Field            | Throughput (8192 proofs) | Throughput (16384 proofs, MPI Merged) |
+|------------------|--------------------------|---------------------------------------|
+| `m31ext3`        | ~2788 proofs/sec         | ~3040 computations/sec                |
+| `goldilocksext2` | ~2597 proofs/sec         | ~2255 computations/sec                |
+| `bn254`          | ~1313 proofs/sec         | ~1525 computations/sec                |
+
+**Note on BN254 Performance**: The GPU acceleration is particularly impactful for the `BN254` field. Compared to our highly optimized AVX512 CPU backend running on an AMD 9950X3D, **the GPU implementation provides a 7-10x speedup**, achieving over 1500 merged computations per second. This makes Expander an ideal choice for ZK applications built on Ethereum-friendly curves.
+
+### Running Benchmarks Manually and Profiling
+
+You can reproduce these benchmarks using the `Makefile`:
+
+```sh
+# Run standard benchmark with 8192 parallel proofs
+make profile
+
+# Run benchmark with 16384 parallel proofs and MPI merge enabled
+make mpi-profile
+
+# Run standard benchmark with detailed profiling data
+make profile PROFILE_LEVEL=2
+
+# Run MPI-merge benchmark with detailed profiling data
+make mpi-profile PROFILE_LEVEL=2
+```
+
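+You can override the `FIELD_TYPE` and `PROFILE_LEVEL` variables defined in the `Makefile` to test different configurations. Note that `FIELD_TYPE` only takes effect when `DO_ALL` is not `1` (for example, `make profile DO_ALL=0 FIELD_TYPE=bn254`); by default all supported fields are run. You should see a detailed profiling report like the one below.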
+
+```
+====== GKR System Initialization ======
+Parsed RZ0 Challenge: 0x128e207ced0a98b1401e2e521465544111847e131de192a5f527ecbd1611d6b0
+
+GPU Memory Allocation Summary:
+  Circuit:      29.47 MB (30898000 B)
+  Transcript:   4.03 GB (4330817408 B)
+  Scratchpad:   14.25 GB (15303180288 B)
+  Total:        18.31 GB (19664895696 B)
+
+MPI Merge Status:
+  MPI Length:           8192 (independent computations)
+  Number of Proofs:     8192 (final transcripts)
+  MPI Merge Enabled:    NO
+
+System Configuration:
+  Circuit Layers:       144 layers
+  MPI Length:           8192
+  Enable MPI Merge:     false
+  Field Type:           bn254
+  Fiat-Shamir Type:     sha2-256
+  Max Input Variables:  13
+  Max Output Variables: 13
+
+Prove Done! Final Claims:
+  vx_claim = [0x08d2107f3419f056dda4310fd9de72a8eca95840b26a20068d70262ea9495086]
+  vy_claim = [0x18e99e28f39df8da3cea05e6382991fa69fb57053d500112de6dab091267656c]
+
+====== GKR Hierarchical Profiling Results (with GPU timing) ======
+Function Name                            Call Count   Total Time (s)  Avg Time (ms)   % of Total
+---------------------------------------- ------------ --------------- --------------- ----------
+Sumcheck                                 287          2.790811        9.724           49.89    %
+  - receive_challange                    3513         1.292797        0.368           23.11    %
+  - poly_eval_at                         3513         1.140716        0.325           20.39    %
+  - Fiat-shamir(sumcheck)                3513         0.329160        0.094           5.88     %
+  - Apply phase 2 coef                   1754         0.021809        0.012           0.39     %
+Prepare H(x)                             144          1.603928        11.138          28.67    %
+  - eq_eval_at                           287          0.682776        2.379           12.21    %
+    - eq_eval_combine                    287          0.388053        1.352           6.94     %
+    - scatter_to_build_eq_buf            3504         0.241123        0.069           4.31     %
+    - scatter_to_first_element           574          0.034887        0.061           0.62     %
+  - build_hgx_mult_and_add               144          0.616053        4.278           11.01    %
+    - build_hgx_mult                     143          0.376732        2.634           6.74     %
+    - build_hgx_add                      144          0.237986        1.653           4.25     %
+  - acc_from_rx_to_rz0                   142          0.379750        2.674           6.79     %
+  - memset_clear_x_vals                  143          0.275147        1.924           4.92     %
+Prepare H(y)                             143          1.182382        8.268           21.14    %
+  - build_hgy_mult_only                  143          0.463457        3.241           8.29     %
+  - memset_clear_y_vals                  143          0.367486        2.570           6.57     %
+Fiat-shamir(gkr)                         717          0.016354        0.023           0.29     %
+TOTAL                                    -            5.593475        -               100.00%   
+=============================================
+
+====== Expander-GPU Performance Metrics ======
+Field element type:   bn254
+Fiat-shamir type:     sha2-256
+GKR proof size:       379232 bytes
+GKR proof time:       5.594314 seconds
+Proofs per second:    1464.34 proof/sec
+```
+
+## Acknowledgments
+
+The code of Expander-GPU is derived from the [ICICLE project](https://github.com/ingonyama-zk/icicle). 
+We are grateful to the ICICLE team for their contributions to the community, providing efficient field element operations on GPU that enable high-performance cryptographic computations.