Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ members = [
"config_macros", # proc macros used to declare a new config, this has to a separate crate due to rust compilation issues
"gkr",
"gkr_engine", # definitions of GKR engine and associated types
"gpu", # GPU support and circuit serialization
"hasher", # definitions of FiatShamirFieldHasher, FiatShamirBytesHash, and associated types
"poly_commit",
"serdes", # serialization and deserialization of various data structures
Expand Down
23 changes: 18 additions & 5 deletions bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ use circuit::Circuit;
use clap::Parser;
use gkr::{
BN254ConfigMIMC5KZG, BN254ConfigSha2Hyrax, BN254ConfigSha2Raw, GF2ExtConfigSha2Orion,
GF2ExtConfigSha2Raw, Goldilocksx8ConfigSha2Orion, Goldilocksx8ConfigSha2Raw,
M31x1ConfigSha2RawVanilla, M31x16ConfigSha2OrionSquare, M31x16ConfigSha2OrionVanilla,
M31x16ConfigSha2RawSquare, M31x16ConfigSha2RawVanilla, Prover,
GF2ExtConfigSha2Raw, Goldilocksx1ConfigSha2Raw, Goldilocksx8ConfigSha2Orion,
Goldilocksx8ConfigSha2Raw, M31x1ConfigSha2RawVanilla, M31x16ConfigSha2OrionSquare,
M31x16ConfigSha2OrionVanilla, M31x16ConfigSha2RawSquare, M31x16ConfigSha2RawVanilla, Prover,
utils::{
KECCAK_BABYBEAR_CIRCUIT, KECCAK_BABYBEAR_WITNESS, KECCAK_BN254_CIRCUIT,
KECCAK_BN254_WITNESS, KECCAK_GF2_CIRCUIT, KECCAK_GF2_WITNESS, KECCAK_GOLDILOCKS_CIRCUIT,
Expand Down Expand Up @@ -69,7 +69,13 @@ fn main() {

"m31ext3" => match pcs_type {
PolynomialCommitmentType::Raw => match args.circuit.as_str() {
"keccak" => run_benchmark::<M31x16ConfigSha2RawVanilla>(&args, mpi_config.clone()),
"keccak" => {
if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
run_benchmark::<M31x1ConfigSha2RawVanilla>(&args, mpi_config.clone())
} else {
run_benchmark::<M31x16ConfigSha2RawVanilla>(&args, mpi_config.clone())
}
}
"poseidon" => run_benchmark::<M31x16ConfigSha2RawSquare>(&args, mpi_config.clone()),
_ => unreachable!(),
},
Expand Down Expand Up @@ -112,7 +118,13 @@ fn main() {
},
"goldilocks" => match pcs_type {
PolynomialCommitmentType::Raw => match args.circuit.as_str() {
"keccak" => run_benchmark::<Goldilocksx8ConfigSha2Raw>(&args, mpi_config.clone()),
"keccak" => {
if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
run_benchmark::<Goldilocksx1ConfigSha2Raw>(&args, mpi_config.clone())
} else {
run_benchmark::<Goldilocksx8ConfigSha2Raw>(&args, mpi_config.clone())
}
}
_ => unreachable!(),
},
PolynomialCommitmentType::Orion => match args.circuit.as_str() {
Expand Down Expand Up @@ -206,6 +218,7 @@ where
(FieldType::M31x1, "keccak") => 2,
(FieldType::M31x16, "keccak") => 2,
(FieldType::M31x16, "poseidon") => 120,
(FieldType::Goldilocksx1, "keccak") => 2,
(FieldType::Goldilocksx8, "keccak") => 2,
(FieldType::BabyBearx16, "keccak") => 2,
(FieldType::BN254, "keccak") => 2,
Expand Down
1 change: 1 addition & 0 deletions gkr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ gf2_128 = { path = "../arith/gf2_128" }
gkr_engine = { path = "../gkr_engine" }
gkr_hashers = { path = "../hasher" }
goldilocks = { path = "../arith/goldilocks" }
gpu = { path = "../gpu" }
mersenne31 = { path = "../arith/mersenne31" }
poly_commit = { path = "../poly_commit" }
polynomials = { path = "../arith/polynomials" }
Expand Down
42 changes: 40 additions & 2 deletions gkr/src/prover/gkr_vanilla.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ pub fn gkr_prove<F: FieldEngine>(
sp: &mut ProverScratchPad<F>,
transcript: &mut impl Transcript,
mpi_config: &MPIConfig,
) -> (F::ChallengeField, ExpanderDualVarChallenge<F>) {
) -> (F::ChallengeField, ExpanderDualVarChallenge<F>)
where
F::CircuitField: std::fmt::Debug,
F::SimdCircuitField: std::fmt::Debug,
{
let layer_num = circuit.layers.len();

let mut challenge: ExpanderDualVarChallenge<F> =
Expand All @@ -36,6 +40,21 @@ pub fn gkr_prove<F: FieldEngine>(
mpi_config,
);

// Serialize circuit to file if EXPANDER_GPU environment variable is set to 1
if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
// Only let rank 0 process handle serialization
if mpi_config.is_root() {
if let Err(e) =
gpu::serdes::serial_circuit_witness_as_plaintext(circuit, transcript, &challenge)
{
println!("Failed to serialize circuit: {e}");
}
}
}

let mut final_vx_claim = None;
let mut final_vy_claim = None;

for i in (0..layer_num).rev() {
let timer = Timer::new(
&format!(
Expand All @@ -47,7 +66,7 @@ pub fn gkr_prove<F: FieldEngine>(
mpi_config.is_root(),
);

(_, _) = sumcheck_prove_gkr_layer(
let (vx_claim, vy_claim) = sumcheck_prove_gkr_layer(
&circuit.layers[i],
&mut challenge,
alpha,
Expand All @@ -57,6 +76,12 @@ pub fn gkr_prove<F: FieldEngine>(
i == layer_num - 1,
);

// Store the final layer claims for later use
if i == 0 {
final_vx_claim = Some(vx_claim);
final_vy_claim = vy_claim;
}

if challenge.rz_1.is_some() {
// TODO: try broadcast beta.unwrap directly
let mut tmp = transcript.generate_field_element::<F::ChallengeField>();
Expand All @@ -68,5 +93,18 @@ pub fn gkr_prove<F: FieldEngine>(
timer.stop();
}

// Print final claims if EXPANDER_GPU environment variable is set to 1
if std::env::var("EXPANDER_GPU").is_ok_and(|v| v == "1") {
// Only let rank 0 process handle printing final claims
if mpi_config.is_root() {
if let Some(vx) = final_vx_claim {
gpu::serdes::print_final_claims::<F>(&vx, &final_vy_claim);
println!("GKR final proof claims as shown above.");
}
}
// For GPU mode, we'll let the program continue and exit naturally
// This allows MPI to properly clean up
}

(claimed_v, challenge)
}
1 change: 1 addition & 0 deletions gpu/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data
11 changes: 11 additions & 0 deletions gpu/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "gpu"
version = "0.1.0"
edition = "2021"

[dependencies]
circuit = { path = "../circuit" }
gkr_engine = { path = "../gkr_engine" }
gkr_hashers = { path = "../hasher" }
transcript = { path = "../transcript" }
thiserror.workspace = true
55 changes: 55 additions & 0 deletions gpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Makefile for Expander-GPU project
#
# Variables (override on the command line, e.g.
# `make profile DO_ALL=0 FIELD_TYPE=bn254 PROFILE_LEVEL=2`):
#   PROFILE_LEVEL  value passed to `--profile` by the profile targets
#   DO_ALL         1 = run every supported field; any other value = run
#                  only FIELD_TYPE
#   FIELD_TYPE     field to run when DO_ALL is not 1; must be one of FIELDS
PROFILE_LEVEL ?= 0
DO_ALL ?= 1
FIELD_TYPE ?= m31ext3

# Supported field types
FIELDS := m31ext3 goldilocksext2 bn254

# Generate circuit file names for all supported field types
CIRCUIT_FILES := $(addprefix data/keccak_,$(addsuffix .gpu.circuit,$(FIELDS)))

# Base command template
# Usage: $(call run_field,FIELD,MPI_LEN,RUN_FLAGS)
define run_field
	./expander-gpu --field $(1) --circuit data/keccak_$(1).gpu.circuit --log_level 0 --mpi_len $(2) $(3)
endef

# Helper function to run expander with different field types
# Usage: $(call run_expander,MPI_LEN,RUN_FLAGS)
# NOTE: with DO_ALL=1 the three runs are chained with ';', so a failing run
# does not stop the following ones and the recipe's exit status is that of
# the last run only.
define run_expander
	$(if $(filter 1,$(DO_ALL)), \
		$(call run_field,m31ext3,$(1),$(2)); \
		$(call run_field,goldilocksext2,$(1),$(2)); \
		$(call run_field,bn254,$(1),$(2)), \
		$(if $(filter $(FIELD_TYPE),$(FIELDS)), \
			$(call run_field,$(FIELD_TYPE),$(1),$(2)), \
			$(error Invalid FIELD_TYPE '$(FIELD_TYPE)'. Must be one of: $(FIELDS))))
endef

# `data` is an aggregate target that shares its name with the data/ directory,
# so it must be phony; otherwise make compares against the directory's mtime.
.PHONY: clean data test prepare-data profile mpi-profile mpi-test

# When circuit files are missing, guide user to run the prepare script.
data/keccak_%.gpu.circuit:
	@echo "Error: Circuit file '$@' is missing."
	@echo "Please run './prepare-data.sh' to generate necessary circuit files."
	@exit 1

# Check that all circuit data files exist (fails with guidance if one is missing)
data: $(CIRCUIT_FILES)
	@echo "All circuit data files are ready."

# Generate the circuit data files by running the preparation script
prepare-data:
	bash ./prepare-data.sh

# Functional run with 128 independent computations
test: data
	$(call run_expander,128,)

# Throughput benchmark with 8192 independent computations
profile: data
	$(call run_expander,8192,--enable-same-input --profile $(PROFILE_LEVEL))

# Functional run with MPI merge enabled
mpi-test: data
	$(call run_expander,128,--enable-mpi-merge)

# Throughput benchmark with 16384 computations and MPI merge enabled
mpi-profile: data
	$(call run_expander,16384,--enable-same-input --profile $(PROFILE_LEVEL) --enable-mpi-merge)

# Remove all generated circuit data
clean:
	rm -rf data
Binary file added gpu/expander-gpu
Binary file not shown.
28 changes: 28 additions & 0 deletions gpu/prepare-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Generate the keccak circuit and witness files consumed by expander-gpu.
#
# Run from the gpu/ directory. The resulting *.gpu.* files are left in
# ./data; the two helper repositories cloned along the way are removed again.

# Abort on the first failing command (or unset variable) so that a failed
# clone or build cannot leave later steps running in the wrong directory.
set -eu

# Create the data folder (no error if it already exists)
mkdir -p data
cd data

# Drop stale clones left behind by a previously interrupted run
rm -rf ExpanderCompilerCollection Expander

# Download the two repositories used to generate the circuit and witness for GPU
git clone git@github.com:PolyhedraZK/ExpanderCompilerCollection.git
git clone git@github.com:PolyhedraZK/Expander.git

# Use Expander Compiler to generate the circuit and witness for Expander
cd ExpanderCompilerCollection
cargo test --release keccak

# Move the generated data to Expander
mkdir -p ../Expander/data
cp expander_compiler/*.txt ../Expander/data
cd ../Expander

# Use Expander's GPU serialization to produce circuit and witness for GPU usage
git checkout gpu-expander
for field in m31ext3 goldilocks fr; do
    EXPANDER_GPU=1 RUSTFLAGS="-C target-cpu=native -C target-feature=+avx512f" cargo run --release --bin=gkr -- --circuit keccak --pcs Raw --threads 1 --field "$field"
done
mv data/*.gpu.* ..
cd ..

# Remove the two helper repositories
rm -rf ExpanderCompilerCollection Expander
cd ..
131 changes: 131 additions & 0 deletions gpu/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
<div align="center" style="width: 100%;">
<img
src="https://expander.polyhedra.network/assets/static/logo-with-text.16d5af29.svg"
alt="Expander Logo"
style="width: 400px; height: auto;"
/>
</div>


# Expander GPU Acceleration

Expander is a proof generation backend for Polyhedra Network. It aims to support fast proof generation.

Expander now includes a high-performance GPU backend powered by CUDA, designed to dramatically accelerate proof generation. This backend is optimized for NVIDIA GPUs and offers significant speedups, especially for complex circuits and large-scale computations.

### Key Features

- **Massive Parallelism**: Leverages the full power of modern GPUs to process thousands of proofs in parallel.
- **MPI Merge**: Introduces an innovative "MPI Merge" feature that can compress proofs from thousands of independent computations into a single, compact proof. In our tests, we've achieved a compression ratio of up to `16384:1`. This is particularly useful in scenarios with large batches of similar computations.
- **Broad Field Support**: The GPU backend supports multiple field types, including `BN254`, `Goldilocks`, and `M31`.

### System Requirements

- **NVIDIA GPU**: A CUDA-enabled NVIDIA GPU with compute capability 7.0+ is recommended.
- **CUDA Toolkit**: Version 12.5 or newer.
- **Compiler**: `clang` and `clang++`.
- **Build Tools**: `cmake` (version 3.18+) and `ninja`.

### Build Instructions

The current release of Expander-GPU is in binary form. Please contact us if you are interested in source code access.

## GPU Benchmarks

The GPU backend delivers substantial performance improvements over the CPU implementation. The following benchmarks were run on an NVIDIA GPU, showcasing the throughput for various configurations.

### Performance Results

| Field | Throughput (8192 proofs) | Throughput (16384 proofs, MPI Merged) |
|------------------|--------------------------|---------------------------------------|
| `m31ext3` | ~2788 proofs/sec | ~3040 computations/sec |
| `goldilocksext2` | ~2597 proofs/sec | ~2255 computations/sec |
| `bn254` | ~1313 proofs/sec | ~1525 computations/sec |

**Note on BN254 Performance**: The GPU acceleration is particularly impactful for the `BN254` field. Compared to our highly optimized AVX512 CPU backend running on an AMD 9950X3D, **the GPU implementation provides a 7-10x speedup**, achieving over 1500 merged computations per second. This makes Expander an ideal choice for ZK applications built on Ethereum-friendly curves.

### Running Benchmarks Manually and Profiling

You can reproduce these benchmarks using the `Makefile`:

```sh
# Run standard benchmark with 8192 parallel proofs
make profile

# Run benchmark with 16384 parallel proofs and MPI merge enabled
make mpi-profile

# Run standard benchmark with detailed profiling data
make profile PROFILE_LEVEL=2

# Run MPI-merge benchmark with detailed profiling data
make mpi-profile PROFILE_LEVEL=2
```

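You can customize the `FIELD_TYPE` and `PROFILE_LEVEL` variables in the `Makefile` to test different configurations. Note that `FIELD_TYPE` only takes effect when `DO_ALL` is set to something other than `1` (e.g. `make profile DO_ALL=0 FIELD_TYPE=bn254`); by default every supported field is run. You should see a detailed profiling report like the one below.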

```
====== GKR System Initialization ======
Parsed RZ0 Challenge: 0x128e207ced0a98b1401e2e521465544111847e131de192a5f527ecbd1611d6b0

GPU Memory Allocation Summary:
Circuit: 29.47 MB (30898000 B)
Transcript: 4.03 GB (4330817408 B)
Scratchpad: 14.25 GB (15303180288 B)
Total: 18.31 GB (19664895696 B)

MPI Merge Status:
MPI Length: 8192 (independent computations)
Number of Proofs: 8192 (final transcripts)
MPI Merge Enabled: NO

System Configuration:
Circuit Layers: 144 layers
MPI Length: 8192
Enable MPI Merge: false
Field Type: bn254
Fiat-Shamir Type: sha2-256
Max Input Variables: 13
Max Output Variables: 13

Prove Done! Final Claims:
vx_claim = [0x08d2107f3419f056dda4310fd9de72a8eca95840b26a20068d70262ea9495086]
vy_claim = [0x18e99e28f39df8da3cea05e6382991fa69fb57053d500112de6dab091267656c]

====== GKR Hierarchical Profiling Results (with GPU timing) ======
Function Name Call Count Total Time (s) Avg Time (ms) % of Total
---------------------------------------- ------------ --------------- --------------- ----------
Sumcheck 287 2.790811 9.724 49.89 %
- receive_challange 3513 1.292797 0.368 23.11 %
- poly_eval_at 3513 1.140716 0.325 20.39 %
- Fiat-shamir(sumcheck) 3513 0.329160 0.094 5.88 %
- Apply phase 2 coef 1754 0.021809 0.012 0.39 %
Prepare H(x) 144 1.603928 11.138 28.67 %
- eq_eval_at 287 0.682776 2.379 12.21 %
- eq_eval_combine 287 0.388053 1.352 6.94 %
- scatter_to_build_eq_buf 3504 0.241123 0.069 4.31 %
- scatter_to_first_element 574 0.034887 0.061 0.62 %
- build_hgx_mult_and_add 144 0.616053 4.278 11.01 %
- build_hgx_mult 143 0.376732 2.634 6.74 %
- build_hgx_add 144 0.237986 1.653 4.25 %
- acc_from_rx_to_rz0 142 0.379750 2.674 6.79 %
- memset_clear_x_vals 143 0.275147 1.924 4.92 %
Prepare H(y) 143 1.182382 8.268 21.14 %
- build_hgy_mult_only 143 0.463457 3.241 8.29 %
- memset_clear_y_vals 143 0.367486 2.570 6.57 %
Fiat-shamir(gkr) 717 0.016354 0.023 0.29 %
TOTAL - 5.593475 - 100.00%
=============================================

====== Expander-GPU Performance Metrics ======
Field element type: bn254
Fiat-shamir type: sha2-256
GKR proof size: 379232 bytes
GKR proof time: 5.594314 seconds
Proofs per second: 1464.34 proof/sec
```

## Acknowledgments

The code of Expander-GPU is derived from the [ICICLE project](https://github.com/ingonyama-zk/icicle).
We are grateful to the ICICLE team for their contributions to the community, providing efficient field element operations on GPU that enable high-performance cryptographic computations.
Loading
Loading