Skip to content

Model shapes config #2036

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: bench-gpu-profiling
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion benchmarks/microbenchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,74 @@ Currently, quantization string is in same format as the one being passed in llam

### Model Types
- `linear`: Simple linear layer
- `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid
- `ln_linear_<activation>`: LayerNorm + Linear + Activation, where activation can be:
- `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid
- `ln_linear_relu`: LayerNorm + Linear + ReLU
- `ln_linear_leakyrelu`: LayerNorm + Linear + LeakyReLU
- `ln_linear_relu6`: LayerNorm + Linear + ReLU6
- `ln_linear_gelu`: LayerNorm + Linear + GELU
- `ln_linear_silu`: LayerNorm + Linear + SiLU
- `ln_linear_hardswish`: LayerNorm + Linear + Hardswish
- `transformer_block`: Transformer block with self-attention and MLP

### Device Options
- `cuda`: NVIDIA GPU
- `xpu`: Intel GPU
- `mps`: Apple Silicon GPU
- `cpu`: CPU fallback

### Shape Generation Options
- `custom`: Manually specify shapes as a list of [m, k, n] dimensions
```yaml
matrix_shapes:
- name: "custom"
shapes: [
[1024, 1024, 1024], # [m, k, n]
[2048, 4096, 1024]
]
```

- `llama`: Use LLaMa 2 70B single-node weight shapes (assumes fused attn.wqkv and ffn.w13)
- Generates shapes for: "attn.wqkv", "attn.w0", "ffn.w13", "ffn.w2"
```yaml
matrix_shapes:
- name: "llama"
```

- `pow2`: Generate shapes with dimensions that are powers of 2
- Parameters:
- `min_power`: Minimum power of 2 (default: 10, which is 1024)
- `max_power`: Maximum power of 2 (default: 14, which is 16,384)
```yaml
matrix_shapes:
- name: "pow2"
min_power: 10 # 2^10 = 1024
max_power: 12 # 2^12 = 4096
```

- `pow2_extended`: Generate shapes with dimensions that are powers of 2 and powers of 2 + half
- Parameters:
- `min_power`: Minimum power of 2 (default: 10, which is 1024)
- `max_power`: Maximum power of 2 (default: 14, which is 16,384)
```yaml
matrix_shapes:
- name: "pow2_extended"
min_power: 10 # Generates: 1024, 1536, 2048, 3072, etc.
max_power: 11
```

- `sweep`: Generate a sweep of shapes with different powers of 2 for M, K, N dimensions
- Parameters:
- `min_power`: Minimum power of 2 (default: 8, which is 256)
- `max_power`: Maximum power of 2 (default: 15, which is 32,768)
- Note: This generates all combinations of M, K, N dimensions, which can be a large number of shapes
```yaml
matrix_shapes:
- name: "sweep"
min_power: 8 # 2^8 = 256
max_power: 9 # 2^9 = 512
```

## Output

Results are saved to a CSV file in the specified output directory
Expand Down
6 changes: 4 additions & 2 deletions benchmarks/microbenchmarks/benchmark_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@
BenchmarkConfig,
BenchmarkResult,
clean_caches,
create_model_and_input,
model_inference_time_in_ms,
string_to_config,
)
from torchao.quantization import quantize_
from torchao.sparsity.sparse_api import sparsify_
from torchao.testing.model_architectures import (
create_model_and_input_data,
)


def run(config: BenchmarkConfig) -> BenchmarkResult:
Expand All @@ -38,7 +40,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
# Create output directory if it doesn't exist
Path(config.output_dir).mkdir(parents=True, exist_ok=True)

base_model, input_data = create_model_and_input(
base_model, input_data = create_model_and_input_data(
config.model_type,
config.m,
config.k,
Expand Down
43 changes: 42 additions & 1 deletion benchmarks/microbenchmarks/benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,50 @@ def get_shapes_for_config(
name = shape_config["name"]
if name == "custom":
shapes.extend([(name, shape) for shape in shape_config["shapes"]])
elif name == "llama":
# LLaMa 2 70B single-node weight shapes
# assumes fused attn.wqkv and ffn.w13
bsz, seq_len = 4, 4096
M = bsz * seq_len
llama_shapes = {
"attn.wqkv": (M, 8192, 1280),
"attn.w0": (M, 1024, 8192),
"ffn.w13": (M, 8192, 7168),
"ffn.w2": (M, 3584, 8192),
}
shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()])
elif name == "pow2":
# Generate shapes with dimensions that are powers of 2
min_power_of_2 = shape_config.get("min_power", 10) # 1024
max_power_of_2 = shape_config.get("max_power", 14) # 16,384
for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
val = 2**power_of_2
shapes.append((f"{name}_{idx}", [val, val, val]))
elif name == "pow2_extended":
# Generate shapes with dimensions that are powers of 2 and powers of 2 + half
min_power_of_2 = shape_config.get("min_power", 10) # 1024
max_power_of_2 = shape_config.get("max_power", 14) # 16,384
for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
val1 = 2**power_of_2
val2 = 2**power_of_2 + 2 ** (power_of_2 - 1)
shapes.append((f"{name}_{idx*2}", [val1, val1, val1]))
shapes.append((f"{name}_{idx*2+1}", [val2, val2, val2]))
elif name == "sweep":
# Generate a sweep of shapes with different powers of 2 for M, K, N
min_p2 = shape_config.get("min_power", 8) # 256
max_p2 = shape_config.get("max_power", 15) # 32,768
counter = 0
for M_p2 in range(min_p2, max_p2 + 1):
M = 2**M_p2
for K_p2 in range(min_p2, max_p2 + 1):
K = 2**K_p2
for N_p2 in range(min_p2, max_p2 + 1):
N = 2**N_p2
shapes.append((f"{name}_{counter}", [M, K, N]))
counter += 1
else:
raise NotImplementedError(
f"Shape config {name} not supported. Currently only supports custom shapes."
f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep."
)
return shapes

Expand Down
45 changes: 45 additions & 0 deletions benchmarks/microbenchmarks/test/benchmark_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,48 @@ model_params:
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model

- name: "ln_linear_sigmoid_cuda"
matrix_shapes:
- name: "custom"
shapes: [
[2048, 4096, 1024],
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "ln_linear_sigmoid"
enable_profiler: true

- name: "bf16_transformer_block"
matrix_shapes:
- name: "custom"
shapes: [
[2048, 4096, 1024], # For transformer_block, k is the hidden dimension
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition)
enable_profiler: true

- name: "large_bf16_ln_linear"
matrix_shapes:
- name: "llama" # Example of using LLaMa shapes
- name: "pow2" # Example of using power of 2 shapes
min_power: 10 # 1024
max_power: 12 # 4096
- name: "pow2_extended" # Example of using extended power of 2 shapes
min_power: 10 # 1024
max_power: 11 # 2048
    - name: "sweep" # Example of using sweep shapes (kept to a small power range here, since sweep generates many shapes)
min_power: 8 # 256
max_power: 9 # 512
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model
2 changes: 1 addition & 1 deletion benchmarks/microbenchmarks/test/test_benchmark_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
)
from benchmarks.microbenchmarks.utils import (
BenchmarkConfig,
ToyLinearModel,
)
from torchao.testing.model_architectures import ToyLinearModel


class TestBenchmarkProfiler(unittest.TestCase):
Expand Down
60 changes: 60 additions & 0 deletions benchmarks/microbenchmarks/test/test_benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,72 @@ def tearDown(self):
shutil.rmtree(self.temp_dir)

def test_get_shapes_for_config(self):
    """Exercise every supported shape-generation strategy of get_shapes_for_config."""
    # Custom shapes are passed through verbatim from the config entry.
    custom = get_shapes_for_config(
        self.test_config["model_params"][0]["matrix_shapes"]
    )
    self.assertEqual(len(custom), 1)
    self.assertEqual(custom[0], ("custom", [1024, 1024, 1024]))

    # LLaMa preset: exactly four named weight shapes.
    llama = get_shapes_for_config([{"name": "llama"}])
    self.assertEqual(len(llama), 4)
    for prefix in (
        "llama_attn.wqkv",
        "llama_attn.w0",
        "llama_ffn.w13",
        "llama_ffn.w2",
    ):
        self.assertTrue(any(n.startswith(prefix) for n, _ in llama))

    # pow2: one square shape per exponent in [min_power, max_power].
    pow2 = get_shapes_for_config(
        [{"name": "pow2", "min_power": 10, "max_power": 12}]
    )
    expected_pow2 = [
        ("pow2_0", [1024, 1024, 1024]),  # 2^10
        ("pow2_1", [2048, 2048, 2048]),  # 2^11
        ("pow2_2", [4096, 4096, 4096]),  # 2^12
    ]
    self.assertEqual(len(pow2), 3)  # powers 10, 11, 12
    for got, want in zip(pow2, expected_pow2):
        self.assertEqual(got, want)

    # pow2_extended: each exponent yields 2^p and 2^p + 2^(p-1).
    extended = get_shapes_for_config(
        [{"name": "pow2_extended", "min_power": 10, "max_power": 11}]
    )
    expected_extended = [
        ("pow2_extended_0", [1024, 1024, 1024]),  # 2^10
        ("pow2_extended_1", [1536, 1536, 1536]),  # 2^10 + 2^9
        ("pow2_extended_2", [2048, 2048, 2048]),  # 2^11
        ("pow2_extended_3", [3072, 3072, 3072]),  # 2^11 + 2^10
    ]
    self.assertEqual(len(extended), 4)  # 2 powers of 2, each with 2 variants
    for got, want in zip(extended, expected_extended):
        self.assertEqual(got, want)

    # sweep: full M x K x N cross-product -> 2 * 2 * 2 = 8 shapes here.
    sweep = get_shapes_for_config(
        [{"name": "sweep", "min_power": 8, "max_power": 9}]
    )
    self.assertEqual(len(sweep), 8)
    for shape_name, dims in sweep:
        self.assertTrue(shape_name.startswith("sweep_"))
        self.assertEqual(len(dims), 3)  # [M, K, N]
        for dim in dims:
            self.assertIn(dim, (256, 512))  # 2^8 or 2^9

def test_get_param_combinations(self):
model_param = self.test_config["model_params"][0]
shapes, params = get_param_combinations(model_param)
Expand Down
18 changes: 10 additions & 8 deletions benchmarks/microbenchmarks/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@
BlockSparseWeightConfig,
Float8DynamicActivationFloat8SemiSparseWeightConfig,
Int4WeightOnlyConfig,
LNLinearSigmoid,
SemiSparseWeightConfig,
ToyLinearModel,
clean_caches,
create_model_and_input,
generate_results_csv,
get_default_device,
string_to_config,
)
from torchao.testing.model_architectures import (
LNLinearActivationModel,
ToyLinearModel,
create_model_and_input_data,
)


class TestUtils(unittest.TestCase):
Expand Down Expand Up @@ -153,7 +155,7 @@ def test_toy_linear_model(self):
self.assertEqual(out.dtype, torch.float32)

def test_ln_linear_sigmoid(self):
model = LNLinearSigmoid(fc_dim1=64, fc_dim2=32, dtype=torch.float32)
model = LNLinearActivationModel(fc_dim1=64, fc_dim2=32, dtype=torch.float32)
x = torch.randn(16, 64)
out = model(x)
self.assertEqual(out.shape, (16, 32))
Expand All @@ -162,9 +164,9 @@ def test_ln_linear_sigmoid(self):
torch.all((out >= 0) & (out <= 1))
) # Check sigmoid output range

def test_create_model_and_input(self):
def test_create_model_and_input_data(self):
m, k, n = 16, 64, 32
model, input_data = create_model_and_input(
model, input_data = create_model_and_input_data(
model_type="linear",
m=m,
k=k,
Expand All @@ -175,15 +177,15 @@ def test_create_model_and_input(self):
self.assertIsInstance(model, ToyLinearModel)
self.assertEqual(input_data.shape, (m, k))

model, input_data = create_model_and_input(
model, input_data = create_model_and_input_data(
model_type="ln_linear_sigmoid",
m=m,
k=k,
n=n,
high_precision_dtype=torch.float32,
device="cpu",
)
self.assertIsInstance(model, LNLinearSigmoid)
self.assertIsInstance(model, LNLinearActivationModel)
self.assertEqual(input_data.shape, (m, k))

def test_generate_results_csv(self):
Expand Down
Loading
Loading