@@ -21,9 +21,7 @@ def forward(self, x):
 
 
 class LNLinearActivationModel(nn.Module):
-    def __init__(
-        self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid", device=None
-    ):
+    def __init__(self, fc_dim1, fc_dim2, dtype=torch.bfloat16, activation="sigmoid"):
         super().__init__()
 
         activation = activation.lower()
@@ -41,7 +39,7 @@ def __init__(
             raise ValueError(f"Unsupported activation: {activation}")
 
         self.ln = nn.LayerNorm(fc_dim1, elementwise_affine=False)
-        self.fc = nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype=dtype, device=device)
+        self.fc = nn.Linear(fc_dim1, fc_dim2, bias=False).to(dtype=dtype)
         self.activation = activation_map[activation]
 
     def forward(self, x):
@@ -50,6 +48,20 @@ def forward(self, x):
         return self.activation(x)
 
 
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+
 class TransformerBlock(torch.nn.Module):
     def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16):
         super().__init__()
@@ -72,8 +84,8 @@ def __init__(self, hidden_dim, num_heads=8, mlp_ratio=4, dtype=torch.bfloat16):
         )
 
         # Layer norms
-        self.norm1 = nn.RMSNorm(hidden_dim, dtype=dtype)
-        self.norm2 = nn.RMSNorm(hidden_dim, dtype=dtype)
+        self.norm1 = RMSNorm(hidden_dim).to(dtype)
+        self.norm2 = RMSNorm(hidden_dim).to(dtype)
 
         # Activation
         self.activation = torch.nn.GELU()
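
Not part of the commit, just for review context: a minimal sanity-check sketch comparing the hand-rolled RMSNorm added here against the built-in nn.RMSNorm it replaces in TransformerBlock. The class body is copied from the diff above; the shapes, eps value, and tolerances are illustrative assumptions.

# Sanity-check sketch (not part of the diff); shapes, eps, and tolerances are illustrative.
import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    # Copied from the class added in this diff.
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


dim = 64
x = torch.randn(8, dim)

custom = RMSNorm(dim, eps=1e-5)
builtin = nn.RMSNorm(dim, eps=1e-5)  # weight is also initialized to ones

# Both compute x * rsqrt(mean(x^2) + eps) * weight, so outputs should agree
# up to floating-point rounding in float32.
torch.testing.assert_close(custom(x), builtin(x), rtol=1e-5, atol=1e-5)

Note that the custom module upcasts to float32 inside _norm before casting back with type_as, so the reduction stays in full precision even when TransformerBlock casts the module to bfloat16 via .to(dtype), which only converts the weight parameter.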