README.md (+13 −13)
@@ -29,16 +29,16 @@ For inference, we have the option of
 ```python
 from torchao.quantization.quant_api import (
     quantize_,
-    int8_dynamic_activation_int8_weight,
-    int4_weight_only,
-    int8_weight_only
+    Int8DynamicActivationInt8WeightConfig,
+    Int4WeightOnlyConfig,
+    Int8WeightOnlyConfig
 )
-quantize_(m, int4_weight_only())
+quantize_(m, Int4WeightOnlyConfig())
 ```

-For gpt-fast `int4_weight_only()` is the best option at bs=1 as it **2x the tok/s and reduces the VRAM requirements by about 65%** over a torch.compiled baseline.
+For gpt-fast, `Int4WeightOnlyConfig()` is the best option at bs=1, as it **doubles the tok/s and reduces the VRAM requirements by about 65%** over a torch.compiled baseline.

-If you don't have enough VRAM to quantize your entire model on GPU and you find CPU quantization to be too slow then you can use the device argument like so `quantize_(model, int8_weight_only(), device="cuda")` which will send and quantize each layer individually to your GPU.
+If you don't have enough VRAM to quantize your entire model on the GPU and you find CPU quantization too slow, you can use the device argument like so: `quantize_(model, Int8WeightOnlyConfig(), device="cuda")`. This sends each layer to your GPU individually and quantizes it there.

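As a rough end-to-end sketch of that workflow (the small `nn.Sequential` model below is a stand-in for illustration, not from the diff):

```python
import torch
from torchao.quantization.quant_api import quantize_, Int8WeightOnlyConfig

# Stand-in model kept on CPU, e.g. because the unquantized weights don't fit in VRAM.
model = torch.nn.Sequential(
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 4096),
).to(torch.bfloat16)

# device="cuda" moves one layer at a time to the GPU and quantizes it there, so the
# full-precision model never needs to fit in GPU memory all at once.
quantize_(model, Int8WeightOnlyConfig(), device="cuda")
```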
 If you see slowdowns with any of these techniques or you're unsure which option to use, consider using [autoquant](./torchao/quantization/README.md#autoquantization) which will automatically profile layers and pick the best way to quantize each layer.

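For reference, a minimal autoquant sketch, assuming the `torchao.autoquant` entry point described in the linked quantization README (the toy model and shapes are illustrative):

```python
import torch
import torchao

# Toy stand-in for an inference model, in bf16 on the GPU.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)

# autoquant wraps the compiled model, profiles each layer on the inputs it sees,
# and picks the fastest quantization option (or none) per layer.
model = torchao.autoquant(torch.compile(model, mode="max-autotune"))

# Running a representative input triggers profiling and finalizes the per-layer choices.
model(torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16))
```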
@@ -63,27 +63,27 @@ Post-training quantization can result in a fast and compact model, but may also
@@ -139,7 +139,7 @@ The best example we have combining the composability of lower bit dtype with com

 We've added support for authoring and releasing [custom ops](./torchao/csrc/) that do not graph break with `torch.compile()`, so if you love writing kernels but hate packaging them so they work on all operating systems and CUDA versions, we'd love to accept contributions for your custom ops. We have a few examples you can follow:

-1.[fp6](torchao/dtypes/floatx) for 2x faster inference over fp16 with an easy to use API `quantize_(model, fpx_weight_only(3, 2))`
+1. [fp6](torchao/dtypes/floatx) for 2x faster inference over fp16 with an easy-to-use API: `quantize_(model, FPXWeightOnlyConfig(3, 2))`
 2. [2:4 Sparse Marlin GEMM](https://github.com/pytorch/ao/pull/733) 2x speedups for FP16xINT4 kernels even at batch sizes up to 256
 3. [int4 tinygemm unpacker](https://github.com/pytorch/ao/pull/415) which makes it easier to switch quantized backends for inference
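To illustrate the no-graph-break custom-op idea from the paragraph above: the torchao examples are C++/CUDA extensions, but the same registration contract can be sketched with PyTorch's Python custom-op API (assumes PyTorch 2.4+; `mylib::scaled_add` is a made-up toy op, not a torchao op):

```python
import torch
from torch.library import custom_op

# A toy custom op registered through the Python API; the torchao examples do the
# equivalent in C++/CUDA. Registering it as a proper op is what lets torch.compile
# trace through call sites without graph breaks.
@custom_op("mylib::scaled_add", mutates_args=())
def scaled_add(x: torch.Tensor, y: torch.Tensor, scale: float) -> torch.Tensor:
    # Real implementation; in practice this could dispatch to a custom CUDA kernel.
    return x + scale * y

@scaled_add.register_fake
def _(x, y, scale):
    # Shape/dtype-only "fake" implementation so the compiler can trace without
    # running the real kernel.
    return torch.empty_like(x)

@torch.compile(fullgraph=True)
def f(x, y):
    return scaled_add(x, y, 0.5)

print(f(torch.randn(4), torch.randn(4)))
```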
+or whichever other files you'd like to use for study. For example, you may consider the Segment Anything Video (SA-V) [Dataset](https://github.com/facebookresearch/sam2/tree/main/sav_dataset#download-the-dataset).
+
+The experimental results will then be saved under `output_folder` in result.csv
+
+# Reproducing experiments on Modal
+
+For this you can run `modal_experiments.sh` afterwards, but you'll want to run the experiments locally first to produce the meta annotations and the exported ahead-of-time compiled binaries.
+
+# Using the server locally
 ## Example curl command
 ```
 curl -X POST http://127.0.0.1:5000/upload -F 'image=@/path/to/file.jpg' --output path/to/output.png