Make TorchAO cpp/Python extension

drisspg · web-flow · commit 7b37eb07c099 · 2025-02-18T10:37:50.000-08:00
Differential Revision: D69634772 Pull Request resolved: #1719
diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py
@@ -23,6 +23,7 @@
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
+    is_fbcode,
     is_sm_at_least_89,
 )
 
@@ -213,6 +214,8 @@ class TestAffineQuantizedBasic(TestCase):
     @common_utils.parametrize("device", COMMON_DEVICES)
     @common_utils.parametrize("dtype", COMMON_DTYPES)
     def test_flatten_unflatten(self, device, dtype):
+        if device == "cuda" and dtype == torch.bfloat16 and is_fbcode():
+            raise unittest.SkipTest("TODO: Failing for cuda + bfloat16 in fbcode")
         apply_quant_list = get_quantization_functions(False, True, device)
         for apply_quant in apply_quant_list:
             linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
diff --git a/test/quantization/test_marlin_qqq.py b/test/quantization/test_marlin_qqq.py
@@ -1,5 +1,4 @@
 import copy
-import unittest
 
 import pytest
 import torch
@@ -19,13 +18,9 @@
     MappingType,
     choose_qparams_and_quantize_affine_qqq,
 )
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_fbcode
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
 
 
-@unittest.skipIf(
-    is_fbcode(),
-    "Skipping the test in fbcode since we don't have TARGET file for kernels",
-)
 class TestMarlinQQQ(TestCase):
     def setUp(self):
         super().setUp()
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -18,12 +18,7 @@
 )
 from torchao.quantization.quant_primitives import choose_qparams_and_quantize_affine_qqq
 from torchao.sparsity.marlin import inject_24, marlin_24_workspace, pack_to_marlin_24
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, compute_max_diff, is_fbcode
-
-if is_fbcode():
-    pytest.skip(
-        "Skipping the test in fbcode since we don't have TARGET file for kernels"
-    )
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, compute_max_diff
 
 try:
     import torchao.ops
diff --git a/torchao/__init__.py b/torchao/__init__.py
@@ -9,7 +9,6 @@
     "ignore", message="Failed to initialize NumPy: No module named 'numpy'"
 )
 
-
 # We use this "hack" to set torchao.__version__ correctly
 # the version of ao is dependent on environment variables for multiple architectures
 # For local development this will default to whatever is version.txt
@@ -21,34 +20,28 @@
 except PackageNotFoundError:
     __version__ = "unknown"  # In case this logic breaks don't break the build
 
-_IS_FBCODE = (
-    hasattr(torch._utils_internal, "IS_FBSOURCE") and torch._utils_internal.IS_FBSOURCE
-)
-if not _IS_FBCODE:
-    try:
-        from pathlib import Path
-
-        so_files = list(Path(__file__).parent.glob("_C*.so"))
-        if len(so_files) > 0:
-            assert (
-                len(so_files) == 1
-            ), f"Expected one _C*.so file, found {len(so_files)}"
-            torch.ops.load_library(so_files[0])
-            from . import ops
-
-        # The following library contains CPU kernels from torchao/experimental
-        # They are built automatically by ao/setup.py if on an ARM machine.
-        # They can also be built outside of the torchao install process by
-        # running the script `torchao/experimental/build_torchao_ops.sh <aten|executorch>`
-        # For more information, see https://github.com/pytorch/ao/blob/main/torchao/experimental/docs/readme.md
-        experimental_lib = list(Path(__file__).parent.glob("libtorchao_ops_aten.*"))
-        if len(experimental_lib) > 0:
-            assert (
-                len(experimental_lib) == 1
-            ), f"Expected at most one libtorchao_ops_aten.* file, found {len(experimental_lib)}"
-            torch.ops.load_library(experimental_lib[0])
-    except:
-        logging.debug("Skipping import of cpp extensions")
+try:
+    from pathlib import Path
+
+    so_files = list(Path(__file__).parent.glob("_C*.so"))
+    if len(so_files) > 0:
+        assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
+        torch.ops.load_library(str(so_files[0]))
+        from . import ops
+
+    # The following library contains CPU kernels from torchao/experimental
+    # They are built automatically by ao/setup.py if on an ARM machine.
+    # They can also be built outside of the torchao install process by
+    # running the script `torchao/experimental/build_torchao_ops.sh <aten|executorch>`
+    # For more information, see https://github.com/pytorch/ao/blob/main/torchao/experimental/docs/readme.md
+    experimental_lib = list(Path(__file__).parent.glob("libtorchao_ops_aten.*"))
+    if len(experimental_lib) > 0:
+        assert (
+            len(experimental_lib) == 1
+        ), f"Expected at most one libtorchao_ops_aten.* file, found {len(experimental_lib)}"
+        torch.ops.load_library(str(experimental_lib[0]))
+except:
+    logging.debug("Skipping import of cpp extensions")
 
 from torchao.quantization import (
     autoquant,
@@ -64,6 +57,3 @@
     "testing",
     "ops",
 ]
-
-# test-pytorchbot
-# test-codev
diff --git a/torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s4s4.cu b/torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s4s4.cu
@@ -14,10 +14,14 @@ rowwise_scaled_linear_cutlass_s4s4(
               " for xq and ", wq.dtype(), " for wq is not supported");
 
   // Dispatch to appropriate kernel template.
-  using ElementA = cutlass::int4b_t;
-  using ElementB = cutlass::int4b_t;
-  return rowwise_scaled_linear_cutlass<ElementA, ElementB>(
+  #if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
+  // We get ElementA/ElementB types from the header
+  return rowwise_scaled_linear_cutlass<cutlass::int4b_t, cutlass::int4b_t>(
       xq, x_scale, wq, w_scale, bias);
+  #else
+    TORCH_CHECK(false, "CUTLASS kernels not built - rowwise_scaled_linear_cutlass_s4s4 not available");
+    return at::Tensor{};
+  #endif
 }
 
 TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
diff --git a/torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s8s4.cu b/torchao/csrc/cuda/rowwise_scaled_linear_cutlass/rowwise_scaled_linear_cutlass_s8s4.cu
@@ -1,5 +1,4 @@
 #include <torch/library.h>
-
 #include "rowwise_scaled_linear_cutlass.cuh"
 
 namespace torchao {
@@ -13,11 +12,16 @@ rowwise_scaled_linear_cutlass_s8s4(
               __func__, " : The input datatypes combination ", xq.dtype(),
               " for xq and ", wq.dtype(), " for wq is not supported");
 
-  // Dispatch to appropriate kernel template.
+#if defined(BUILD_ROWWISE_SCALED_LINEAR_CUTLASS)
+  // Define ElementA as int8_t since it's a standard type
   using ElementA = int8_t;
-  using ElementB = cutlass::int4b_t;
-  return rowwise_scaled_linear_cutlass<ElementA, ElementB>(
+  // ElementB comes from cutlass header
+  return rowwise_scaled_linear_cutlass<ElementA, cutlass::int4b_t>(
       xq, x_scale, wq, w_scale, bias);
+#else
+  TORCH_CHECK(false, "CUTLASS kernels not built - rowwise_scaled_linear_cutlass_s8s4 not available");
+  return at::Tensor{};
+#endif
 }
 
 TORCH_LIBRARY_IMPL(torchao, CUDA, m) {