Skip to content

Commit 1eb4d33

Browse files
authored
Migrate to int args (#1846)
* init * up * up * lint * up * lint * init * up * up * lint * up * up * up * up
1 parent ed6ec9c commit 1eb4d33

24 files changed

+117
-563
lines changed

.github/workflows/torchao_experimental_test.yml

+10-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ jobs:
3333
- name: Install requirements
3434
run: |
3535
conda activate venv
36-
pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu"
36+
# Install executorch first because it installs its own version
37+
# of torch and torchao, which we do not want to use
38+
pip install executorch
39+
pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
3740
pip install numpy
3841
pip install pytest
3942
pip install parameterized
@@ -57,6 +60,12 @@ jobs:
5760
sh build_and_run_tests.sh
5861
rm -rf /tmp/cmake-out
5962
popd
63+
- name: ET ops build
64+
run: |
65+
conda activate venv
66+
pushd torchao/experimental
67+
sh build_torchao_ops.sh executorch
68+
popd
6069
6170
test-mps-ops:
6271
strategy:

torchao/experimental/build_torchao_ops.sh

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ export CMAKE_OUT=cmake-out
2121
cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
2222
-DCMAKE_INSTALL_PREFIX=${CMAKE_OUT} \
2323
-DTORCHAO_BUILD_EXECUTORCH_OPS="${TORCHAO_BUILD_EXECUTORCH_OPS}" \
24+
-DTORCHAO_BUILD_CPU_AARCH64=ON \
2425
-S . \
2526
-B ${CMAKE_OUT}
2627
cmake --build ${CMAKE_OUT} -j 16 --target install --config Release

torchao/experimental/ops/embedding_xbit/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ if(TORCHAO_BUILD_EXECUTORCH_OPS)
2727
# libextension_threadpool.a
2828
# libcpuinfo.a
2929
# libpthreadpool.a
30+
if(NOT DEFINED EXECUTORCH_INCLUDE_DIRS AND NOT DEFINED EXECUTORCH_LIBRARIES)
31+
message(WARNING "EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES are not defined. Looking for ExecuTorch.")
32+
find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
33+
endif()
3034
add_library(torchao_ops_embedding_xbit_executorch OBJECT
3135
op_embedding_xbit_executorch.cpp
3236
)

torchao/experimental/ops/linear_8bit_act_xbit_weight/CMakeLists.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,15 @@ if(TORCHAO_BUILD_EXECUTORCH_OPS)
4141
# libextension_threadpool.a
4242
# libcpuinfo.a
4343
# libpthreadpool.a
44+
if(NOT DEFINED EXECUTORCH_INCLUDE_DIRS AND NOT DEFINED EXECUTORCH_LIBRARIES)
45+
message(WARNING "EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES are not defined. Looking for ExecuTorch.")
46+
find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
47+
endif()
4448
# find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
45-
file(GLOB _SRCS "${CMAKE_CURRENT_SOURCE_DIR}/op_linear_8bit_act_xbit_weight_executorch/*.cpp")
49+
# file(GLOB _SRCS "${CMAKE_CURRENT_SOURCE_DIR}/op_linear_8bit_act_xbit_weight_executorch/*.cpp")
4650
add_library(torchao_ops_linear_8bit_act_xbit_weight_executorch OBJECT
4751
linear_8bit_act_xbit_weight.cpp
48-
${_SRCS}
52+
op_linear_8bit_act_xbit_weight_executorch.cpp
4953
)
5054
target_link_torchao_parallel_backend(torchao_ops_linear_8bit_act_xbit_weight_executorch executorch)
5155
target_include_directories(torchao_ops_linear_8bit_act_xbit_weight_executorch PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h

+15-38
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,7 @@ Tensor pack_weights_cpu(const Tensor &weight_qvals, const Tensor &weight_scales,
8989
template <int weight_nbit>
9090
Tensor pack_weights_without_zeros_cpu(
9191
const Tensor &weight_qvals, const Tensor &weight_scales,
92-
// TODO(T200095131): convert to int64_t when supported by AOTI
93-
// group_size is a tensor with size (0, group_size)
94-
const Tensor &group_size_tensor) {
95-
int64_t group_size = group_size_tensor.size(1);
92+
const int64_t& group_size) {
9693
return pack_weights_cpu<weight_nbit,
9794
/*has_weight_zeros*/ false,
9895
/*has_bias*/ false>(weight_qvals, weight_scales,
@@ -105,10 +102,8 @@ template <int weight_nbit>
105102
Tensor pack_weights_with_zeros_cpu(
106103
const Tensor &weight_qvals, const Tensor &weight_scales,
107104
const Tensor &weight_zeros,
108-
// TODO(T200095131): convert to int64_t when supported by AOTI
109-
// group_size is a meta tensor with size (group_size)
110-
const Tensor &group_size_tensor) {
111-
int64_t group_size = group_size_tensor.size(1);
105+
const int64_t& group_size
106+
) {
112107
return pack_weights_cpu<weight_nbit,
113108
/*has_weight_zeros*/ true,
114109
/*has_bias*/ false>(weight_qvals, weight_scales,
@@ -145,10 +140,8 @@ Tensor pack_weights_meta(const Tensor &weight_qvals,
145140
template <int weight_nbit>
146141
Tensor pack_weights_without_zeros_meta(
147142
const Tensor &weight_qvals, const Tensor &weight_scales,
148-
// TODO(T200095131): convert to int64_t when supported by AOTI
149-
// group_size is a meta tensor with size (group_size)
150-
const Tensor &group_size_tensor) {
151-
int64_t group_size = group_size_tensor.size(1);
143+
const int64_t& group_size
144+
) {
152145
return pack_weights_meta<weight_nbit,
153146
/*has_weight_zeros*/ false,
154147
/*has_bias*/ false>(weight_qvals, weight_scales,
@@ -161,10 +154,8 @@ template <int weight_nbit>
161154
Tensor pack_weights_with_zeros_meta(
162155
const Tensor &weight_qvals, const Tensor &weight_scales,
163156
const Tensor &weight_zeros,
164-
// TODO(T200095131): convert to int64_t when supported by AOTI
165-
// group_size is a meta tensor with size (group_size)
166-
const Tensor &group_size_tensor) {
167-
int64_t group_size = group_size_tensor.size(1);
157+
const int64_t& group_size
158+
) {
168159
return pack_weights_meta<weight_nbit,
169160
/*has_weight_zeros*/ true,
170161
/*has_bias*/ false>(weight_qvals, weight_scales,
@@ -176,14 +167,8 @@ Tensor pack_weights_with_zeros_meta(
176167
template <int weight_nbit, bool has_weight_zeros>
177168
Tensor
178169
linear_out_cpu(const Tensor &activations, const Tensor &packed_weights,
179-
// TODO(T200095131): convert n_tensor, k_tensor,
180-
// group_size_tensor to int64_t when supported by AOTI Currently
181-
// they are tensors with size equal to (0, the int they wrap)
182-
const Tensor &group_size_tensor, const Tensor &n_tensor,
183-
const Tensor &k_tensor, Tensor &out) {
184-
int n = n_tensor.size(1);
185-
int k = k_tensor.size(1);
186-
int group_size = group_size_tensor.size(1);
170+
const int64_t& group_size, const int64_t& n,
171+
const int64_t& k, Tensor &out) {
187172
TORCHAO_CHECK(n >= 1, "n must be >= 1");
188173
TORCHAO_CHECK(k >= 1, "k must be >= 1");
189174
TORCHAO_CHECK(group_size >= 1, "group_size must be >= 1");
@@ -261,15 +246,12 @@ linear_out_cpu(const Tensor &activations, const Tensor &packed_weights,
261246
template <int weight_nbit, bool has_weight_zeros>
262247
Tensor
263248
linear_cpu(const Tensor &activations, const Tensor &packed_weights,
264-
// TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
265-
// int64_t when supported by AOTI Currently they are tensors with
266-
// size equal to (0, the int they wrap)
267-
const Tensor &group_size_tensor, const Tensor &n_tensor,
268-
const Tensor &k_tensor) {
249+
const int64_t &group_size, const int64_t &n,
250+
const int64_t &k) {
269251
Tensor output_tensor = torch::empty({}, torch::kFloat32);
270252
linear_out_cpu<weight_nbit, has_weight_zeros>(activations, packed_weights,
271-
group_size_tensor, n_tensor,
272-
k_tensor, output_tensor);
253+
group_size, n,
254+
k, output_tensor);
273255
return output_tensor;
274256
}
275257
#endif // USE_ATEN
@@ -278,13 +260,8 @@ linear_cpu(const Tensor &activations, const Tensor &packed_weights,
278260
template <int weight_nbit, bool has_weight_zeros>
279261
Tensor linear_meta(
280262
const Tensor &activations, const Tensor &packed_weights,
281-
// TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
282-
// int64_t when supported by AOTI
283-
// Currently they are tensors with size equal to (0, the int they wrap)
284-
const Tensor &group_size_tensor, const Tensor &n_tensor,
285-
const Tensor &k_tensor) {
286-
int n = n_tensor.size(1);
287-
int k = k_tensor.size(1);
263+
const int64_t &group_size, const int64_t &n,
264+
const int64_t &k) {
288265
TORCHAO_CHECK(n >= 1, "n must be >= 1");
289266
TORCHAO_CHECK(k >= 1, "k must be >= 1");
290267

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,22 @@
99
#define DEFINE_OP(weight_nbit) \
1010
m.def( \
1111
"_pack_8bit_act_" #weight_nbit \
12-
"bit0zp_weight(Tensor weight_qvals, Tensor weight_scales, Tensor group_size) -> Tensor"); \
12+
"bit0zp_weight(Tensor weight_qvals, Tensor weight_scales, int group_size) -> Tensor"); \
1313
m.def( \
1414
"_pack_8bit_act_" #weight_nbit \
15-
"bit_weight(Tensor weight_qvals, Tensor weight_scales, Tensor weight_zeros, Tensor group_size) -> Tensor"); \
15+
"bit_weight(Tensor weight_qvals, Tensor weight_scales, Tensor weight_zeros, int group_size) -> Tensor"); \
1616
m.def( \
1717
"_linear_8bit_act_" #weight_nbit \
18-
"bit0zp_weight(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k) -> Tensor"); \
18+
"bit0zp_weight(Tensor activations, Tensor packed_weights, int group_size, int n, int k) -> Tensor"); \
1919
m.def( \
2020
"_linear_8bit_act_" #weight_nbit \
21-
"bit_weight(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k) -> Tensor"); \
21+
"bit_weight(Tensor activations, Tensor packed_weights, int group_size, int n, int k) -> Tensor"); \
2222
m.def( \
2323
"_linear_8bit_act_" #weight_nbit \
24-
"bit0zp_weight.out(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k, *, Tensor(a!) out) -> Tensor(a!)"); \
24+
"bit0zp_weight.out(Tensor activations, Tensor packed_weights, int group_size, int n, int k, *, Tensor(a!) out) -> Tensor(a!)"); \
2525
m.def( \
2626
"_linear_8bit_act_" #weight_nbit \
27-
"bit_weight.out(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k, *, Tensor(a!) out) -> Tensor(a!)")
27+
"bit_weight.out(Tensor activations, Tensor packed_weights, int group_size, int n, int k, *, Tensor(a!) out) -> Tensor(a!)")
2828

2929
#define DEFINE_CPU_IMPL(weight_nbit) \
3030
m.impl( \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#include <torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h>
2+
3+
#define DEFINE_OP(weight_nbit) \
4+
Tensor _op_out_0zp_##weight_nbit( \
5+
RuntimeContext &ctx, const Tensor &activations, \
6+
const Tensor &packed_weights, const int64_t &group_size, \
7+
const int64_t &n, const int64_t &k, Tensor &out) { \
8+
(void)ctx; \
9+
linear_out_cpu<weight_nbit, false>(activations, packed_weights, \
10+
group_size, n, k, out); \
11+
return out; \
12+
} \
13+
Tensor _op_out_zp_##weight_nbit( \
14+
RuntimeContext &ctx, const Tensor &activations, \
15+
const Tensor &packed_weights, const int64_t &group_size, \
16+
const int64_t &n, const int64_t &k, Tensor &out) { \
17+
(void)ctx; \
18+
linear_out_cpu<weight_nbit, true>(activations, packed_weights, group_size, \
19+
n, k, out); \
20+
return out; \
21+
}
22+
23+
#define REGISTER_0ZP(weight_nbit) \
24+
EXECUTORCH_LIBRARY(torchao, \
25+
"_linear_8bit_act_" #weight_nbit "bit0zp_weight.out", \
26+
_op_out_0zp_##weight_nbit)
27+
28+
#define REGISTER_ZP(weight_nbit) \
29+
EXECUTORCH_LIBRARY(torchao, \
30+
"_linear_8bit_act_" #weight_nbit "bit_weight.out", \
31+
_op_out_zp_##weight_nbit)
32+
33+
// This looks a bit ridiculous, but I could not get it to compile with two
34+
// EXECUTORCH_LIBRARY nested inside DEFINE_OP
35+
DEFINE_OP(1)
36+
REGISTER_0ZP(1);
37+
REGISTER_ZP(1);
38+
39+
DEFINE_OP(2)
40+
REGISTER_0ZP(2);
41+
REGISTER_ZP(2);
42+
43+
DEFINE_OP(3)
44+
REGISTER_0ZP(3);
45+
REGISTER_ZP(3);
46+
47+
DEFINE_OP(4)
48+
REGISTER_0ZP(4);
49+
REGISTER_ZP(4);
50+
51+
DEFINE_OP(5)
52+
REGISTER_0ZP(5);
53+
REGISTER_ZP(5);
54+
55+
DEFINE_OP(6)
56+
REGISTER_0ZP(6);
57+
REGISTER_ZP(6);
58+
59+
DEFINE_OP(7)
60+
REGISTER_0ZP(7);
61+
REGISTER_ZP(7);
62+
63+
DEFINE_OP(8)
64+
REGISTER_0ZP(8);
65+
REGISTER_ZP(8);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w1s.cpp

-29
This file was deleted.

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w1sz.cpp

-29
This file was deleted.

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w2s.cpp

-29
This file was deleted.

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w2sz.cpp

-29
This file was deleted.

0 commit comments

Comments (0)