From 38b50b415d5d7de9c31f777163f9e5b4246c7f73 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 10 Jan 2025 15:27:45 -0800 Subject: [PATCH 01/12] [super ugly not working code] use shim.h instead of Tensor --- .../tensor_core_tiled_layout.cu | 303 +++++++++++------- torchao/csrc/ignore_this.txt | 18 ++ 2 files changed, 202 insertions(+), 119 deletions(-) create mode 100644 torchao/csrc/ignore_this.txt diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index ea0f24c202..bfca09edcf 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,10 +1,12 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include -#include +// #include +// #include #include +#include #include +#include +#include #include template @@ -15,18 +17,18 @@ constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { } constexpr int32_t kWarpSize = 32; -//Simple data structure to represent 4 pairs of bfloat16s, used for vectorized dequantization -//https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 -struct __align__(16) bf16x2x4 { - __nv_bfloat162 vals[4]; -}; +// Simple data structure to represent 4 pairs of bfloat16s, used for vectorized +// dequantization +// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 +struct __align__(16) bf16x2x4 { __nv_bfloat162 vals[4]; }; -//Copied from https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 +// Copied from +// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { bf16x2x4 result; constexpr int kElements = 8; - uint32_t* h = reinterpret_cast(&result); + uint32_t *h = reinterpret_cast(&result); uint32_t const source_i4s = source; // First, we extract the i4s and construct an intermediate fp16 number. @@ -44,10 +46,10 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { for (int ii = 1; ii < kElements / 2; ++ii) { i4s >>= 4; // or is it 8? // (i4s & 0x000f000f) | 0x43004300 - asm volatile( - "lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[ii]) - : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[ii]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), + "n"(immLut)); } // This is the BF16 {-136, -136} represented as an integer. 
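// [Editorial note, not part of the patch; hedged sketch] What the constants in
// this hunk do: OR-ing a 4-bit nibble v into the low mantissa bits of the bf16
// pattern 0x4300 (the value 128.0f) produces the bf16 value 128 + v, since the
// mantissa LSB has weight 1 at that exponent. The {-136, -136} constant below
// is then (in the upstream int4mm.cu code) applied with an fma against 1.0 so
// that the result becomes v - 8, mapping unsigned [0, 15] onto signed [-8, 7].
// Scalar reference of the same arithmetic, for illustration only:
//   float dequant_nibble(uint32_t v /* 0..15 */) {
//     return (128.0f + static_cast<float>(v)) - 136.0f;  // == v - 8
//   }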
@@ -73,8 +75,9 @@ template __global__ void _dequantize_int4_kernel( const at::PackedTensorAccessor32 in, at::PackedTensorAccessor32 out, - std::optional> scales_and_zeros = std::nullopt) -{ + std::optional> + scales_and_zeros = std::nullopt) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -88,7 +91,8 @@ __global__ void _dequantize_int4_kernel( // 8 k-tile values, 4 per m16n8k16 mma.sync operand B // int32_t ks[8]; - //Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 contiguous values) + // Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 + // contiguous values) int32_t ks[4]; // Store address base offset @@ -97,9 +101,9 @@ __global__ void _dequantize_int4_kernel( // Unpack 2 k-tiles at a time since min pack size is InnerKTiles = 2 #pragma unroll for (int innerKTile = 0; innerKTile < InnerKTiles; innerKTile += 2) { - //Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 contiguous values - //Hence, we only need 4 offsets - // Offsets of innerTile0 + // Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 + // contiguous values Hence, we only need 4 offsets + // Offsets of innerTile0 auto kBase0 = (kOuterTile * InnerKTiles + innerKTile) * kKTileSize; ks[0] = kBase0 + (t % 4) * 2; ks[1] = ks[0] + 8; @@ -112,30 +116,35 @@ __global__ void _dequantize_int4_kernel( // inner k-tiles unpack two at a time int32_t pack = in[nTile][kOuterTile][t][innerKTile / 2]; - if constexpr(kDequant) { - // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be set when dequantizing"); - static_assert(std::is_same::value, "Out must be BFloat16 when dequantizing"); + if constexpr (kDequant) { + // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be + // set when dequantizing"); + static_assert(std::is_same::value, + "Out must be BFloat16 when dequantizing"); // __nv_bfloat16 v[8]; - // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then convert to bfloat16 + // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then + // convert to bfloat16 bf16x2x4 v_bf16x2x4 = convert_i4x8_to_bf16x2x4(pack); // All b values within a 16x16 tile should fall within the same q group // Hence we load 1 scale and zero per loop - int qgroup = ks[0] / groupSize; - const __nv_bfloat16 *pSZ = reinterpret_cast(&scales_and_zeros.value()[qgroup][n0][0]); + int qgroup = ks[0] / groupSize; + const __nv_bfloat16 *pSZ = reinterpret_cast( + &scales_and_zeros.value()[qgroup][n0][0]); // Vectorize scales and zeros __nv_bfloat162 scale2 = __bfloat162bfloat162(pSZ[0]); __nv_bfloat162 zero2 = __bfloat162bfloat162(pSZ[1]); - #pragma unroll +#pragma unroll for (int i = 0; i < 4; i++) { - reinterpret_cast<__nv_bfloat162*>(&pOut[ks[i]])[0] = __hfma2(v_bf16x2x4.vals[i], scale2, zero2); + reinterpret_cast<__nv_bfloat162 *>(&pOut[ks[i]])[0] = + __hfma2(v_bf16x2x4.vals[i], scale2, zero2); } - } - else { - static_assert(std::is_same::value, "Out must be int32_t when unpacking to int"); + } else { + static_assert(std::is_same::value, + "Out must be int32_t when unpacking to int"); int32_t v[8]; v[0] = pack & 0x0000000f; @@ -146,9 +155,9 @@ __global__ void _dequantize_int4_kernel( v[3] = (pack >> 20) & 0x0000000f; v[5] = (pack >> 24) & 0x0000000f; v[7] = (pack >> 28) & 0x0000000f; - int2* v_i32x2 = reinterpret_cast(v); + int2 *v_i32x2 = reinterpret_cast(v); - #pragma unroll +#pragma unroll for (int i = 0; i < 4; ++i) { reinterpret_cast(&pOut[ks[i]])[0] = v_i32x2[i]; } @@ -156,16 +165,9 @@ 
__global__ void _dequantize_int4_kernel( } } -// output is [n][k] (int32 dtype) -// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -// scales_and_zeros is [numQGroups][n][2] -// qGroupSize is 32, 64, 128 or 256 -at::Tensor _dequantize_tensor_core_tiled_layout( - const at::Tensor& packed_w, - const at::Tensor& scales_and_zeros, - int64_t group_size, - int64_t innerKTiles) -{ +at::Tensor _ATH_dequantize_tensor_core_tiled_layout( + const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, + int64_t group_size, int64_t innerKTiles) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -173,94 +175,147 @@ at::Tensor _dequantize_tensor_core_tiled_layout( c10::cuda::CUDAGuard g(packed_w.device()); // packed_w preconditions - TORCH_CHECK(packed_w.dim() == 4); - TORCH_CHECK(packed_w.dtype() == at::kInt); - TORCH_CHECK(packed_w.is_contiguous()); - TORCH_CHECK(packed_w.size(2) == 32); - TORCH_CHECK(packed_w.size(3) == innerKTiles / 2); + int64_t packed_w_dim; + aoti_torch_get_dim(packed_w, &packed_w_dim); + TORCH_CHECK(packed_w_dim == 4); + + int32_t packed_w_dtype; + aoti_torch_get_dtype(packed_w, &packed_w_dtype); + TORCH_CHECK(packed_w_dtype == at::kInt); + + // is_contiguous not existent today + // TORCH_CHECK(packed_w.is_contiguous()); + + int64_t packed_w_dim_2_size; + aoti_torch_get_size(packed_w, 2, &packed_w_dim_2_size); + TORCH_CHECK(packed_w_dim_2_size == 32); + + int64_t packed_w_dim_3_size; + aoti_torch_get_size(packed_w, 3, &packed_w_dim_3_size); + TORCH_CHECK(packed_w_dim_3_size == innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - auto numQGroups = scales_and_zeros.size(0); - int N = packed_w.size(0) * kNTileSize; - int K = packed_w.size(1) * innerKTiles * kKTileSize; + auto numQGroups; + aoti_torch_get_size(scales_and_zeros, 0, &numQGroups); + + int64_t packed_w_dim_0_size; + aoti_torch_get_size(packed_w, 0, &packed_w_dim_0_size); + int N = packed_w_dim_0_size * kNTileSize; + + int64_t packed_w_dim_1_size; + aoti_torch_get_size(packed_w, 1, &packed_w_dim_1_size); + int K = packed_w_dim_1_size * innerKTiles * kKTileSize; // scales_and_zeros preconditions - TORCH_CHECK( - group_size == 32 || group_size == 64 || group_size == 128 || - group_size == 256); + TORCH_CHECK(group_size == 32 || group_size == 64 || group_size == 128 || + group_size == 256); TORCH_CHECK(numQGroups == K / group_size); - TORCH_CHECK(scales_and_zeros.dim() == 3); - TORCH_CHECK(scales_and_zeros.size(1) == N); - TORCH_CHECK(scales_and_zeros.size(2) == 2); + + int64_t scales_and_zeros_dim; + aoti_torch_get_dim(scales_and_zeros, &scales_and_zeros_dim); + TORCH_CHECK(scales_and_zeros_dim == 3); + + int64_t scales_and_zeros_dim_1_size; + int64_t scales_and_zeros_dim_2_size; + aoti_torch_get_size(scales_and_zeros, 1, &scales_and_zeros_dim_1_size); + aoti_torch_get_size(scales_and_zeros, 2, &scales_and_zeros_dim_2_size); + TORCH_CHECK(scales_and_zeros_dim_1_size == N); + TORCH_CHECK(scales_and_zeros_dim_2_size == 2); auto nTiles = divUp(N, kNTileSize); auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); auto out = at::empty( {N, K}, at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); + // gotta swap this to be an AtenTensorHandle auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); -#define RUN_DEQUANT(QGROUPSIZE) \ - do { \ - switch(innerKTiles) { \ - case 2: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ - out.packed_accessor32(), \ - 
scales_and_zeros.packed_accessor32()); \ - break; \ - case 4: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ - out.packed_accessor32(), \ - scales_and_zeros.packed_accessor32()); \ - break; \ - case 8: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ + void *packed_w_data_ptr; + void *scales_and_zeros_data_ptr; + void *out_data_ptr; + aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_data_ptr(scales_and_zeros, &scales_and_zeros_data_ptr); + aoti_torch_get_data_ptr(out, &out_data_ptr); + +// packed_w.packed_accessor32(), \ out.packed_accessor32(), \ scales_and_zeros.packed_accessor32()); \ - break; \ - default: \ - break; \ - } \ - } while(false) - -#define DISPATCH_Q_GROUP() \ - do { \ - switch (group_size) { \ - case 32: \ - RUN_DEQUANT(32); \ - break; \ - case 64: \ - RUN_DEQUANT(64); \ - break; \ - case 128: \ - RUN_DEQUANT(128); \ - break; \ - case 256: \ - RUN_DEQUANT(256); \ - break; \ - default: \ - break; \ - } \ - } while(false) + +#define RUN_DEQUANT(QGROUPSIZE) \ + do { \ + switch (innerKTiles) { \ + case 2: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + case 4: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + case 8: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + default: \ + break; \ + } \ + } while (false) + +#define DISPATCH_Q_GROUP() \ + do { \ + switch (group_size) { \ + case 32: \ + RUN_DEQUANT(32); \ + break; \ + case 64: \ + RUN_DEQUANT(64); \ + break; \ + case 128: \ + RUN_DEQUANT(128); \ + break; \ + case 256: \ + RUN_DEQUANT(256); \ + break; \ + default: \ + break; \ + } \ + } while (false) DISPATCH_Q_GROUP(); - #undef DISPATCH_Q_GROUP - #undef RUN_DEQUANT +#undef DISPATCH_Q_GROUP +#undef RUN_DEQUANT return out; } // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -at::Tensor _unpack_tensor_core_tiled_layout( - const at::Tensor& packed_w, - int64_t innerKTiles) -{ +// scales_and_zeros is [numQGroups][n][2] +// qGroupSize is 32, 64, 128 or 256 +at::Tensor +_dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, + const at::Tensor &scales_and_zeros, + int64_t group_size, int64_t innerKTiles) { + + AtenTensorHandle packed_w_ath = tensor_pointer_to_tensor_handle(&packed_w); + AtenTensorHandle scales_and_zeros_ath = + tensor_pointer_to_tensor_handle(&scales_and_zeros); + + AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + return ath_res; // tensor_handle_to_tensor_pointer(ath_res); +} + +// output is [n][k] (int32 dtype) +// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] +at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, + int64_t innerKTiles) { c10::cuda::CUDAGuard g(packed_w.device()); @@ -283,30 +338,40 @@ at::Tensor _unpack_tensor_core_tiled_layout( auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); auto out = at::empty( - {N, K}, - at::TensorOptions().dtype(at::kInt).device(packed_w.device())); + {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); if (innerKTiles == 2) { - _dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); - } - else if (innerKTiles == 4) { - 
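// [Editorial note] InnerKTiles is a compile-time template parameter of
// _dequantize_int4_kernel, so each supported value (2, 4, 8) needs its own
// instantiation; that is why this function uses an explicit if/else ladder and
// the dequantize path above dispatches through the RUN_DEQUANT switch.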
_dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); + } else if (innerKTiles == 4) { + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); } else if (innerKTiles == 8) { - _dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); } return out; } +// // Use when all tensors arguments accept one (normal) batch dim. +// // This batching rule expands the batch dim on all Tensors, reshapes it into +// // dim 0, calls the op, and then reshapes the batch dim out of dim 0. +// // This is not the most efficient thing; if there are alternatives, plese try +// // to use them. Use this only as a last resort. +// #define EXISTING_BDIM_ALL_BOXED(op) \ +// m.impl(#op, +// torch::CppFunction::makeFromBoxedFunction()); + TORCH_LIBRARY_IMPL(torchao, CUDA, m) { m.impl("torchao::unpack_tensor_core_tiled_layout", &_unpack_tensor_core_tiled_layout); m.impl("torchao::dequantize_tensor_core_tiled_layout", &_dequantize_tensor_core_tiled_layout); diff --git a/torchao/csrc/ignore_this.txt b/torchao/csrc/ignore_this.txt new file mode 100644 index 0000000000..ef8544f6e9 --- /dev/null +++ b/torchao/csrc/ignore_this.txt @@ -0,0 +1,18 @@ +#include + +int main() { + PyObject *list = PyList_New(2); + + for (auto i = 0; i < 2; ++i) { + PyObject *anint = PyLong_FromLong(i + 1); + PyList_SET_ITEM(list, i, anint); + } + + auto a = PyList_GetItemRef(list, 0); + + return 0; +} + +int r = main(); + +// auto lol = torch::getTHPLayout(static_cast(1)); From 4d0cebf22378118896a9aa592fb9d9e0d6e8dd1f Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Jan 2025 14:08:27 -0800 Subject: [PATCH 02/12] Cleaned up PoC --- .../tensor_core_tiled_layout.cu | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index bfca09edcf..e00afdff85 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -3,8 +3,10 @@ // #include // #include #include +#include #include #include +#include #include #include #include @@ -165,14 +167,18 @@ __global__ void _dequantize_int4_kernel( } } -at::Tensor _ATH_dequantize_tensor_core_tiled_layout( +AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, int64_t group_size, int64_t innerKTiles) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; - c10::cuda::CUDAGuard g(packed_w.device()); + int32_t packed_w_device_index; + aoti_torch_get_device_index(packed_w, &packed_w_device_index); + + // c10::cuda::CUDAGuard g(packed_w.device()); + c10::cuda::CUDAGuard g(packed_w_device_index); // packed_w preconditions int64_t packed_w_dim; @@ -181,7 +187,7 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( int32_t packed_w_dtype; aoti_torch_get_dtype(packed_w, &packed_w_dtype); - TORCH_CHECK(packed_w_dtype == at::kInt); + TORCH_CHECK(packed_w_dtype == static_cast(at::kInt)); // is_contiguous not existent today // TORCH_CHECK(packed_w.is_contiguous()); @@ -195,7 +201,7 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( TORCH_CHECK(packed_w_dim_3_size == 
innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - auto numQGroups; + int64_t numQGroups; aoti_torch_get_size(scales_and_zeros, 0, &numQGroups); int64_t packed_w_dim_0_size; @@ -224,20 +230,58 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( auto nTiles = divUp(N, kNTileSize); auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); - auto out = at::empty( - {N, K}, - at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); + + auto bf16 = aoti_torch_dtype_bfloat16(); + + AtenTensorHandle out; + int64_t out_sizes[] = {N, K}; + int64_t out_strides[] = {K, 1}; + int32_t packed_w_device_type; + aoti_torch_get_device_type(packed_w, &packed_w_device_type); + aoti_torch_empty_strided(2, out_sizes, out_strides, bf16, + packed_w_device_type, packed_w_device_index, &out); + + // auto out = at::empty( + // {N, K}, + // at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); // gotta swap this to be an AtenTensorHandle auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); void *packed_w_data_ptr; - void *scales_and_zeros_data_ptr; - void *out_data_ptr; + int64_t *packed_w_sizes; + int64_t *packed_w_strides; aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_sizes(packed_w, &packed_w_sizes); + aoti_torch_get_strides(packed_w, &packed_w_strides); + at::GenericPackedTensorAccessor + packed_w_pta32( + static_cast::PtrType>( + packed_w_data_ptr), + packed_w_sizes, packed_w_strides); + + void *scales_and_zeros_data_ptr; + int64_t *scales_and_zeros_sizes; + int64_t *scales_and_zeros_strides; aoti_torch_get_data_ptr(scales_and_zeros, &scales_and_zeros_data_ptr); + aoti_torch_get_sizes(scales_and_zeros, &scales_and_zeros_sizes); + aoti_torch_get_strides(scales_and_zeros, &scales_and_zeros_strides); + at::GenericPackedTensorAccessor + scales_and_zeros_pta32( + static_cast::PtrType>( + scales_and_zeros_data_ptr), + scales_and_zeros_sizes, scales_and_zeros_strides); + + void *out_data_ptr; aoti_torch_get_data_ptr(out, &out_data_ptr); + at::GenericPackedTensorAccessor + out_pta32( + static_cast::PtrType>( + out_data_ptr), + out_sizes, out_strides); // packed_w.packed_accessor32(), \ out.packed_accessor32(), \ @@ -248,18 +292,18 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( switch (innerKTiles) { \ case 2: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ case 4: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ case 8: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ default: \ break; \ @@ -302,14 +346,15 @@ _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, const at::Tensor &scales_and_zeros, int64_t group_size, int64_t innerKTiles) { - AtenTensorHandle packed_w_ath = tensor_pointer_to_tensor_handle(&packed_w); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); AtenTensorHandle scales_and_zeros_ath = - tensor_pointer_to_tensor_handle(&scales_and_zeros); + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( packed_w_ath, scales_and_zeros_ath, group_size, 
innerKTiles); - return ath_res; // tensor_handle_to_tensor_pointer(ath_res); + return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); } // output is [n][k] (int32 dtype) From abdae1e3d5576816c9394c53297d5235f54cb617 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Jan 2025 14:12:56 -0800 Subject: [PATCH 03/12] Ignore ignore_this --- .../tensor_core_tiled_layout.cu | 6 +----- torchao/csrc/ignore_this.txt | 18 ------------------ 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 torchao/csrc/ignore_this.txt diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index e00afdff85..f271448368 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -408,11 +408,7 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -// // Use when all tensors arguments accept one (normal) batch dim. -// // This batching rule expands the batch dim on all Tensors, reshapes it into -// // dim 0, calls the op, and then reshapes the batch dim out of dim 0. -// // This is not the most efficient thing; if there are alternatives, plese try -// // to use them. Use this only as a last resort. +// The following example will be useful when we do registration // #define EXISTING_BDIM_ALL_BOXED(op) \ // m.impl(#op, // torch::CppFunction::makeFromBoxedFunction()); diff --git a/torchao/csrc/ignore_this.txt b/torchao/csrc/ignore_this.txt deleted file mode 100644 index ef8544f6e9..0000000000 --- a/torchao/csrc/ignore_this.txt +++ /dev/null @@ -1,18 +0,0 @@ -#include - -int main() { - PyObject *list = PyList_New(2); - - for (auto i = 0; i < 2; ++i) { - PyObject *anint = PyLong_FromLong(i + 1); - PyList_SET_ITEM(list, i, anint); - } - - auto a = PyList_GetItemRef(list, 0); - - return 0; -} - -int r = main(); - -// auto lol = torch::getTHPLayout(static_cast(1)); From df85d15a9f11a374b997d94a767f889f0753df75 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 17 Jan 2025 10:24:38 -0800 Subject: [PATCH 04/12] add mock registration prototype --- .../tensor_core_tiled_layout.cu | 270 ++++++++++++++---- 1 file changed, 217 insertions(+), 53 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index f271448368..98021759bf 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -4,6 +4,8 @@ // #include #include #include +#include +#include #include #include #include @@ -19,18 +21,18 @@ constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { } constexpr int32_t kWarpSize = 32; -// Simple data structure to represent 4 pairs of bfloat16s, used for vectorized -// dequantization -// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 -struct __align__(16) bf16x2x4 { __nv_bfloat162 vals[4]; }; +//Simple data structure to represent 4 pairs of bfloat16s, used for vectorized dequantization +//https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 +struct __align__(16) bf16x2x4 { + __nv_bfloat162 vals[4]; +}; -// Copied from -// 
https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 +//Copied from https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { bf16x2x4 result; constexpr int kElements = 8; - uint32_t *h = reinterpret_cast(&result); + uint32_t* h = reinterpret_cast(&result); uint32_t const source_i4s = source; // First, we extract the i4s and construct an intermediate fp16 number. @@ -48,10 +50,10 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { for (int ii = 1; ii < kElements / 2; ++ii) { i4s >>= 4; // or is it 8? // (i4s & 0x000f000f) | 0x43004300 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[ii]) - : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), - "n"(immLut)); + asm volatile( + "lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[ii]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); } // This is the BF16 {-136, -136} represented as an integer. @@ -77,9 +79,8 @@ template __global__ void _dequantize_int4_kernel( const at::PackedTensorAccessor32 in, at::PackedTensorAccessor32 out, - std::optional> - scales_and_zeros = std::nullopt) { + std::optional> scales_and_zeros = std::nullopt) +{ constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -93,8 +94,7 @@ __global__ void _dequantize_int4_kernel( // 8 k-tile values, 4 per m16n8k16 mma.sync operand B // int32_t ks[8]; - // Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 - // contiguous values) + //Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 contiguous values) int32_t ks[4]; // Store address base offset @@ -103,9 +103,9 @@ __global__ void _dequantize_int4_kernel( // Unpack 2 k-tiles at a time since min pack size is InnerKTiles = 2 #pragma unroll for (int innerKTile = 0; innerKTile < InnerKTiles; innerKTile += 2) { - // Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 - // contiguous values Hence, we only need 4 offsets - // Offsets of innerTile0 + //Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 contiguous values + //Hence, we only need 4 offsets + // Offsets of innerTile0 auto kBase0 = (kOuterTile * InnerKTiles + innerKTile) * kKTileSize; ks[0] = kBase0 + (t % 4) * 2; ks[1] = ks[0] + 8; @@ -118,35 +118,30 @@ __global__ void _dequantize_int4_kernel( // inner k-tiles unpack two at a time int32_t pack = in[nTile][kOuterTile][t][innerKTile / 2]; - if constexpr (kDequant) { - // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be - // set when dequantizing"); - static_assert(std::is_same::value, - "Out must be BFloat16 when dequantizing"); + if constexpr(kDequant) { + // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be set when dequantizing"); + static_assert(std::is_same::value, "Out must be BFloat16 when dequantizing"); // __nv_bfloat16 v[8]; - // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then - // convert to bfloat16 + // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then convert to bfloat16 bf16x2x4 v_bf16x2x4 = convert_i4x8_to_bf16x2x4(pack); // All b values within a 16x16 tile should fall within the same q group // Hence we load 1 scale and zero per loop - int qgroup = ks[0] / groupSize; - const __nv_bfloat16 *pSZ = reinterpret_cast( - &scales_and_zeros.value()[qgroup][n0][0]); + int qgroup = 
ks[0] / groupSize; + const __nv_bfloat16 *pSZ = reinterpret_cast(&scales_and_zeros.value()[qgroup][n0][0]); // Vectorize scales and zeros __nv_bfloat162 scale2 = __bfloat162bfloat162(pSZ[0]); __nv_bfloat162 zero2 = __bfloat162bfloat162(pSZ[1]); -#pragma unroll + #pragma unroll for (int i = 0; i < 4; i++) { - reinterpret_cast<__nv_bfloat162 *>(&pOut[ks[i]])[0] = - __hfma2(v_bf16x2x4.vals[i], scale2, zero2); + reinterpret_cast<__nv_bfloat162*>(&pOut[ks[i]])[0] = __hfma2(v_bf16x2x4.vals[i], scale2, zero2); } - } else { - static_assert(std::is_same::value, - "Out must be int32_t when unpacking to int"); + } + else { + static_assert(std::is_same::value, "Out must be int32_t when unpacking to int"); int32_t v[8]; v[0] = pack & 0x0000000f; @@ -157,9 +152,9 @@ __global__ void _dequantize_int4_kernel( v[3] = (pack >> 20) & 0x0000000f; v[5] = (pack >> 24) & 0x0000000f; v[7] = (pack >> 28) & 0x0000000f; - int2 *v_i32x2 = reinterpret_cast(v); + int2* v_i32x2 = reinterpret_cast(v); -#pragma unroll + #pragma unroll for (int i = 0; i < 4; ++i) { reinterpret_cast(&pOut[ks[i]])[0] = v_i32x2[i]; } @@ -341,22 +336,107 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] // scales_and_zeros is [numQGroups][n][2] // qGroupSize is 32, 64, 128 or 256 -at::Tensor -_dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, - const at::Tensor &scales_and_zeros, - int64_t group_size, int64_t innerKTiles) { - - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); +// at::Tensor +// _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, +// const at::Tensor &scales_and_zeros, +// int64_t group_size, int64_t innerKTiles) { + +// AtenTensorHandle packed_w_ath = +// torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); +// AtenTensorHandle scales_and_zeros_ath = +// torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + +// AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( +// packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + +// return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); +// } + +void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = *reinterpret_cast(stack[3]); + int64_t group_size = *reinterpret_cast(stack[2]); AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; } +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! 
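// [Editorial note, hedged] The translation sketched below is a two-level boxing
// scheme: the dispatcher hands this wrapper a torch::jit::Stack of IValues
// (not ABI-stable); the wrapper lowers every argument into one void* slot of a
// malloc'd "ministack" (int64_t scalars and opaque AtenTensorHandles), and the
// *_ATH_* entry point only ever sees
//   void fn(void** stack, int64_t num_args, int64_t num_outputs)
// writing its result(s) into stack[num_args .. num_args + num_outputs).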
+ int64_t innerKTiles = torch::jit::pop(stack).toInt(); + int64_t group_size = torch::jit::pop(stack).toInt(); + const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + AtenTensorHandle scales_and_zeros_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + + int64_t num_args = 4; + int64_t num_outputs = 1; + void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); + ministack[3] = reinterpret_cast(&innerKTiles); + ministack[2] = reinterpret_cast(&group_size); + ministack[1] = reinterpret_cast(scales_and_zeros_ath); + ministack[0] = reinterpret_cast(packed_w_ath); + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, + num_outputs); + + // now read the output from the end of the stack and wrap that back into + // IValue from void*? + + AtenTensorHandle out_ath = + reinterpret_cast(ministack[num_args]); + + free(ministack); + at::Tensor out = + *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + torch::jit::push(stack, c10::IValue(out)); + + // so above is our stack of IValues, but we cannot have these IValues because + // they are NOT ABI stable! So we need another version of "boxed" with void*s. + // and that is what is going to happen below + + // what the old function used to be: + // int64_t innerKTiles = torch::jit::pop(stack).toInt(); + // int64_t group_size = torch::jit::pop(stack).toInt(); + // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + + // AtenTensorHandle packed_w_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + // AtenTensorHandle scales_and_zeros_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + + // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + // at::Tensor out = + // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + // torch::jit::push(stack, c10::IValue(out)); +} + + // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, @@ -408,15 +488,99 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -// The following example will be useful when we do registration -// #define EXISTING_BDIM_ALL_BOXED(op) \ -// m.impl(#op, -// torch::CppFunction::makeFromBoxedFunction()); +void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = *reinterpret_cast(stack[3]); + int64_t group_size = *reinterpret_cast(stack[2]); + AtenTensorHandle scales_and_zeros_ath = + reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - m.impl("torchao::unpack_tensor_core_tiled_layout", &_unpack_tensor_core_tiled_layout); - 
m.impl("torchao::dequantize_tensor_core_tiled_layout", &_dequantize_tensor_core_tiled_layout); + AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; +} + +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + int64_t innerKTiles = torch::jit::pop(stack).toInt(); + int64_t group_size = torch::jit::pop(stack).toInt(); + const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + AtenTensorHandle scales_and_zeros_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + + int64_t num_args = 4; + int64_t num_outputs = 1; + void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); + ministack[3] = reinterpret_cast(&innerKTiles); + ministack[2] = reinterpret_cast(&group_size); + ministack[1] = reinterpret_cast(scales_and_zeros_ath); + ministack[0] = reinterpret_cast(packed_w_ath); + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, + num_outputs); + + // now read the output from the end of the stack and wrap that back into + // IValue from void*? + + AtenTensorHandle out_ath = + reinterpret_cast(ministack[num_args]); + + free(ministack); + at::Tensor out = + *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + torch::jit::push(stack, c10::IValue(out)); + + // so above is our stack of IValues, but we cannot have these IValues because + // they are NOT ABI stable! So we need another version of "boxed" with void*s. 
+ // and that is what is going to happen below + + // what the old function used to be: + // int64_t innerKTiles = torch::jit::pop(stack).toInt(); + // int64_t group_size = torch::jit::pop(stack).toInt(); + // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + + // AtenTensorHandle packed_w_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + // AtenTensorHandle scales_and_zeros_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + + // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + // at::Tensor out = + // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + // torch::jit::push(stack, c10::IValue(out)); +} + + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + m.impl("torchao::unpack_tensor_core_tiled_layout", + &_unpack_tensor_core_tiled_layout); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); } #endif From 4120a8b922bcb2674da20cf3627f0acd4a9c02b1 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 21 Jan 2025 12:59:56 -0800 Subject: [PATCH 05/12] there is a diff between IValue blah and IValue& blah --- .../tensor_core_tiled_layout.cu | 142 +++++------------- 1 file changed, 36 insertions(+), 106 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 98021759bf..994374910d 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -358,8 +358,10 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, // here, void* is my StableIValue // function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack - int64_t innerKTiles = *reinterpret_cast(stack[3]); - int64_t group_size = *reinterpret_cast(stack[2]); + int64_t innerKTiles = reinterpret_cast(stack[3]); + int64_t group_size = reinterpret_cast(stack[2]); + TORCH_WARN(innerKTiles); + TORCH_WARN(group_size); AtenTensorHandle scales_and_zeros_ath = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); @@ -380,37 +382,49 @@ void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, // function pt1 here should take in IValues, pass a malloc'd stack into the // second function // need a translation from IValues to ATH to void*s! 
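// [Editorial note, hedged] Two lifetime points in the rewrite below: (1) integer
// arguments are packed by value, reinterpret_cast<void*>(arg.toInt()) on the
// way in and reinterpret_cast<int64_t>(stack[idx]) on the way out, so no
// pointer to a caller stack local crosses the boundary; (2) arguments are
// peek()'d as const c10::IValue& rather than pop()'d, so their Tensors stay
// owned by the dispatcher stack until torch::jit::drop() runs after the call.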
- int64_t innerKTiles = torch::jit::pop(stack).toInt(); - int64_t group_size = torch::jit::pop(stack).toInt(); - const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - - int64_t num_args = 4; - int64_t num_outputs = 1; - void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); - ministack[3] = reinterpret_cast(&innerKTiles); - ministack[2] = reinterpret_cast(&group_size); - ministack[1] = reinterpret_cast(scales_and_zeros_ath); - ministack[0] = reinterpret_cast(packed_w_ath); + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + TORCH_CHECK(num_arguments==4); + TORCH_CHECK(num_returns==1); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + TORCH_WARN(idx); + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + TORCH_WARN("am tensor!") + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not handled!"); + } + } + TORCH_WARN("done with forloop no problems!") // second function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, - num_outputs); + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); // now read the output from the end of the stack and wrap that back into // IValue from void*? AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_args]); - + reinterpret_cast(ministack[num_arguments]); + free(ministack); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + + // now pop everything. 
if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); torch::jit::push(stack, c10::IValue(out)); // so above is our stack of IValues, but we cannot have these IValues because @@ -488,90 +502,6 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, - int64_t num_args, - int64_t num_outputs) { - // here, void* is my StableIValue - // function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - int64_t innerKTiles = *reinterpret_cast(stack[3]); - int64_t group_size = *reinterpret_cast(stack[2]); - AtenTensorHandle scales_and_zeros_ath = - reinterpret_cast(stack[1]); - AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); - - AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - - void *out = reinterpret_cast(ath_res); - stack[num_args] = out; -} - -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - int64_t innerKTiles = torch::jit::pop(stack).toInt(); - int64_t group_size = torch::jit::pop(stack).toInt(); - const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - - int64_t num_args = 4; - int64_t num_outputs = 1; - void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); - ministack[3] = reinterpret_cast(&innerKTiles); - ministack[2] = reinterpret_cast(&group_size); - ministack[1] = reinterpret_cast(scales_and_zeros_ath); - ministack[0] = reinterpret_cast(packed_w_ath); - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, - num_outputs); - - // now read the output from the end of the stack and wrap that back into - // IValue from void*? - - AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_args]); - - free(ministack); - at::Tensor out = - *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); - torch::jit::push(stack, c10::IValue(out)); - - // so above is our stack of IValues, but we cannot have these IValues because - // they are NOT ABI stable! So we need another version of "boxed" with void*s. 
- // and that is what is going to happen below - - // what the old function used to be: - // int64_t innerKTiles = torch::jit::pop(stack).toInt(); - // int64_t group_size = torch::jit::pop(stack).toInt(); - // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - - // AtenTensorHandle packed_w_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - // AtenTensorHandle scales_and_zeros_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - - // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - - // at::Tensor out = - // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); - // torch::jit::push(stack, c10::IValue(out)); -} - TORCH_LIBRARY_IMPL(torchao, CUDA, m) { m.impl("torchao::unpack_tensor_core_tiled_layout", From a1944ee20d53f829ccadcc2492d5e4172c39dc2f Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Wed, 29 Jan 2025 11:32:05 -0800 Subject: [PATCH 06/12] Clean up code, finish other end of void* boxed kernel --- .../tensor_core_tiled_layout.cu | 79 ++++--------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 994374910d..39acdaf4eb 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,7 +1,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -// #include -// #include +#include +#include #include #include #include @@ -332,25 +332,6 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( return out; } -// output is [n][k] (int32 dtype) -// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -// scales_and_zeros is [numQGroups][n][2] -// qGroupSize is 32, 64, 128 or 256 -// at::Tensor -// _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, -// const at::Tensor &scales_and_zeros, -// int64_t group_size, int64_t innerKTiles) { - -// AtenTensorHandle packed_w_ath = -// torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); -// AtenTensorHandle scales_and_zeros_ath = -// torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - -// AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( -// packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - -// return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); -// } void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, int64_t num_args, @@ -360,8 +341,6 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, // schema values for now, and run the function and modify the void* stack int64_t innerKTiles = reinterpret_cast(stack[3]); int64_t group_size = reinterpret_cast(stack[2]); - TORCH_WARN(innerKTiles); - TORCH_WARN(group_size); AtenTensorHandle scales_and_zeros_ath = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); @@ -386,68 +365,44 @@ void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); const auto num_arguments = schema.arguments().size(); - TORCH_CHECK(num_arguments==4); - TORCH_CHECK(num_returns==1); void **ministack = 
(void**)malloc((num_arguments + num_returns) * sizeof(void *)); for (auto idx = 0; idx < num_arguments; idx++) { - TORCH_WARN(idx); const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); if (arg.isInt()) { ministack[idx] = reinterpret_cast(arg.toInt()); } else if (arg.isTensor()) { - TORCH_WARN("am tensor!") const at::Tensor& tensor = arg.toTensor(); AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); ministack[idx] = reinterpret_cast(ath); } else { - TORCH_CHECK(false, "Other types of IValues not handled!"); + TORCH_CHECK(false, "Other types of IValues not yet handled!"); } } - TORCH_WARN("done with forloop no problems!") // second function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, num_returns); - // now read the output from the end of the stack and wrap that back into - // IValue from void*? - - AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_arguments]); - - free(ministack); - - at::Tensor out = - *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); - - // now pop everything. if we pop earlier, Tensors would go out of scope + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope // before calling the function torch::jit::drop(stack, num_arguments); - torch::jit::push(stack, c10::IValue(out)); - - // so above is our stack of IValues, but we cannot have these IValues because - // they are NOT ABI stable! So we need another version of "boxed" with void*s. - // and that is what is going to happen below - - // what the old function used to be: - // int64_t innerKTiles = torch::jit::pop(stack).toInt(); - // int64_t group_size = torch::jit::pop(stack).toInt(); - // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - // AtenTensorHandle packed_w_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - // AtenTensorHandle scales_and_zeros_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - - // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
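// [Editorial note, hedged] Return convention, mirroring the input side: the
// callee leaves one void* per declared return at ministack[num_arguments + i];
// each slot is reinterpreted as an AtenTensorHandle and rewrapped as an owning
// at::Tensor IValue, driven by the schema's return types (only Tensor returns
// are handled so far).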
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } - // at::Tensor out = - // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); - // torch::jit::push(stack, c10::IValue(out)); + free(ministack); } From 672aeec65ae24ba00900dc7e16c3d8b18cc04055 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 30 Jan 2025 14:12:01 -0800 Subject: [PATCH 07/12] [skip ci] saving work on registration --- .../tensor_core_tiled_layout/libtorch.cpp | 133 +++++++++++++ .../cuda/tensor_core_tiled_layout/libtorch.h | 90 +++++++++ .../tensor_core_tiled_layout.cu | 184 +++++++++--------- 3 files changed, 316 insertions(+), 91 deletions(-) create mode 100644 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp create mode 100644 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp new file mode 100644 index 0000000000..42dd6f947b --- /dev/null +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -0,0 +1,133 @@ +// in this file, we will implement the stuff in libtorch.h, +// and we are allowed to call unstable stuff from pytorch! + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
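// [Editorial note, hedged] Same schema-driven unwrapping as in the .cu wrapper.
// The two boxed_* functions in this file differ only in which
// void(void**, int64_t, int64_t) kernel they forward to, which is the piece
// the StableLibrary / STABLE_TORCH_LIBRARY_IMPL sketch in libtorch.h appears
// intended to factor out.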
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + + +void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + // m.impl("torchao::unpack_tensor_core_tiled_layout", + // &_unpack_tensor_core_tiled_layout); + m.impl("torchao::unpack_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_unpack_tensor_core_tiled_layout>()); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); +} diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h new file mode 100644 index 0000000000..9fd7636d04 --- /dev/null +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -0,0 +1,90 @@ +// this file can only have stable stuff! 
Akin to shim.h + +#include +#include // used for C10_UID, verified to be header-only +#include // used for DispatchKey, enum verified to be header-only +#include + +class StableLibrary final { + public: + // a pointer to a real Library + // a kind + enum Kind { + DEF, // from TORCH_LIBRARY (no qualifier) + IMPL, + FRAGMENT, + }; + + + // constructor + + +}; + + +// _def function ==> IGNORE LIBRARY + just call these + +// stable_impl function (that takes in a string and a void** function pointer) +// _impl doesn't really need a Library object, try to avoid it for now +// just copy its implementation +// it'll give u a handle that needs to be kept alive, just assign to global for now + + +class StableTorchLibraryInit final { + private: + using InitFn = void(StableLibrary&); + StableLibrary lib_; + + public: + StableTorchLibraryInit( + StableLibrary::Kind kind, + InitFn* fn, + const char* ns, + std::optional k, + const char* file, + uint32_t line) + : lib_(kind, ns, k, file, line) { + fn(lib_); + } +}; + + +#define STABLE_TORCH_LIBRARY_IMPL(ns, k, m) _STABLE_TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) + +#define _STABLE_TORCH_LIBRARY_IMPL(ns, k, m, uid) \ + static void C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(StableLibrary&); \ + static const StableTorchLibraryInit C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \ + StableLibrary::IMPL, \ + &C10_CONCATENATE(STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid), \ + #ns, \ + std::make_optional(c10::DispatchKey::k), \ + __FILE__, \ + __LINE__); \ + void C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(StableLibrary & m) + + + + + + +// #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) + +// #define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ +// static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ +// static const torch::detail::TorchLibraryInit \ +// TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ +// torch::Library::IMPL, \ +// (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ +// ? 
&TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ +// : [](torch::Library&) -> void {}), \ +// torchao, \ +// std::make_optional(c10::DispatchKey::CUDA), \ +// __FILE__, \ +// __LINE__); \ +// TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { + +// } + diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 39acdaf4eb..f4c252ab37 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,17 +1,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include -#include template constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { @@ -72,6 +62,7 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { return result; } + // in size [ceil(n / 8)][ceil(k / (InnerKTiles * 16))][32][InnerKTiles / 2] // scales_and_zeros size [numQGroups][n][2] // out size [n][k] @@ -162,6 +153,15 @@ __global__ void _dequantize_int4_kernel( } } + + +// The following function should eventually be libtorch agnostic +// Blockers from that goal today: +// - TORCH_CHECK +// - c10::cuda::CUDAGuard +// - at::cuda::getCurrentCUDAStream(); +// - at::GenericPackedTensorAccessor +// - at::RestrictPtrTraits AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, int64_t group_size, int64_t innerKTiles) { @@ -352,77 +352,45 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, stack[num_args] = out; } -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } - } - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); -} - // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, +AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle packed_w, int64_t innerKTiles) { - c10::cuda::CUDAGuard g(packed_w.device()); + int32_t packed_w_device_index; + aoti_torch_get_device_index(packed_w, &packed_w_device_index); + + // c10::cuda::CUDAGuard g(packed_w.device()); + c10::cuda::CUDAGuard g(packed_w_device_index); + + int64_t packed_w_dim; + aoti_torch_get_dim(packed_w, &packed_w_dim); + TORCH_CHECK(packed_w_dim == 4); + + int32_t packed_w_dtype; + aoti_torch_get_dtype(packed_w, &packed_w_dtype); + TORCH_CHECK(packed_w_dtype == static_cast(at::kInt)); - TORCH_CHECK(packed_w.dim() == 4); - TORCH_CHECK(packed_w.dtype() == at::kInt); - TORCH_CHECK(packed_w.is_contiguous()); + // is_contiguous not existent today + // TORCH_CHECK(packed_w.is_contiguous()); - TORCH_CHECK(packed_w.size(2) == 32); - TORCH_CHECK(packed_w.size(3) == innerKTiles / 2); + int64_t packed_w_dim_2_size; + aoti_torch_get_size(packed_w, 2, &packed_w_dim_2_size); + TORCH_CHECK(packed_w_dim_2_size == 32); + + int64_t packed_w_dim_3_size; + aoti_torch_get_size(packed_w, 3, &packed_w_dim_3_size); + TORCH_CHECK(packed_w_dim_3_size == innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - int N = packed_w.size(0) * 8; - int K = packed_w.size(1) * innerKTiles * 16; + int64_t packed_w_dim_0_size; + aoti_torch_get_size(packed_w, 0, &packed_w_dim_0_size); + int N = packed_w_dim_0_size * 8; + + int64_t packed_w_dim_1_size; + aoti_torch_get_size(packed_w, 1, &packed_w_dim_1_size); + int K = packed_w_dim_1_size * innerKTiles * 16; constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -431,41 +399,75 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); - auto out = at::empty( - {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); + AtenTensorHandle out; + int64_t out_sizes[] = {N, K}; + int64_t out_strides[] = {K, 1}; + auto kInt = aoti_torch_dtype_int32(); + int32_t packed_w_device_type; + aoti_torch_get_device_type(packed_w, &packed_w_device_type); + aoti_torch_empty_strided(2, out_sizes, out_strides, kInt, packed_w_device_type, packed_w_device_index, &out); + // auto out = at::empty( + // {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); + void *packed_w_data_ptr; + int64_t *packed_w_sizes; + int64_t *packed_w_strides; + aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_sizes(packed_w, &packed_w_sizes); + aoti_torch_get_strides(packed_w, &packed_w_strides); + at::GenericPackedTensorAccessor + packed_w_pta32( + static_cast::PtrType>( + packed_w_data_ptr), + packed_w_sizes, packed_w_strides); + + void *out_data_ptr; + aoti_torch_get_data_ptr(out, &out_data_ptr); + 
at::GenericPackedTensorAccessor + out_pta32( + static_cast::PtrType>( + out_data_ptr), + out_sizes, out_strides); + if (innerKTiles == 2) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); + // packed_w.packed_accessor32(), + // out.packed_accessor32() } else if (innerKTiles == 4) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); } else if (innerKTiles == 8) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); } return out; } +void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); + + AtenTensorHandle ath_res = _ATH_unpack_tensor_core_tiled_layout( + packed_w_ath, innerKTiles); + + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; +} -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - m.impl("torchao::unpack_tensor_core_tiled_layout", - &_unpack_tensor_core_tiled_layout); - // m.impl("torchao::dequantize_tensor_core_tiled_layout", - // &_dequantize_tensor_core_tiled_layout); - m.impl("torchao::dequantize_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_dequantize_tensor_core_tiled_layout>()); +STABLE_TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + m.impl("torchao::unpack_tensor_core_tiled_layout", &voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", &voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout); } #endif From 97a922048b9c61bda4440cccde6633f60a7bcf96 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 31 Jan 2025 15:00:02 -0800 Subject: [PATCH 08/12] [skip ci] This definitely does not compile --- .../tensor_core_tiled_layout/libtorch.cpp | 330 ++++++++++++------ .../cuda/tensor_core_tiled_layout/libtorch.h | 43 ++- 2 files changed, 252 insertions(+), 121 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 42dd6f947b..50fcf97db4 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -15,119 +15,231 @@ #include -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! 
- - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// // step 1: from here, call the ATH func +// // step 2: make ATH func also boxed and call it +// // step 3: move abstract code to libtorch +// void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, +// torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! + +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? +// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + + +// void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, +// torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! 
+ +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? +// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + +// void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! + +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? 
+// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + +// TORCH_LIBRARY_IMPL(torchao, CUDA, m) { +// // m.impl("torchao::unpack_tensor_core_tiled_layout", +// // &_unpack_tensor_core_tiled_layout); +// m.impl("torchao::unpack_tensor_core_tiled_layout", +// torch::CppFunction::makeFromBoxedFunction< +// boxed_unpack_tensor_core_tiled_layout>()); +// // m.impl("torchao::dequantize_tensor_core_tiled_layout", +// // &_dequantize_tensor_core_tiled_layout); +// m.impl("torchao::dequantize_tensor_core_tiled_layout", +// torch::CppFunction::makeFromBoxedFunction< +// boxed_dequantize_tensor_core_tiled_layout>()); +// } + +class StableLibrary::TorchLibraryOpaque { +public: + TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : library_(kind, ns, k, file, line) {} +private: + torch::Library library_; // Actual Library object +}; + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(&TorchLibraryOpaque(Library::Kind::IMPL, ns, k, file, line)) {} + +StableLibrary& StableLibrary::impl(std::string name, void (*fn)(void **, int64_t, int64_t)) { + auto boxed_function = [fn](const c10::OperatorHandle &op, torch::jit::Stack *stack) { + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } } - } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
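+    // Same return handling as in the (now commented-out) per-op wrappers
+    // above, done generically for whatever fn was registered; only Tensor
+    // returns are supported so far.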
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } } - } - - free(ministack); -} - -void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } + free(ministack); } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); -} - -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - // m.impl("torchao::unpack_tensor_core_tiled_layout", - // &_unpack_tensor_core_tiled_layout); - m.impl("torchao::unpack_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_unpack_tensor_core_tiled_layout>()); - // m.impl("torchao::dequantize_tensor_core_tiled_layout", - // &_dequantize_tensor_core_tiled_layout); - m.impl("torchao::dequantize_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_dequantize_tensor_core_tiled_layout>()); + this->lib_.impl(name, torch::CppFunction::makeFromBoxedFunction()); + return *this; } diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index 9fd7636d04..fb45f2b357 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -6,19 +6,38 @@ #include class StableLibrary final { + private: + class TorchLibraryOpaque; + using TorchLibraryHandle = TorchLibraryOpaque*; + TorchLibraryHandle lib_; public: - // a pointer to a real Library - // a kind - enum Kind { - DEF, // from TORCH_LIBRARY (no qualifier) - IMPL, - FRAGMENT, - }; - - - // constructor - - + // a pointer to a real Library + // a kind + enum Kind { + // DEF, // from TORCH_LIBRARY (no qualifier) + IMPL, + // FRAGMENT, + }; + + // constructor + /// \private + /// + /// Use TORCH_LIBRARY() or TORCH_LIBRARY_IMPL() instead of using these + /// constructors directly + StableLibrary( + Kind kind, + std::string ns, + std::optional k, + const char* file, + uint32_t line); + + StableLibrary(const StableLibrary&) = delete; + StableLibrary& operator=(const StableLibrary&) = delete; + StableLibrary(StableLibrary&&) = default; + StableLibrary& operator=(StableLibrary&&) = default; + ~StableLibrary() = default; + + StableLibrary& impl(std::string name, void* fn); }; From 31c2925ecb059bd268db82f02a993465be687c86 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 4 Feb 2025 10:20:33 -0800 Subject: [PATCH 09/12] Now the code compiles --- .../tensor_core_tiled_layout/libtorch.cpp | 458 ++++++++++-------- .../cuda/tensor_core_tiled_layout/libtorch.h | 40 +- .../tensor_core_tiled_layout.cu | 16 +- 3 files changed, 281 insertions(+), 233 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 50fcf97db4..8ea824bbad 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -1,11 +1,12 @@ // in this file, we will implement the stuff in libtorch.h, // and we are allowed to call unstable stuff from pytorch! 
-#include +#include "libtorch.h" #include #include #include +#include #include #include #include @@ -14,232 +15,263 @@ #include #include +#include +#include + + +/** +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + + +void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. 
if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + // m.impl("torchao::unpack_tensor_core_tiled_layout", + // &_unpack_tensor_core_tiled_layout); + m.impl("torchao::unpack_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_unpack_tensor_core_tiled_layout>()); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); +} + +*/ -// // step 1: from here, call the ATH func -// // step 2: make ATH func also boxed and call it -// // step 3: move abstract code to libtorch -// void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, -// torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! - -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? -// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - - -// void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, -// torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! 
- -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? -// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - -// void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! - -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? 
-// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - -// TORCH_LIBRARY_IMPL(torchao, CUDA, m) { -// // m.impl("torchao::unpack_tensor_core_tiled_layout", -// // &_unpack_tensor_core_tiled_layout); -// m.impl("torchao::unpack_tensor_core_tiled_layout", -// torch::CppFunction::makeFromBoxedFunction< -// boxed_unpack_tensor_core_tiled_layout>()); -// // m.impl("torchao::dequantize_tensor_core_tiled_layout", -// // &_dequantize_tensor_core_tiled_layout); -// m.impl("torchao::dequantize_tensor_core_tiled_layout", -// torch::CppFunction::makeFromBoxedFunction< -// boxed_dequantize_tensor_core_tiled_layout>()); -// } class StableLibrary::TorchLibraryOpaque { public: + // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : library_(kind, ns, k, file, line) {} + : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} + + TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque(TorchLibraryOpaque&&) = default; + TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; + ~TorchLibraryOpaque() = default; + + void impl(const char* name, torch::CppFunction fn) { + library_.impl(name, std::move(fn)); + } private: torch::Library library_; // Actual Library object }; -StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : lib_(&TorchLibraryOpaque(Library::Kind::IMPL, ns, k, file, line)) {} - -StableLibrary& StableLibrary::impl(std::string name, void (*fn)(void **, int64_t, int64_t)) { - auto boxed_function = [fn](const c10::OperatorHandle &op, torch::jit::Stack *stack) { - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); + +class VoidStarConverter: public c10::OperatorKernel { + public: + VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} + + void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! 
+ + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); + + for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } } - } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - fn(ministack, num_arguments, num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? - for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn_(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
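+    // Returns were written by the callee into ministack[num_arguments + idx];
+    // dereferencing the handle copies the underlying at::Tensor into an
+    // IValue, but nothing in this loop releases the handle itself.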
+ for (size_t idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } } + + free(ministack); } - free(ministack); - } + private: + void (*fn_)(void **, int64_t, int64_t); +}; + + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} + - this->lib_.impl(name, torch::CppFunction::makeFromBoxedFunction()); +StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { + this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); return *this; } diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index fb45f2b357..9679921590 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -5,11 +5,14 @@ #include // used for DispatchKey, enum verified to be header-only #include +#include +#include + class StableLibrary final { private: class TorchLibraryOpaque; using TorchLibraryHandle = TorchLibraryOpaque*; - TorchLibraryHandle lib_; + TorchLibraryHandle lib_; // pimpl unique_ptr public: // a pointer to a real Library // a kind @@ -37,7 +40,7 @@ class StableLibrary final { StableLibrary& operator=(StableLibrary&&) = default; ~StableLibrary() = default; - StableLibrary& impl(std::string name, void* fn); + StableLibrary& impl(const char* name, void (*fn)(void **, int64_t, int64_t)); }; @@ -88,22 +91,23 @@ class StableTorchLibraryInit final { +/** +#define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) -// #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) - -// #define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ -// static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ -// static const torch::detail::TorchLibraryInit \ -// TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ -// torch::Library::IMPL, \ -// (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ -// ? &TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ -// : [](torch::Library&) -> void {}), \ -// torchao, \ -// std::make_optional(c10::DispatchKey::CUDA), \ -// __FILE__, \ -// __LINE__); \ -// TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { +#define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ + static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ + static const torch::detail::TorchLibraryInit \ + TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ + torch::Library::IMPL, \ + (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ + ? 
&TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ + : [](torch::Library&) -> void {}), \ + torchao, \ + std::make_optional(c10::DispatchKey::CUDA), \ + __FILE__, \ + __LINE__); \ + TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { -// } + } +*/ diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index f4c252ab37..3b3d96d912 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,7 +1,12 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include +#include "libtorch.h" + + // need to confirm or make the following includes header-only +#include +#include +#include +#include template constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { @@ -455,6 +460,13 @@ void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack, // here, void* is my StableIValue // function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack + + + // check that num_args > 2 + // strict aliasing rule --> you can reinterpret_cast between pointers except for this rule + // which doesn't even apply in C. + // reinterpret_cast is not okay to interpreting floats+ints (c++20 lets you bitcast these) so you'll have to use memcpy + // c10 has a shim for bitcast (it is just a memcpy)!! so we should just use it int64_t innerKTiles = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); From a0500e573e4d22bf5c83745a5dd182f33758b805 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 4 Feb 2025 10:36:57 -0800 Subject: [PATCH 10/12] Move commented out notes to the bottom to not distract --- .../tensor_core_tiled_layout/libtorch.cpp | 175 +++++++++--------- .../cuda/tensor_core_tiled_layout/libtorch.h | 13 +- .../tensor_core_tiled_layout.cu | 2 +- 3 files changed, 88 insertions(+), 102 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 8ea824bbad..fd1557f2a7 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -15,10 +15,93 @@ #include #include -#include -#include +class StableLibrary::TorchLibraryOpaque { +public: + // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind + TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} + + TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque(TorchLibraryOpaque&&) = default; + TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; + ~TorchLibraryOpaque() = default; + void impl(const char* name, torch::CppFunction fn) { + library_.impl(name, std::move(fn)); + } +private: + torch::Library library_; // Actual Library object +}; + + +class VoidStarConverter: public c10::OperatorKernel { + public: + VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} + + void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const 
auto num_arguments = schema.arguments().size(); + // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); + + for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn_(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (size_t idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); + } + + private: + void (*fn_)(void **, int64_t, int64_t); +}; + + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} + + +StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { + this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); + return *this; +} + + + + +// notes from trying to understand stuff + iteration /** // step 1: from here, call the ATH func // step 2: make ATH func also boxed and call it @@ -187,91 +270,3 @@ TORCH_LIBRARY_IMPL(torchao, CUDA, m) { } */ - - -class StableLibrary::TorchLibraryOpaque { -public: - // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind - TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} - - TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; - TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; - TorchLibraryOpaque(TorchLibraryOpaque&&) = default; - TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; - ~TorchLibraryOpaque() = default; - - void impl(const char* name, torch::CppFunction fn) { - library_.impl(name, std::move(fn)); - } -private: - torch::Library library_; // Actual Library object -}; - - -class VoidStarConverter: public c10::OperatorKernel { - public: - VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} - - void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* 
stack) { - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); - - for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } - } - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - fn_(ministack, num_arguments, num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? - for (size_t idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); - } - - private: - void (*fn_)(void **, int64_t, int64_t); -}; - - -StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} - - -StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { - this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); - return *this; -} diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index 9679921590..febffe10bc 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -25,7 +25,7 @@ class StableLibrary final { // constructor /// \private /// - /// Use TORCH_LIBRARY() or TORCH_LIBRARY_IMPL() instead of using these + /// Use STABLE_TORCH_LIBRARY or STABLE_TORCH_LIBRARY_IMPL() instead of using these /// constructors directly StableLibrary( Kind kind, @@ -43,15 +43,6 @@ class StableLibrary final { StableLibrary& impl(const char* name, void (*fn)(void **, int64_t, int64_t)); }; - -// _def function ==> IGNORE LIBRARY + just call these - -// stable_impl function (that takes in a string and a void** function pointer) -// _impl doesn't really need a Library object, try to avoid it for now -// just copy its implementation -// it'll give u a handle that needs to be kept 
alive, just assign to global for now
-
-
 class StableTorchLibraryInit final {
  private:
   using InitFn = void(StableLibrary&);
@@ -90,7 +81,7 @@ class StableTorchLibraryInit final {
 
 
 
-
+// notes while figuring out templating
 /**
 #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID)
 
diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
index 3b3d96d912..a0eca94751 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
@@ -2,7 +2,7 @@
 
 #include "libtorch.h"
 
- // need to confirm or make the following includes header-only
+// need to confirm or make the following includes header-only
 #include 
 #include 
 #include 

From 5e2c2d052cc9a866b68bb164754526dd4aafacd4 Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Tue, 4 Feb 2025 10:43:48 -0800
Subject: [PATCH 11/12] Remove dependency on change in core

---
 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
index fd1557f2a7..1ac4966da6 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
@@ -52,7 +52,7 @@ class VoidStarConverter: public c10::OperatorKernel {
       if (arg.isInt()) {
         ministack[idx] = reinterpret_cast<void*>(arg.toInt());
       } else if (arg.isTensor()) {
-        const at::Tensor& tensor = arg.toTensor();
+        at::Tensor& tensor = const_cast<at::Tensor&>(arg.toTensor());
         AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor);
         ministack[idx] = reinterpret_cast<void*>(ath);
       } else {

From cc4d0223c1ee2ff287c9ece99b6517e85ac5a2d6 Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Tue, 18 Feb 2025 11:27:40 -0800
Subject: [PATCH 12/12] Fix memory leak by using RAIIATH

---
 .../tensor_core_tiled_layout/libtorch.cpp          | 10 +++---
 .../tensor_core_tiled_layout.cu                    | 35 +++++++++++-------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
index 1ac4966da6..352ff1be22 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -35,6 +36,8 @@ class StableLibrary::TorchLibraryOpaque {
 };
 
 
+using RAIIATH = torch::aot_inductor::RAIIAtenTensorHandle;
+
 class VoidStarConverter: public c10::OperatorKernel {
  public:
   VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {}
 
@@ -52,8 +55,7 @@ class VoidStarConverter: public c10::OperatorKernel {
       if (arg.isInt()) {
         ministack[idx] = reinterpret_cast<void*>(arg.toInt());
       } else if (arg.isTensor()) {
-        at::Tensor& tensor = const_cast<at::Tensor&>(arg.toTensor());
-        AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor);
+        AtenTensorHandle ath = torch::aot_inductor::new_tensor_handle(std::move(const_cast<at::Tensor&>(arg.toTensor())));
         ministack[idx] = reinterpret_cast<void*>(ath);
       } else {
         TORCH_CHECK(false, "Other types of IValues not yet handled!");
@@ -73,8 +75,8 @@ class VoidStarConverter: public c10::OperatorKernel {
     for (size_t idx = 0; idx < num_returns; idx++) {
       const c10::TypePtr& ret_type = schema.returns()[idx].type();
       if (*ret_type == *c10::getTypePtr<at::Tensor>()) {
-        AtenTensorHandle ret_ath =
reinterpret_cast<AtenTensorHandle>(ministack[num_arguments + idx]);
-        at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath);
+        auto ret_raiiath = RAIIATH(reinterpret_cast<AtenTensorHandle>(ministack[num_arguments + idx]));
+        at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_raiiath.get());
         torch::jit::push(stack, c10::IValue(out));
       } else {
         TORCH_CHECK(false, "Only Tensor return types are currently supported!");
diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
index a0eca94751..b41b0669f7 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
@@ -1,5 +1,6 @@
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere
 
+#include 
 #include "libtorch.h"
 
 // need to confirm or make the following includes header-only
@@ -8,6 +9,8 @@
 #include 
 #include 
 
+using RAIIATH = torch::aot_inductor::RAIIAtenTensorHandle;
+
 template <typename U, typename V>
 constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
   static_assert(std::is_integral<U>::value && std::is_integral<V>::value, "");
@@ -167,10 +170,13 @@ __global__ void _dequantize_int4_kernel(
 // - at::cuda::getCurrentCUDAStream();
 // - at::GenericPackedTensorAccessor
 // - at::RestrictPtrTraits
-AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout(
-    const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros,
+RAIIATH _ATH_dequantize_tensor_core_tiled_layout(
+    const RAIIATH packed_w_raiiath, const RAIIATH scales_and_zeros_raiiath,
     int64_t group_size, int64_t innerKTiles) {
 
+  auto packed_w = packed_w_raiiath.get();
+  auto scales_and_zeros = scales_and_zeros_raiiath.get();
+
   constexpr int32_t kNTileSize = 8;
   constexpr int32_t kKTileSize = 16;
 
@@ -334,7 +340,7 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout(
 #undef DISPATCH_Q_GROUP
 #undef RUN_DEQUANT
 
-  return out;
+  return RAIIATH(out);
 }
 
 
@@ -346,23 +352,24 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack,
   // schema values for now, and run the function and modify the void* stack
   int64_t innerKTiles = reinterpret_cast<int64_t>(stack[3]);
   int64_t group_size = reinterpret_cast<int64_t>(stack[2]);
-  AtenTensorHandle scales_and_zeros_ath =
-      reinterpret_cast<AtenTensorHandle>(stack[1]);
-  AtenTensorHandle packed_w_ath = reinterpret_cast<AtenTensorHandle>(stack[0]);
+  RAIIATH scales_and_zeros_ath(reinterpret_cast<AtenTensorHandle>(stack[1]));
+  RAIIATH packed_w_ath(reinterpret_cast<AtenTensorHandle>(stack[0]));
 
-  AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout(
-      packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles);
+  RAIIATH raiiath_res = _ATH_dequantize_tensor_core_tiled_layout(
+      std::move(packed_w_ath), std::move(scales_and_zeros_ath), group_size, innerKTiles);
 
-  void *out = reinterpret_cast<void*>(ath_res);
+  void *out = reinterpret_cast<void*>(raiiath_res.release());
   stack[num_args] = out;
 }
 
 // output is [n][k] (int32 dtype)
 // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2]
-AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle packed_w,
+RAIIATH _ATH_unpack_tensor_core_tiled_layout(const RAIIATH packed_w_raiiath,
                                              int64_t innerKTiles) {
 
+  auto packed_w = packed_w_raiiath.get();
+
   int32_t packed_w_device_index;
   aoti_torch_get_device_index(packed_w, &packed_w_device_index);
 
@@ -451,7 +458,7 @@ AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle pac
       <<>>(packed_w_pta32, out_pta32);
   }
 
-  return out;
+  return RAIIATH(out);
 }
 
 void
voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack,
@@ -470,10 +477,10 @@ void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack,
   int64_t innerKTiles = reinterpret_cast<int64_t>(stack[1]);
   AtenTensorHandle packed_w_ath = reinterpret_cast<AtenTensorHandle>(stack[0]);
 
-  AtenTensorHandle ath_res = _ATH_unpack_tensor_core_tiled_layout(
-      packed_w_ath, innerKTiles);
+  RAIIATH raiiath_res = _ATH_unpack_tensor_core_tiled_layout(
+      RAIIATH(packed_w_ath), innerKTiles);
 
-  void *out = reinterpret_cast<void*>(ath_res);
+  void *out = reinterpret_cast<void*>(raiiath_res.release());
   stack[num_args] = out;
 }
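
For reference, the boxed kernels above follow a simple void** convention: stack[0..num_args-1] hold the inputs (ints passed by value through reinterpret_cast, tensors as AtenTensorHandle), and each output is written to stack[num_args + i]. The sketch below shows how these kernels could be wired up through StableLibrary::impl. It is illustrative only, not part of the patches: it assumes the STABLE_TORCH_LIBRARY_IMPL macro declared in libtorch.h expands to a StableTorchLibraryInit that passes a StableLibrary& named m, and that the corresponding torchao operator schemas are def'ed elsewhere.

// Hypothetical registration sketch (assumes STABLE_TORCH_LIBRARY_IMPL expands
// to a StableTorchLibraryInit providing `StableLibrary& m`, and that the
// operator schemas already exist in the torchao namespace).
STABLE_TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
  // Each registered function uses the void** calling convention described
  // above: inputs at stack[0..num_args-1], outputs appended at stack[num_args + i].
  m.impl("unpack_tensor_core_tiled_layout",
         &voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout);
  m.impl("dequantize_tensor_core_tiled_layout",
         &voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout);
}

Because VoidStarConverter is registered via makeFromBoxedFunctor, the dispatcher hands it the IValue stack, it marshals the arguments into the void** ministack, and the shim-level kernel never sees at::Tensor directly, which is the point of routing everything through shim.h handles.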