From 38b50b415d5d7de9c31f777163f9e5b4246c7f73 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 10 Jan 2025 15:27:45 -0800 Subject: [PATCH 01/12] [super ugly not working code] use shim.h instead of Tensor --- .../tensor_core_tiled_layout.cu | 303 +++++++++++------- torchao/csrc/ignore_this.txt | 18 ++ 2 files changed, 202 insertions(+), 119 deletions(-) create mode 100644 torchao/csrc/ignore_this.txt diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index ea0f24c202..bfca09edcf 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,10 +1,12 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include -#include +// #include +// #include #include +#include #include +#include +#include #include template @@ -15,18 +17,18 @@ constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { } constexpr int32_t kWarpSize = 32; -//Simple data structure to represent 4 pairs of bfloat16s, used for vectorized dequantization -//https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 -struct __align__(16) bf16x2x4 { - __nv_bfloat162 vals[4]; -}; +// Simple data structure to represent 4 pairs of bfloat16s, used for vectorized +// dequantization +// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 +struct __align__(16) bf16x2x4 { __nv_bfloat162 vals[4]; }; -//Copied from https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 +// Copied from +// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { bf16x2x4 result; constexpr int kElements = 8; - uint32_t* h = reinterpret_cast(&result); + uint32_t *h = reinterpret_cast(&result); uint32_t const source_i4s = source; // First, we extract the i4s and construct an intermediate fp16 number. @@ -44,10 +46,10 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { for (int ii = 1; ii < kElements / 2; ++ii) { i4s >>= 4; // or is it 8? // (i4s & 0x000f000f) | 0x43004300 - asm volatile( - "lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[ii]) - : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[ii]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), + "n"(immLut)); } // This is the BF16 {-136, -136} represented as an integer. 
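// [Editorial note, not part of the patch; hedged sketch] What the constants in
// this hunk do: OR-ing a 4-bit nibble v into the low mantissa bits of the bf16
// pattern 0x4300 (the value 128.0f) produces the bf16 value 128 + v, since the
// mantissa LSB has weight 1 at that exponent. The {-136, -136} constant below
// is then (in the upstream int4mm.cu code) applied with an fma against 1.0 so
// that the result becomes v - 8, mapping unsigned [0, 15] onto signed [-8, 7].
// Scalar reference of the same arithmetic, for illustration only:
//   float dequant_nibble(uint32_t v /* 0..15 */) {
//     return (128.0f + static_cast<float>(v)) - 136.0f;  // == v - 8
//   }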
@@ -73,8 +75,9 @@ template __global__ void _dequantize_int4_kernel( const at::PackedTensorAccessor32 in, at::PackedTensorAccessor32 out, - std::optional> scales_and_zeros = std::nullopt) -{ + std::optional> + scales_and_zeros = std::nullopt) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -88,7 +91,8 @@ __global__ void _dequantize_int4_kernel( // 8 k-tile values, 4 per m16n8k16 mma.sync operand B // int32_t ks[8]; - //Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 contiguous values) + // Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 + // contiguous values) int32_t ks[4]; // Store address base offset @@ -97,9 +101,9 @@ __global__ void _dequantize_int4_kernel( // Unpack 2 k-tiles at a time since min pack size is InnerKTiles = 2 #pragma unroll for (int innerKTile = 0; innerKTile < InnerKTiles; innerKTile += 2) { - //Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 contiguous values - //Hence, we only need 4 offsets - // Offsets of innerTile0 + // Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 + // contiguous values Hence, we only need 4 offsets + // Offsets of innerTile0 auto kBase0 = (kOuterTile * InnerKTiles + innerKTile) * kKTileSize; ks[0] = kBase0 + (t % 4) * 2; ks[1] = ks[0] + 8; @@ -112,30 +116,35 @@ __global__ void _dequantize_int4_kernel( // inner k-tiles unpack two at a time int32_t pack = in[nTile][kOuterTile][t][innerKTile / 2]; - if constexpr(kDequant) { - // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be set when dequantizing"); - static_assert(std::is_same::value, "Out must be BFloat16 when dequantizing"); + if constexpr (kDequant) { + // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be + // set when dequantizing"); + static_assert(std::is_same::value, + "Out must be BFloat16 when dequantizing"); // __nv_bfloat16 v[8]; - // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then convert to bfloat16 + // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then + // convert to bfloat16 bf16x2x4 v_bf16x2x4 = convert_i4x8_to_bf16x2x4(pack); // All b values within a 16x16 tile should fall within the same q group // Hence we load 1 scale and zero per loop - int qgroup = ks[0] / groupSize; - const __nv_bfloat16 *pSZ = reinterpret_cast(&scales_and_zeros.value()[qgroup][n0][0]); + int qgroup = ks[0] / groupSize; + const __nv_bfloat16 *pSZ = reinterpret_cast( + &scales_and_zeros.value()[qgroup][n0][0]); // Vectorize scales and zeros __nv_bfloat162 scale2 = __bfloat162bfloat162(pSZ[0]); __nv_bfloat162 zero2 = __bfloat162bfloat162(pSZ[1]); - #pragma unroll +#pragma unroll for (int i = 0; i < 4; i++) { - reinterpret_cast<__nv_bfloat162*>(&pOut[ks[i]])[0] = __hfma2(v_bf16x2x4.vals[i], scale2, zero2); + reinterpret_cast<__nv_bfloat162 *>(&pOut[ks[i]])[0] = + __hfma2(v_bf16x2x4.vals[i], scale2, zero2); } - } - else { - static_assert(std::is_same::value, "Out must be int32_t when unpacking to int"); + } else { + static_assert(std::is_same::value, + "Out must be int32_t when unpacking to int"); int32_t v[8]; v[0] = pack & 0x0000000f; @@ -146,9 +155,9 @@ __global__ void _dequantize_int4_kernel( v[3] = (pack >> 20) & 0x0000000f; v[5] = (pack >> 24) & 0x0000000f; v[7] = (pack >> 28) & 0x0000000f; - int2* v_i32x2 = reinterpret_cast(v); + int2 *v_i32x2 = reinterpret_cast(v); - #pragma unroll +#pragma unroll for (int i = 0; i < 4; ++i) { reinterpret_cast(&pOut[ks[i]])[0] = v_i32x2[i]; } @@ -156,16 +165,9 @@ 
__global__ void _dequantize_int4_kernel( } } -// output is [n][k] (int32 dtype) -// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -// scales_and_zeros is [numQGroups][n][2] -// qGroupSize is 32, 64, 128 or 256 -at::Tensor _dequantize_tensor_core_tiled_layout( - const at::Tensor& packed_w, - const at::Tensor& scales_and_zeros, - int64_t group_size, - int64_t innerKTiles) -{ +at::Tensor _ATH_dequantize_tensor_core_tiled_layout( + const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, + int64_t group_size, int64_t innerKTiles) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -173,94 +175,147 @@ at::Tensor _dequantize_tensor_core_tiled_layout( c10::cuda::CUDAGuard g(packed_w.device()); // packed_w preconditions - TORCH_CHECK(packed_w.dim() == 4); - TORCH_CHECK(packed_w.dtype() == at::kInt); - TORCH_CHECK(packed_w.is_contiguous()); - TORCH_CHECK(packed_w.size(2) == 32); - TORCH_CHECK(packed_w.size(3) == innerKTiles / 2); + int64_t packed_w_dim; + aoti_torch_get_dim(packed_w, &packed_w_dim); + TORCH_CHECK(packed_w_dim == 4); + + int32_t packed_w_dtype; + aoti_torch_get_dtype(packed_w, &packed_w_dtype); + TORCH_CHECK(packed_w_dtype == at::kInt); + + // is_contiguous not existent today + // TORCH_CHECK(packed_w.is_contiguous()); + + int64_t packed_w_dim_2_size; + aoti_torch_get_size(packed_w, 2, &packed_w_dim_2_size); + TORCH_CHECK(packed_w_dim_2_size == 32); + + int64_t packed_w_dim_3_size; + aoti_torch_get_size(packed_w, 3, &packed_w_dim_3_size); + TORCH_CHECK(packed_w_dim_3_size == innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - auto numQGroups = scales_and_zeros.size(0); - int N = packed_w.size(0) * kNTileSize; - int K = packed_w.size(1) * innerKTiles * kKTileSize; + auto numQGroups; + aoti_torch_get_size(scales_and_zeros, 0, &numQGroups); + + int64_t packed_w_dim_0_size; + aoti_torch_get_size(packed_w, 0, &packed_w_dim_0_size); + int N = packed_w_dim_0_size * kNTileSize; + + int64_t packed_w_dim_1_size; + aoti_torch_get_size(packed_w, 1, &packed_w_dim_1_size); + int K = packed_w_dim_1_size * innerKTiles * kKTileSize; // scales_and_zeros preconditions - TORCH_CHECK( - group_size == 32 || group_size == 64 || group_size == 128 || - group_size == 256); + TORCH_CHECK(group_size == 32 || group_size == 64 || group_size == 128 || + group_size == 256); TORCH_CHECK(numQGroups == K / group_size); - TORCH_CHECK(scales_and_zeros.dim() == 3); - TORCH_CHECK(scales_and_zeros.size(1) == N); - TORCH_CHECK(scales_and_zeros.size(2) == 2); + + int64_t scales_and_zeros_dim; + aoti_torch_get_dim(scales_and_zeros, &scales_and_zeros_dim); + TORCH_CHECK(scales_and_zeros_dim == 3); + + int64_t scales_and_zeros_dim_1_size; + int64_t scales_and_zeros_dim_2_size; + aoti_torch_get_size(scales_and_zeros, 1, &scales_and_zeros_dim_1_size); + aoti_torch_get_size(scales_and_zeros, 2, &scales_and_zeros_dim_2_size); + TORCH_CHECK(scales_and_zeros_dim_1_size == N); + TORCH_CHECK(scales_and_zeros_dim_2_size == 2); auto nTiles = divUp(N, kNTileSize); auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); auto out = at::empty( {N, K}, at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); + // gotta swap this to be an AtenTensorHandle auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); -#define RUN_DEQUANT(QGROUPSIZE) \ - do { \ - switch(innerKTiles) { \ - case 2: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ - out.packed_accessor32(), \ - 
scales_and_zeros.packed_accessor32()); \ - break; \ - case 4: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ - out.packed_accessor32(), \ - scales_and_zeros.packed_accessor32()); \ - break; \ - case 8: \ - _dequantize_int4_kernel<<>>( \ - packed_w.packed_accessor32(), \ + void *packed_w_data_ptr; + void *scales_and_zeros_data_ptr; + void *out_data_ptr; + aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_data_ptr(scales_and_zeros, &scales_and_zeros_data_ptr); + aoti_torch_get_data_ptr(out, &out_data_ptr); + +// packed_w.packed_accessor32(), \ out.packed_accessor32(), \ scales_and_zeros.packed_accessor32()); \ - break; \ - default: \ - break; \ - } \ - } while(false) - -#define DISPATCH_Q_GROUP() \ - do { \ - switch (group_size) { \ - case 32: \ - RUN_DEQUANT(32); \ - break; \ - case 64: \ - RUN_DEQUANT(64); \ - break; \ - case 128: \ - RUN_DEQUANT(128); \ - break; \ - case 256: \ - RUN_DEQUANT(256); \ - break; \ - default: \ - break; \ - } \ - } while(false) + +#define RUN_DEQUANT(QGROUPSIZE) \ + do { \ + switch (innerKTiles) { \ + case 2: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + case 4: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + case 8: \ + _dequantize_int4_kernel \ + <<>>(packed_w_data_ptr, out_data_ptr, \ + scales_and_zeros_data_ptr); \ + break; \ + default: \ + break; \ + } \ + } while (false) + +#define DISPATCH_Q_GROUP() \ + do { \ + switch (group_size) { \ + case 32: \ + RUN_DEQUANT(32); \ + break; \ + case 64: \ + RUN_DEQUANT(64); \ + break; \ + case 128: \ + RUN_DEQUANT(128); \ + break; \ + case 256: \ + RUN_DEQUANT(256); \ + break; \ + default: \ + break; \ + } \ + } while (false) DISPATCH_Q_GROUP(); - #undef DISPATCH_Q_GROUP - #undef RUN_DEQUANT +#undef DISPATCH_Q_GROUP +#undef RUN_DEQUANT return out; } // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -at::Tensor _unpack_tensor_core_tiled_layout( - const at::Tensor& packed_w, - int64_t innerKTiles) -{ +// scales_and_zeros is [numQGroups][n][2] +// qGroupSize is 32, 64, 128 or 256 +at::Tensor +_dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, + const at::Tensor &scales_and_zeros, + int64_t group_size, int64_t innerKTiles) { + + AtenTensorHandle packed_w_ath = tensor_pointer_to_tensor_handle(&packed_w); + AtenTensorHandle scales_and_zeros_ath = + tensor_pointer_to_tensor_handle(&scales_and_zeros); + + AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + return ath_res; // tensor_handle_to_tensor_pointer(ath_res); +} + +// output is [n][k] (int32 dtype) +// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] +at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, + int64_t innerKTiles) { c10::cuda::CUDAGuard g(packed_w.device()); @@ -283,30 +338,40 @@ at::Tensor _unpack_tensor_core_tiled_layout( auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); auto out = at::empty( - {N, K}, - at::TensorOptions().dtype(at::kInt).device(packed_w.device())); + {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); if (innerKTiles == 2) { - _dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); - } - else if (innerKTiles == 4) { - 
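// [Editorial note] InnerKTiles is a compile-time template parameter of
// _dequantize_int4_kernel, so each supported value (2, 4, 8) needs its own
// instantiation; that is why this function uses an explicit if/else ladder and
// the dequantize path above dispatches through the RUN_DEQUANT switch.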
_dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); + } else if (innerKTiles == 4) { + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); } else if (innerKTiles == 8) { - _dequantize_int4_kernel<<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + _dequantize_int4_kernel + <<>>( + packed_w.packed_accessor32(), + out.packed_accessor32()); } return out; } +// // Use when all tensors arguments accept one (normal) batch dim. +// // This batching rule expands the batch dim on all Tensors, reshapes it into +// // dim 0, calls the op, and then reshapes the batch dim out of dim 0. +// // This is not the most efficient thing; if there are alternatives, plese try +// // to use them. Use this only as a last resort. +// #define EXISTING_BDIM_ALL_BOXED(op) \ +// m.impl(#op, +// torch::CppFunction::makeFromBoxedFunction()); + TORCH_LIBRARY_IMPL(torchao, CUDA, m) { m.impl("torchao::unpack_tensor_core_tiled_layout", &_unpack_tensor_core_tiled_layout); m.impl("torchao::dequantize_tensor_core_tiled_layout", &_dequantize_tensor_core_tiled_layout); diff --git a/torchao/csrc/ignore_this.txt b/torchao/csrc/ignore_this.txt new file mode 100644 index 0000000000..ef8544f6e9 --- /dev/null +++ b/torchao/csrc/ignore_this.txt @@ -0,0 +1,18 @@ +#include + +int main() { + PyObject *list = PyList_New(2); + + for (auto i = 0; i < 2; ++i) { + PyObject *anint = PyLong_FromLong(i + 1); + PyList_SET_ITEM(list, i, anint); + } + + auto a = PyList_GetItemRef(list, 0); + + return 0; +} + +int r = main(); + +// auto lol = torch::getTHPLayout(static_cast(1)); From 4d0cebf22378118896a9aa592fb9d9e0d6e8dd1f Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Jan 2025 14:08:27 -0800 Subject: [PATCH 02/12] Cleaned up PoC --- .../tensor_core_tiled_layout.cu | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index bfca09edcf..e00afdff85 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -3,8 +3,10 @@ // #include // #include #include +#include #include #include +#include #include #include #include @@ -165,14 +167,18 @@ __global__ void _dequantize_int4_kernel( } } -at::Tensor _ATH_dequantize_tensor_core_tiled_layout( +AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, int64_t group_size, int64_t innerKTiles) { constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; - c10::cuda::CUDAGuard g(packed_w.device()); + int32_t packed_w_device_index; + aoti_torch_get_device_index(packed_w, &packed_w_device_index); + + // c10::cuda::CUDAGuard g(packed_w.device()); + c10::cuda::CUDAGuard g(packed_w_device_index); // packed_w preconditions int64_t packed_w_dim; @@ -181,7 +187,7 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( int32_t packed_w_dtype; aoti_torch_get_dtype(packed_w, &packed_w_dtype); - TORCH_CHECK(packed_w_dtype == at::kInt); + TORCH_CHECK(packed_w_dtype == static_cast(at::kInt)); // is_contiguous not existent today // TORCH_CHECK(packed_w.is_contiguous()); @@ -195,7 +201,7 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( TORCH_CHECK(packed_w_dim_3_size == 
innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - auto numQGroups; + int64_t numQGroups; aoti_torch_get_size(scales_and_zeros, 0, &numQGroups); int64_t packed_w_dim_0_size; @@ -224,20 +230,58 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( auto nTiles = divUp(N, kNTileSize); auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); - auto out = at::empty( - {N, K}, - at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); + + auto bf16 = aoti_torch_dtype_bfloat16(); + + AtenTensorHandle out; + int64_t out_sizes[] = {N, K}; + int64_t out_strides[] = {K, 1}; + int32_t packed_w_device_type; + aoti_torch_get_device_type(packed_w, &packed_w_device_type); + aoti_torch_empty_strided(2, out_sizes, out_strides, bf16, + packed_w_device_type, packed_w_device_index, &out); + + // auto out = at::empty( + // {N, K}, + // at::TensorOptions().dtype(at::kBFloat16).device(packed_w.device())); // gotta swap this to be an AtenTensorHandle auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); void *packed_w_data_ptr; - void *scales_and_zeros_data_ptr; - void *out_data_ptr; + int64_t *packed_w_sizes; + int64_t *packed_w_strides; aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_sizes(packed_w, &packed_w_sizes); + aoti_torch_get_strides(packed_w, &packed_w_strides); + at::GenericPackedTensorAccessor + packed_w_pta32( + static_cast::PtrType>( + packed_w_data_ptr), + packed_w_sizes, packed_w_strides); + + void *scales_and_zeros_data_ptr; + int64_t *scales_and_zeros_sizes; + int64_t *scales_and_zeros_strides; aoti_torch_get_data_ptr(scales_and_zeros, &scales_and_zeros_data_ptr); + aoti_torch_get_sizes(scales_and_zeros, &scales_and_zeros_sizes); + aoti_torch_get_strides(scales_and_zeros, &scales_and_zeros_strides); + at::GenericPackedTensorAccessor + scales_and_zeros_pta32( + static_cast::PtrType>( + scales_and_zeros_data_ptr), + scales_and_zeros_sizes, scales_and_zeros_strides); + + void *out_data_ptr; aoti_torch_get_data_ptr(out, &out_data_ptr); + at::GenericPackedTensorAccessor + out_pta32( + static_cast::PtrType>( + out_data_ptr), + out_sizes, out_strides); // packed_w.packed_accessor32(), \ out.packed_accessor32(), \ @@ -248,18 +292,18 @@ at::Tensor _ATH_dequantize_tensor_core_tiled_layout( switch (innerKTiles) { \ case 2: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ case 4: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ case 8: \ _dequantize_int4_kernel \ - <<>>(packed_w_data_ptr, out_data_ptr, \ - scales_and_zeros_data_ptr); \ + <<>>(packed_w_pta32, out_pta32, \ + scales_and_zeros_pta32); \ break; \ default: \ break; \ @@ -302,14 +346,15 @@ _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, const at::Tensor &scales_and_zeros, int64_t group_size, int64_t innerKTiles) { - AtenTensorHandle packed_w_ath = tensor_pointer_to_tensor_handle(&packed_w); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); AtenTensorHandle scales_and_zeros_ath = - tensor_pointer_to_tensor_handle(&scales_and_zeros); + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( packed_w_ath, scales_and_zeros_ath, group_size, 
innerKTiles); - return ath_res; // tensor_handle_to_tensor_pointer(ath_res); + return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); } // output is [n][k] (int32 dtype) From abdae1e3d5576816c9394c53297d5235f54cb617 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 16 Jan 2025 14:12:56 -0800 Subject: [PATCH 03/12] Ignore ignore_this --- .../tensor_core_tiled_layout.cu | 6 +----- torchao/csrc/ignore_this.txt | 18 ------------------ 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 torchao/csrc/ignore_this.txt diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index e00afdff85..f271448368 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -408,11 +408,7 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -// // Use when all tensors arguments accept one (normal) batch dim. -// // This batching rule expands the batch dim on all Tensors, reshapes it into -// // dim 0, calls the op, and then reshapes the batch dim out of dim 0. -// // This is not the most efficient thing; if there are alternatives, plese try -// // to use them. Use this only as a last resort. +// The following example will be useful when we do registration // #define EXISTING_BDIM_ALL_BOXED(op) \ // m.impl(#op, // torch::CppFunction::makeFromBoxedFunction()); diff --git a/torchao/csrc/ignore_this.txt b/torchao/csrc/ignore_this.txt deleted file mode 100644 index ef8544f6e9..0000000000 --- a/torchao/csrc/ignore_this.txt +++ /dev/null @@ -1,18 +0,0 @@ -#include - -int main() { - PyObject *list = PyList_New(2); - - for (auto i = 0; i < 2; ++i) { - PyObject *anint = PyLong_FromLong(i + 1); - PyList_SET_ITEM(list, i, anint); - } - - auto a = PyList_GetItemRef(list, 0); - - return 0; -} - -int r = main(); - -// auto lol = torch::getTHPLayout(static_cast(1)); From df85d15a9f11a374b997d94a767f889f0753df75 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 17 Jan 2025 10:24:38 -0800 Subject: [PATCH 04/12] add mock registration prototype --- .../tensor_core_tiled_layout.cu | 270 ++++++++++++++---- 1 file changed, 217 insertions(+), 53 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index f271448368..98021759bf 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -4,6 +4,8 @@ // #include #include #include +#include +#include #include #include #include @@ -19,18 +21,18 @@ constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { } constexpr int32_t kWarpSize = 32; -// Simple data structure to represent 4 pairs of bfloat16s, used for vectorized -// dequantization -// https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 -struct __align__(16) bf16x2x4 { __nv_bfloat162 vals[4]; }; +//Simple data structure to represent 4 pairs of bfloat16s, used for vectorized dequantization +//https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L178-L180 +struct __align__(16) bf16x2x4 { + __nv_bfloat162 vals[4]; +}; -// Copied from -// 
https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 +//Copied from https://github.com/pytorch/pytorch/blob/b6689e0fb83a1578959ab0d9c6d2d9e11f7df21a/aten/src/ATen/native/cuda/int4mm.cu#L195C1-L241C1 inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { bf16x2x4 result; constexpr int kElements = 8; - uint32_t *h = reinterpret_cast(&result); + uint32_t* h = reinterpret_cast(&result); uint32_t const source_i4s = source; // First, we extract the i4s and construct an intermediate fp16 number. @@ -48,10 +50,10 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { for (int ii = 1; ii < kElements / 2; ++ii) { i4s >>= 4; // or is it 8? // (i4s & 0x000f000f) | 0x43004300 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[ii]) - : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), - "n"(immLut)); + asm volatile( + "lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[ii]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); } // This is the BF16 {-136, -136} represented as an integer. @@ -77,9 +79,8 @@ template __global__ void _dequantize_int4_kernel( const at::PackedTensorAccessor32 in, at::PackedTensorAccessor32 out, - std::optional> - scales_and_zeros = std::nullopt) { + std::optional> scales_and_zeros = std::nullopt) +{ constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -93,8 +94,7 @@ __global__ void _dequantize_int4_kernel( // 8 k-tile values, 4 per m16n8k16 mma.sync operand B // int32_t ks[8]; - // Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 - // contiguous values) + //Only need 4 offsets since TC layout for single tile is 2x2 (2 pairs of 2 contiguous values) int32_t ks[4]; // Store address base offset @@ -103,9 +103,9 @@ __global__ void _dequantize_int4_kernel( // Unpack 2 k-tiles at a time since min pack size is InnerKTiles = 2 #pragma unroll for (int innerKTile = 0; innerKTile < InnerKTiles; innerKTile += 2) { - // Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 - // contiguous values Hence, we only need 4 offsets - // Offsets of innerTile0 + //Tensor-core layout for m16n8k16 is such that each tile has 2 pairs of 2 contiguous values + //Hence, we only need 4 offsets + // Offsets of innerTile0 auto kBase0 = (kOuterTile * InnerKTiles + innerKTile) * kKTileSize; ks[0] = kBase0 + (t % 4) * 2; ks[1] = ks[0] + 8; @@ -118,35 +118,30 @@ __global__ void _dequantize_int4_kernel( // inner k-tiles unpack two at a time int32_t pack = in[nTile][kOuterTile][t][innerKTile / 2]; - if constexpr (kDequant) { - // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be - // set when dequantizing"); - static_assert(std::is_same::value, - "Out must be BFloat16 when dequantizing"); + if constexpr(kDequant) { + // static_assert(scales_and_zeros.has_value(), "scales_and_zeros must be set when dequantizing"); + static_assert(std::is_same::value, "Out must be BFloat16 when dequantizing"); // __nv_bfloat16 v[8]; - // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then - // convert to bfloat16 + // // Extract u4, convert to s4 by subtracting by 2 ** nbits / 2, then convert to bfloat16 bf16x2x4 v_bf16x2x4 = convert_i4x8_to_bf16x2x4(pack); // All b values within a 16x16 tile should fall within the same q group // Hence we load 1 scale and zero per loop - int qgroup = ks[0] / groupSize; - const __nv_bfloat16 *pSZ = reinterpret_cast( - &scales_and_zeros.value()[qgroup][n0][0]); + int qgroup = 
ks[0] / groupSize; + const __nv_bfloat16 *pSZ = reinterpret_cast(&scales_and_zeros.value()[qgroup][n0][0]); // Vectorize scales and zeros __nv_bfloat162 scale2 = __bfloat162bfloat162(pSZ[0]); __nv_bfloat162 zero2 = __bfloat162bfloat162(pSZ[1]); -#pragma unroll + #pragma unroll for (int i = 0; i < 4; i++) { - reinterpret_cast<__nv_bfloat162 *>(&pOut[ks[i]])[0] = - __hfma2(v_bf16x2x4.vals[i], scale2, zero2); + reinterpret_cast<__nv_bfloat162*>(&pOut[ks[i]])[0] = __hfma2(v_bf16x2x4.vals[i], scale2, zero2); } - } else { - static_assert(std::is_same::value, - "Out must be int32_t when unpacking to int"); + } + else { + static_assert(std::is_same::value, "Out must be int32_t when unpacking to int"); int32_t v[8]; v[0] = pack & 0x0000000f; @@ -157,9 +152,9 @@ __global__ void _dequantize_int4_kernel( v[3] = (pack >> 20) & 0x0000000f; v[5] = (pack >> 24) & 0x0000000f; v[7] = (pack >> 28) & 0x0000000f; - int2 *v_i32x2 = reinterpret_cast(v); + int2* v_i32x2 = reinterpret_cast(v); -#pragma unroll + #pragma unroll for (int i = 0; i < 4; ++i) { reinterpret_cast(&pOut[ks[i]])[0] = v_i32x2[i]; } @@ -341,22 +336,107 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] // scales_and_zeros is [numQGroups][n][2] // qGroupSize is 32, 64, 128 or 256 -at::Tensor -_dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, - const at::Tensor &scales_and_zeros, - int64_t group_size, int64_t innerKTiles) { - - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); +// at::Tensor +// _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, +// const at::Tensor &scales_and_zeros, +// int64_t group_size, int64_t innerKTiles) { + +// AtenTensorHandle packed_w_ath = +// torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); +// AtenTensorHandle scales_and_zeros_ath = +// torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + +// AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( +// packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + +// return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); +// } + +void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = *reinterpret_cast(stack[3]); + int64_t group_size = *reinterpret_cast(stack[2]); AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; } +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! 
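// [Editorial note, hedged] The translation sketched below is a two-level boxing
// scheme: the dispatcher hands this wrapper a torch::jit::Stack of IValues
// (not ABI-stable); the wrapper lowers every argument into one void* slot of a
// malloc'd "ministack" (int64_t scalars and opaque AtenTensorHandles), and the
// *_ATH_* entry point only ever sees
//   void fn(void** stack, int64_t num_args, int64_t num_outputs)
// writing its result(s) into stack[num_args .. num_args + num_outputs).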
+ int64_t innerKTiles = torch::jit::pop(stack).toInt(); + int64_t group_size = torch::jit::pop(stack).toInt(); + const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + AtenTensorHandle scales_and_zeros_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + + int64_t num_args = 4; + int64_t num_outputs = 1; + void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); + ministack[3] = reinterpret_cast(&innerKTiles); + ministack[2] = reinterpret_cast(&group_size); + ministack[1] = reinterpret_cast(scales_and_zeros_ath); + ministack[0] = reinterpret_cast(packed_w_ath); + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, + num_outputs); + + // now read the output from the end of the stack and wrap that back into + // IValue from void*? + + AtenTensorHandle out_ath = + reinterpret_cast(ministack[num_args]); + + free(ministack); + at::Tensor out = + *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + torch::jit::push(stack, c10::IValue(out)); + + // so above is our stack of IValues, but we cannot have these IValues because + // they are NOT ABI stable! So we need another version of "boxed" with void*s. + // and that is what is going to happen below + + // what the old function used to be: + // int64_t innerKTiles = torch::jit::pop(stack).toInt(); + // int64_t group_size = torch::jit::pop(stack).toInt(); + // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + + // AtenTensorHandle packed_w_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + // AtenTensorHandle scales_and_zeros_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + + // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + // at::Tensor out = + // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + // torch::jit::push(stack, c10::IValue(out)); +} + + // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, @@ -408,15 +488,99 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -// The following example will be useful when we do registration -// #define EXISTING_BDIM_ALL_BOXED(op) \ -// m.impl(#op, -// torch::CppFunction::makeFromBoxedFunction()); +void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = *reinterpret_cast(stack[3]); + int64_t group_size = *reinterpret_cast(stack[2]); + AtenTensorHandle scales_and_zeros_ath = + reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - m.impl("torchao::unpack_tensor_core_tiled_layout", &_unpack_tensor_core_tiled_layout); - 
m.impl("torchao::dequantize_tensor_core_tiled_layout", &_dequantize_tensor_core_tiled_layout); + AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; +} + +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + int64_t innerKTiles = torch::jit::pop(stack).toInt(); + int64_t group_size = torch::jit::pop(stack).toInt(); + const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + AtenTensorHandle scales_and_zeros_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + AtenTensorHandle packed_w_ath = + torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + + int64_t num_args = 4; + int64_t num_outputs = 1; + void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); + ministack[3] = reinterpret_cast(&innerKTiles); + ministack[2] = reinterpret_cast(&group_size); + ministack[1] = reinterpret_cast(scales_and_zeros_ath); + ministack[0] = reinterpret_cast(packed_w_ath); + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, + num_outputs); + + // now read the output from the end of the stack and wrap that back into + // IValue from void*? + + AtenTensorHandle out_ath = + reinterpret_cast(ministack[num_args]); + + free(ministack); + at::Tensor out = + *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + torch::jit::push(stack, c10::IValue(out)); + + // so above is our stack of IValues, but we cannot have these IValues because + // they are NOT ABI stable! So we need another version of "boxed" with void*s. 
+ // and that is what is going to happen below + + // what the old function used to be: + // int64_t innerKTiles = torch::jit::pop(stack).toInt(); + // int64_t group_size = torch::jit::pop(stack).toInt(); + // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); + // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); + + // AtenTensorHandle packed_w_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); + // AtenTensorHandle scales_and_zeros_ath = + // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); + + // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( + // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + + // at::Tensor out = + // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); + // torch::jit::push(stack, c10::IValue(out)); +} + + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + m.impl("torchao::unpack_tensor_core_tiled_layout", + &_unpack_tensor_core_tiled_layout); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); } #endif From 4120a8b922bcb2674da20cf3627f0acd4a9c02b1 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 21 Jan 2025 12:59:56 -0800 Subject: [PATCH 05/12] there is a diff between IValue blah and IValue& blah --- .../tensor_core_tiled_layout.cu | 142 +++++------------- 1 file changed, 36 insertions(+), 106 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 98021759bf..994374910d 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -358,8 +358,10 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, // here, void* is my StableIValue // function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack - int64_t innerKTiles = *reinterpret_cast(stack[3]); - int64_t group_size = *reinterpret_cast(stack[2]); + int64_t innerKTiles = reinterpret_cast(stack[3]); + int64_t group_size = reinterpret_cast(stack[2]); + TORCH_WARN(innerKTiles); + TORCH_WARN(group_size); AtenTensorHandle scales_and_zeros_ath = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); @@ -380,37 +382,49 @@ void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, // function pt1 here should take in IValues, pass a malloc'd stack into the // second function // need a translation from IValues to ATH to void*s! 
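// [Editorial note, hedged] Two lifetime points in the rewrite below: (1) integer
// arguments are packed by value, reinterpret_cast<void*>(arg.toInt()) on the
// way in and reinterpret_cast<int64_t>(stack[idx]) on the way out, so no
// pointer to a caller stack local crosses the boundary; (2) arguments are
// peek()'d as const c10::IValue& rather than pop()'d, so their Tensors stay
// owned by the dispatcher stack until torch::jit::drop() runs after the call.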
- int64_t innerKTiles = torch::jit::pop(stack).toInt(); - int64_t group_size = torch::jit::pop(stack).toInt(); - const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - - int64_t num_args = 4; - int64_t num_outputs = 1; - void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); - ministack[3] = reinterpret_cast(&innerKTiles); - ministack[2] = reinterpret_cast(&group_size); - ministack[1] = reinterpret_cast(scales_and_zeros_ath); - ministack[0] = reinterpret_cast(packed_w_ath); + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + TORCH_CHECK(num_arguments==4); + TORCH_CHECK(num_returns==1); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + TORCH_WARN(idx); + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + TORCH_WARN("am tensor!") + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not handled!"); + } + } + TORCH_WARN("done with forloop no problems!") // second function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, - num_outputs); + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); // now read the output from the end of the stack and wrap that back into // IValue from void*? AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_args]); - + reinterpret_cast(ministack[num_arguments]); + free(ministack); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); + + // now pop everything. 
if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); torch::jit::push(stack, c10::IValue(out)); // so above is our stack of IValues, but we cannot have these IValues because @@ -488,90 +502,6 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, return out; } -void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, - int64_t num_args, - int64_t num_outputs) { - // here, void* is my StableIValue - // function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - int64_t innerKTiles = *reinterpret_cast(stack[3]); - int64_t group_size = *reinterpret_cast(stack[2]); - AtenTensorHandle scales_and_zeros_ath = - reinterpret_cast(stack[1]); - AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); - - AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - - void *out = reinterpret_cast(ath_res); - stack[num_args] = out; -} - -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - int64_t innerKTiles = torch::jit::pop(stack).toInt(); - int64_t group_size = torch::jit::pop(stack).toInt(); - const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - AtenTensorHandle scales_and_zeros_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - AtenTensorHandle packed_w_ath = - torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - - int64_t num_args = 4; - int64_t num_outputs = 1; - void **ministack = (void**)malloc((num_args + num_outputs) * sizeof(void *)); - ministack[3] = reinterpret_cast(&innerKTiles); - ministack[2] = reinterpret_cast(&group_size); - ministack[1] = reinterpret_cast(scales_and_zeros_ath); - ministack[0] = reinterpret_cast(packed_w_ath); - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_args, - num_outputs); - - // now read the output from the end of the stack and wrap that back into - // IValue from void*? - - AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_args]); - - free(ministack); - at::Tensor out = - *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); - torch::jit::push(stack, c10::IValue(out)); - - // so above is our stack of IValues, but we cannot have these IValues because - // they are NOT ABI stable! So we need another version of "boxed" with void*s. 
- // and that is what is going to happen below - - // what the old function used to be: - // int64_t innerKTiles = torch::jit::pop(stack).toInt(); - // int64_t group_size = torch::jit::pop(stack).toInt(); - // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - - // AtenTensorHandle packed_w_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - // AtenTensorHandle scales_and_zeros_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - - // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - - // at::Tensor out = - // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); - // torch::jit::push(stack, c10::IValue(out)); -} - TORCH_LIBRARY_IMPL(torchao, CUDA, m) { m.impl("torchao::unpack_tensor_core_tiled_layout", From a1944ee20d53f829ccadcc2492d5e4172c39dc2f Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Wed, 29 Jan 2025 11:32:05 -0800 Subject: [PATCH 06/12] Clean up code, finish other end of void* boxed kernel --- .../tensor_core_tiled_layout.cu | 79 ++++--------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 994374910d..39acdaf4eb 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,7 +1,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -// #include -// #include +#include +#include #include #include #include @@ -332,25 +332,6 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( return out; } -// output is [n][k] (int32 dtype) -// input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -// scales_and_zeros is [numQGroups][n][2] -// qGroupSize is 32, 64, 128 or 256 -// at::Tensor -// _dequantize_tensor_core_tiled_layout(const at::Tensor &packed_w, -// const at::Tensor &scales_and_zeros, -// int64_t group_size, int64_t innerKTiles) { - -// AtenTensorHandle packed_w_ath = -// torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); -// AtenTensorHandle scales_and_zeros_ath = -// torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - -// AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( -// packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); - -// return *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); -// } void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, int64_t num_args, @@ -360,8 +341,6 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, // schema values for now, and run the function and modify the void* stack int64_t innerKTiles = reinterpret_cast(stack[3]); int64_t group_size = reinterpret_cast(stack[2]); - TORCH_WARN(innerKTiles); - TORCH_WARN(group_size); AtenTensorHandle scales_and_zeros_ath = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); @@ -386,68 +365,44 @@ void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); const auto num_arguments = schema.arguments().size(); - TORCH_CHECK(num_arguments==4); - TORCH_CHECK(num_returns==1); void **ministack = 
(void**)malloc((num_arguments + num_returns) * sizeof(void *)); for (auto idx = 0; idx < num_arguments; idx++) { - TORCH_WARN(idx); const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); if (arg.isInt()) { ministack[idx] = reinterpret_cast(arg.toInt()); } else if (arg.isTensor()) { - TORCH_WARN("am tensor!") const at::Tensor& tensor = arg.toTensor(); AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); ministack[idx] = reinterpret_cast(ath); } else { - TORCH_CHECK(false, "Other types of IValues not handled!"); + TORCH_CHECK(false, "Other types of IValues not yet handled!"); } } - TORCH_WARN("done with forloop no problems!") // second function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, num_returns); - // now read the output from the end of the stack and wrap that back into - // IValue from void*? - - AtenTensorHandle out_ath = - reinterpret_cast(ministack[num_arguments]); - - free(ministack); - - at::Tensor out = - *torch::aot_inductor::tensor_handle_to_tensor_pointer(out_ath); - - // now pop everything. if we pop earlier, Tensors would go out of scope + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope // before calling the function torch::jit::drop(stack, num_arguments); - torch::jit::push(stack, c10::IValue(out)); - - // so above is our stack of IValues, but we cannot have these IValues because - // they are NOT ABI stable! So we need another version of "boxed" with void*s. - // and that is what is going to happen below - - // what the old function used to be: - // int64_t innerKTiles = torch::jit::pop(stack).toInt(); - // int64_t group_size = torch::jit::pop(stack).toInt(); - // const at::Tensor &scales_and_zeros = torch::jit::pop(stack).toTensor(); - // const at::Tensor &packed_w = torch::jit::pop(stack).toTensor(); - // AtenTensorHandle packed_w_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&packed_w); - // AtenTensorHandle scales_and_zeros_ath = - // torch::aot_inductor::tensor_pointer_to_tensor_handle(&scales_and_zeros); - - // AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout( - // packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles); + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
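// [Editorial note, hedged] Return convention, mirroring the input side: the
// callee leaves one void* per declared return at ministack[num_arguments + i];
// each slot is reinterpreted as an AtenTensorHandle and rewrapped as an owning
// at::Tensor IValue, driven by the schema's return types (only Tensor returns
// are handled so far).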
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } - // at::Tensor out = - // *torch::aot_inductor::tensor_handle_to_tensor_pointer(ath_res); - // torch::jit::push(stack, c10::IValue(out)); + free(ministack); } From 672aeec65ae24ba00900dc7e16c3d8b18cc04055 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 30 Jan 2025 14:12:01 -0800 Subject: [PATCH 07/12] [skip ci] saving work on registration --- .../tensor_core_tiled_layout/libtorch.cpp | 133 +++++++++++++ .../cuda/tensor_core_tiled_layout/libtorch.h | 90 +++++++++ .../tensor_core_tiled_layout.cu | 184 +++++++++--------- 3 files changed, 316 insertions(+), 91 deletions(-) create mode 100644 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp create mode 100644 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp new file mode 100644 index 0000000000..42dd6f947b --- /dev/null +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -0,0 +1,133 @@ +// in this file, we will implement the stuff in libtorch.h, +// and we are allowed to call unstable stuff from pytorch! + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
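// [Editorial note, hedged] Same schema-driven unwrapping as in the .cu wrapper.
// The two boxed_* functions in this file differ only in which
// void(void**, int64_t, int64_t) kernel they forward to, which is the piece
// the StableLibrary / STABLE_TORCH_LIBRARY_IMPL sketch in libtorch.h appears
// intended to factor out.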
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + + +void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + // m.impl("torchao::unpack_tensor_core_tiled_layout", + // &_unpack_tensor_core_tiled_layout); + m.impl("torchao::unpack_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_unpack_tensor_core_tiled_layout>()); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); +} diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h new file mode 100644 index 0000000000..9fd7636d04 --- /dev/null +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -0,0 +1,90 @@ +// this file can only have stable stuff! 
Akin to shim.h + +#include +#include // used for C10_UID, verified to be header-only +#include // used for DispatchKey, enum verified to be header-only +#include + +class StableLibrary final { + public: + // a pointer to a real Library + // a kind + enum Kind { + DEF, // from TORCH_LIBRARY (no qualifier) + IMPL, + FRAGMENT, + }; + + + // constructor + + +}; + + +// _def function ==> IGNORE LIBRARY + just call these + +// stable_impl function (that takes in a string and a void** function pointer) +// _impl doesn't really need a Library object, try to avoid it for now +// just copy its implementation +// it'll give u a handle that needs to be kept alive, just assign to global for now + + +class StableTorchLibraryInit final { + private: + using InitFn = void(StableLibrary&); + StableLibrary lib_; + + public: + StableTorchLibraryInit( + StableLibrary::Kind kind, + InitFn* fn, + const char* ns, + std::optional k, + const char* file, + uint32_t line) + : lib_(kind, ns, k, file, line) { + fn(lib_); + } +}; + + +#define STABLE_TORCH_LIBRARY_IMPL(ns, k, m) _STABLE_TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) + +#define _STABLE_TORCH_LIBRARY_IMPL(ns, k, m, uid) \ + static void C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(StableLibrary&); \ + static const StableTorchLibraryInit C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \ + StableLibrary::IMPL, \ + &C10_CONCATENATE(STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid), \ + #ns, \ + std::make_optional(c10::DispatchKey::k), \ + __FILE__, \ + __LINE__); \ + void C10_CONCATENATE( \ + STABLE_TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(StableLibrary & m) + + + + + + +// #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) + +// #define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ +// static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ +// static const torch::detail::TorchLibraryInit \ +// TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ +// torch::Library::IMPL, \ +// (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ +// ? 
&TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ +// : [](torch::Library&) -> void {}), \ +// torchao, \ +// std::make_optional(c10::DispatchKey::CUDA), \ +// __FILE__, \ +// __LINE__); \ +// TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { + +// } + diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index 39acdaf4eb..f4c252ab37 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,17 +1,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include -#include template constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { @@ -72,6 +62,7 @@ inline __device__ bf16x2x4 convert_i4x8_to_bf16x2x4(uint32_t source) { return result; } + // in size [ceil(n / 8)][ceil(k / (InnerKTiles * 16))][32][InnerKTiles / 2] // scales_and_zeros size [numQGroups][n][2] // out size [n][k] @@ -162,6 +153,15 @@ __global__ void _dequantize_int4_kernel( } } + + +// The following function should eventually be libtorch agnostic +// Blockers from that goal today: +// - TORCH_CHECK +// - c10::cuda::CUDAGuard +// - at::cuda::getCurrentCUDAStream(); +// - at::GenericPackedTensorAccessor +// - at::RestrictPtrTraits AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout( const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros, int64_t group_size, int64_t innerKTiles) { @@ -352,77 +352,45 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack, stack[num_args] = out; } -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } - } - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); -} - // output is [n][k] (int32 dtype) // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2] -at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, +AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle packed_w, int64_t innerKTiles) { - c10::cuda::CUDAGuard g(packed_w.device()); + int32_t packed_w_device_index; + aoti_torch_get_device_index(packed_w, &packed_w_device_index); + + // c10::cuda::CUDAGuard g(packed_w.device()); + c10::cuda::CUDAGuard g(packed_w_device_index); + + int64_t packed_w_dim; + aoti_torch_get_dim(packed_w, &packed_w_dim); + TORCH_CHECK(packed_w_dim == 4); + + int32_t packed_w_dtype; + aoti_torch_get_dtype(packed_w, &packed_w_dtype); + TORCH_CHECK(packed_w_dtype == static_cast(at::kInt)); - TORCH_CHECK(packed_w.dim() == 4); - TORCH_CHECK(packed_w.dtype() == at::kInt); - TORCH_CHECK(packed_w.is_contiguous()); + // is_contiguous not existent today + // TORCH_CHECK(packed_w.is_contiguous()); - TORCH_CHECK(packed_w.size(2) == 32); - TORCH_CHECK(packed_w.size(3) == innerKTiles / 2); + int64_t packed_w_dim_2_size; + aoti_torch_get_size(packed_w, 2, &packed_w_dim_2_size); + TORCH_CHECK(packed_w_dim_2_size == 32); + + int64_t packed_w_dim_3_size; + aoti_torch_get_size(packed_w, 3, &packed_w_dim_3_size); + TORCH_CHECK(packed_w_dim_3_size == innerKTiles / 2); TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8); - int N = packed_w.size(0) * 8; - int K = packed_w.size(1) * innerKTiles * 16; + int64_t packed_w_dim_0_size; + aoti_torch_get_size(packed_w, 0, &packed_w_dim_0_size); + int N = packed_w_dim_0_size * 8; + + int64_t packed_w_dim_1_size; + aoti_torch_get_size(packed_w, 1, &packed_w_dim_1_size); + int K = packed_w_dim_1_size * innerKTiles * 16; constexpr int32_t kNTileSize = 8; constexpr int32_t kKTileSize = 16; @@ -431,41 +399,75 @@ at::Tensor _unpack_tensor_core_tiled_layout(const at::Tensor &packed_w, auto kSuperTiles = divUp(K, innerKTiles * kKTileSize); - auto out = at::empty( - {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); + AtenTensorHandle out; + int64_t out_sizes[] = {N, K}; + int64_t out_strides[] = {K, 1}; + auto kInt = aoti_torch_dtype_int32(); + int32_t packed_w_device_type; + aoti_torch_get_device_type(packed_w, &packed_w_device_type); + aoti_torch_empty_strided(2, out_sizes, out_strides, kInt, packed_w_device_type, packed_w_device_index, &out); + // auto out = at::empty( + // {N, K}, at::TensorOptions().dtype(at::kInt).device(packed_w.device())); auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); + void *packed_w_data_ptr; + int64_t *packed_w_sizes; + int64_t *packed_w_strides; + aoti_torch_get_data_ptr(packed_w, &packed_w_data_ptr); + aoti_torch_get_sizes(packed_w, &packed_w_sizes); + aoti_torch_get_strides(packed_w, &packed_w_strides); + at::GenericPackedTensorAccessor + packed_w_pta32( + static_cast::PtrType>( + packed_w_data_ptr), + packed_w_sizes, packed_w_strides); + + void *out_data_ptr; + aoti_torch_get_data_ptr(out, &out_data_ptr); + 
at::GenericPackedTensorAccessor + out_pta32( + static_cast::PtrType>( + out_data_ptr), + out_sizes, out_strides); + if (innerKTiles == 2) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); + // packed_w.packed_accessor32(), + // out.packed_accessor32() } else if (innerKTiles == 4) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); } else if (innerKTiles == 8) { _dequantize_int4_kernel - <<>>( - packed_w.packed_accessor32(), - out.packed_accessor32()); + <<>>(packed_w_pta32, out_pta32); } return out; } +void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack, + int64_t num_args, + int64_t num_outputs) { + // here, void* is my StableIValue + // function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + int64_t innerKTiles = reinterpret_cast(stack[1]); + AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); + + AtenTensorHandle ath_res = _ATH_unpack_tensor_core_tiled_layout( + packed_w_ath, innerKTiles); + + void *out = reinterpret_cast(ath_res); + stack[num_args] = out; +} -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - m.impl("torchao::unpack_tensor_core_tiled_layout", - &_unpack_tensor_core_tiled_layout); - // m.impl("torchao::dequantize_tensor_core_tiled_layout", - // &_dequantize_tensor_core_tiled_layout); - m.impl("torchao::dequantize_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_dequantize_tensor_core_tiled_layout>()); +STABLE_TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + m.impl("torchao::unpack_tensor_core_tiled_layout", &voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", &voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout); } #endif From 97a922048b9c61bda4440cccde6633f60a7bcf96 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 31 Jan 2025 15:00:02 -0800 Subject: [PATCH 08/12] [skip ci] This definitely does not compile --- .../tensor_core_tiled_layout/libtorch.cpp | 330 ++++++++++++------ .../cuda/tensor_core_tiled_layout/libtorch.h | 43 ++- 2 files changed, 252 insertions(+), 121 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 42dd6f947b..50fcf97db4 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -15,119 +15,231 @@ #include -// step 1: from here, call the ATH func -// step 2: make ATH func also boxed and call it -// step 3: move abstract code to libtorch -void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! 
- - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// // step 1: from here, call the ATH func +// // step 2: make ATH func also boxed and call it +// // step 3: move abstract code to libtorch +// void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, +// torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! + +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? +// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + + +// void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, +// torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! 
+ +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? +// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + +// void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { + +// // function pt1 here should take in IValues, pass a malloc'd stack into the +// // second function +// // need a translation from IValues to ATH to void*s! + +// const auto& schema = op.schema(); +// const auto num_returns = schema.returns().size(); +// const auto num_arguments = schema.arguments().size(); +// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + +// for (auto idx = 0; idx < num_arguments; idx++) { +// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); +// if (arg.isInt()) { +// ministack[idx] = reinterpret_cast(arg.toInt()); +// } else if (arg.isTensor()) { +// const at::Tensor& tensor = arg.toTensor(); +// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); +// ministack[idx] = reinterpret_cast(ath); +// } else { +// TORCH_CHECK(false, "Other types of IValues not yet handled!"); +// } +// } + +// // second function is going to take a stack of void*, cast them to our +// // schema values for now, and run the function and modify the void* stack +// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, +// num_returns); + +// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope +// // before calling the function +// torch::jit::drop(stack, num_arguments); + +// // read the output from the end of the stack and wrap that back into +// // IValue from void*? 
+// for (auto idx = 0; idx < num_returns; idx++) { +// const c10::TypePtr& ret_type = schema.returns()[idx].type(); +// if (*ret_type == *c10::getTypePtr()) { +// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); +// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); +// torch::jit::push(stack, c10::IValue(out)); +// } else { +// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); +// } +// } + +// free(ministack); +// } + +// TORCH_LIBRARY_IMPL(torchao, CUDA, m) { +// // m.impl("torchao::unpack_tensor_core_tiled_layout", +// // &_unpack_tensor_core_tiled_layout); +// m.impl("torchao::unpack_tensor_core_tiled_layout", +// torch::CppFunction::makeFromBoxedFunction< +// boxed_unpack_tensor_core_tiled_layout>()); +// // m.impl("torchao::dequantize_tensor_core_tiled_layout", +// // &_dequantize_tensor_core_tiled_layout); +// m.impl("torchao::dequantize_tensor_core_tiled_layout", +// torch::CppFunction::makeFromBoxedFunction< +// boxed_dequantize_tensor_core_tiled_layout>()); +// } + +class StableLibrary::TorchLibraryOpaque { +public: + TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : library_(kind, ns, k, file, line) {} +private: + torch::Library library_; // Actual Library object +}; + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(&TorchLibraryOpaque(Library::Kind::IMPL, ns, k, file, line)) {} + +StableLibrary& StableLibrary::impl(std::string name, void (*fn)(void **, int64_t, int64_t)) { + auto boxed_function = [fn](const c10::OperatorHandle &op, torch::jit::Stack *stack) { + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } } - } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
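+    // Same return handling as in the (now commented-out) per-op wrappers
+    // above, done generically for whatever fn was registered; only Tensor
+    // returns are supported so far.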
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } } - } - - free(ministack); -} - -void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, - torch::jit::Stack *stack) { - - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } + free(ministack); } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, - num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? 
- for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); -} - -TORCH_LIBRARY_IMPL(torchao, CUDA, m) { - // m.impl("torchao::unpack_tensor_core_tiled_layout", - // &_unpack_tensor_core_tiled_layout); - m.impl("torchao::unpack_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_unpack_tensor_core_tiled_layout>()); - // m.impl("torchao::dequantize_tensor_core_tiled_layout", - // &_dequantize_tensor_core_tiled_layout); - m.impl("torchao::dequantize_tensor_core_tiled_layout", - torch::CppFunction::makeFromBoxedFunction< - boxed_dequantize_tensor_core_tiled_layout>()); + this->lib_.impl(name, torch::CppFunction::makeFromBoxedFunction()); + return *this; } diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index 9fd7636d04..fb45f2b357 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -6,19 +6,38 @@ #include class StableLibrary final { + private: + class TorchLibraryOpaque; + using TorchLibraryHandle = TorchLibraryOpaque*; + TorchLibraryHandle lib_; public: - // a pointer to a real Library - // a kind - enum Kind { - DEF, // from TORCH_LIBRARY (no qualifier) - IMPL, - FRAGMENT, - }; - - - // constructor - - + // a pointer to a real Library + // a kind + enum Kind { + // DEF, // from TORCH_LIBRARY (no qualifier) + IMPL, + // FRAGMENT, + }; + + // constructor + /// \private + /// + /// Use TORCH_LIBRARY() or TORCH_LIBRARY_IMPL() instead of using these + /// constructors directly + StableLibrary( + Kind kind, + std::string ns, + std::optional k, + const char* file, + uint32_t line); + + StableLibrary(const StableLibrary&) = delete; + StableLibrary& operator=(const StableLibrary&) = delete; + StableLibrary(StableLibrary&&) = default; + StableLibrary& operator=(StableLibrary&&) = default; + ~StableLibrary() = default; + + StableLibrary& impl(std::string name, void* fn); }; From 31c2925ecb059bd268db82f02a993465be687c86 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 4 Feb 2025 10:20:33 -0800 Subject: [PATCH 09/12] Now the code compiles --- .../tensor_core_tiled_layout/libtorch.cpp | 458 ++++++++++-------- .../cuda/tensor_core_tiled_layout/libtorch.h | 40 +- .../tensor_core_tiled_layout.cu | 16 +- 3 files changed, 281 insertions(+), 233 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 50fcf97db4..8ea824bbad 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -1,11 +1,12 @@ // in this file, we will implement the stuff in libtorch.h, // and we are allowed to call unstable stuff from pytorch! 
-#include +#include "libtorch.h" #include #include #include +#include #include #include #include @@ -14,232 +15,263 @@ #include #include +#include +#include + + +/** +// step 1: from here, call the ATH func +// step 2: make ATH func also boxed and call it +// step 3: move abstract code to libtorch +void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + + +void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, + torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. 
if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { + + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! + + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + + for (auto idx = 0; idx < num_arguments; idx++) { + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, + num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
+ for (auto idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); +} + +TORCH_LIBRARY_IMPL(torchao, CUDA, m) { + // m.impl("torchao::unpack_tensor_core_tiled_layout", + // &_unpack_tensor_core_tiled_layout); + m.impl("torchao::unpack_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_unpack_tensor_core_tiled_layout>()); + // m.impl("torchao::dequantize_tensor_core_tiled_layout", + // &_dequantize_tensor_core_tiled_layout); + m.impl("torchao::dequantize_tensor_core_tiled_layout", + torch::CppFunction::makeFromBoxedFunction< + boxed_dequantize_tensor_core_tiled_layout>()); +} + +*/ -// // step 1: from here, call the ATH func -// // step 2: make ATH func also boxed and call it -// // step 3: move abstract code to libtorch -// void boxed_dequantize_tensor_core_tiled_layout(const c10::OperatorHandle &op, -// torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! - -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? -// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - - -// void boxed_unpack_tensor_core_tiled_layout(const c10::OperatorHandle &op, -// torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! 
- -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? -// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - -// void boxed_void_function(const c10::OperatorHandle &op, torch::jit::Stack *stack) { - -// // function pt1 here should take in IValues, pass a malloc'd stack into the -// // second function -// // need a translation from IValues to ATH to void*s! - -// const auto& schema = op.schema(); -// const auto num_returns = schema.returns().size(); -// const auto num_arguments = schema.arguments().size(); -// void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - -// for (auto idx = 0; idx < num_arguments; idx++) { -// const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); -// if (arg.isInt()) { -// ministack[idx] = reinterpret_cast(arg.toInt()); -// } else if (arg.isTensor()) { -// const at::Tensor& tensor = arg.toTensor(); -// AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); -// ministack[idx] = reinterpret_cast(ath); -// } else { -// TORCH_CHECK(false, "Other types of IValues not yet handled!"); -// } -// } - -// // second function is going to take a stack of void*, cast them to our -// // schema values for now, and run the function and modify the void* stack -// voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(ministack, num_arguments, -// num_returns); - -// // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope -// // before calling the function -// torch::jit::drop(stack, num_arguments); - -// // read the output from the end of the stack and wrap that back into -// // IValue from void*? 
-// for (auto idx = 0; idx < num_returns; idx++) { -// const c10::TypePtr& ret_type = schema.returns()[idx].type(); -// if (*ret_type == *c10::getTypePtr()) { -// AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); -// at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); -// torch::jit::push(stack, c10::IValue(out)); -// } else { -// TORCH_CHECK(false, "Only Tensor return types are currently supported!"); -// } -// } - -// free(ministack); -// } - -// TORCH_LIBRARY_IMPL(torchao, CUDA, m) { -// // m.impl("torchao::unpack_tensor_core_tiled_layout", -// // &_unpack_tensor_core_tiled_layout); -// m.impl("torchao::unpack_tensor_core_tiled_layout", -// torch::CppFunction::makeFromBoxedFunction< -// boxed_unpack_tensor_core_tiled_layout>()); -// // m.impl("torchao::dequantize_tensor_core_tiled_layout", -// // &_dequantize_tensor_core_tiled_layout); -// m.impl("torchao::dequantize_tensor_core_tiled_layout", -// torch::CppFunction::makeFromBoxedFunction< -// boxed_dequantize_tensor_core_tiled_layout>()); -// } class StableLibrary::TorchLibraryOpaque { public: + // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : library_(kind, ns, k, file, line) {} + : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} + + TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque(TorchLibraryOpaque&&) = default; + TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; + ~TorchLibraryOpaque() = default; + + void impl(const char* name, torch::CppFunction fn) { + library_.impl(name, std::move(fn)); + } private: torch::Library library_; // Actual Library object }; -StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : lib_(&TorchLibraryOpaque(Library::Kind::IMPL, ns, k, file, line)) {} - -StableLibrary& StableLibrary::impl(std::string name, void (*fn)(void **, int64_t, int64_t)) { - auto boxed_function = [fn](const c10::OperatorHandle &op, torch::jit::Stack *stack) { - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - - for (auto idx = 0; idx < num_arguments; idx++) { - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); + +class VoidStarConverter: public c10::OperatorKernel { + public: + VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} + + void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { + // function pt1 here should take in IValues, pass a malloc'd stack into the + // second function + // need a translation from IValues to ATH to void*s! 
+ + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); + + for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } } - } - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - fn(ministack, num_arguments, num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? - for (auto idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast( ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn_(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? 
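+    // Returns were written by the callee into ministack[num_arguments + idx];
+    // dereferencing the handle copies the underlying at::Tensor into an
+    // IValue, but nothing in this loop releases the handle itself.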
+ for (size_t idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } } + + free(ministack); } - free(ministack); - } + private: + void (*fn_)(void **, int64_t, int64_t); +}; + + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} + - this->lib_.impl(name, torch::CppFunction::makeFromBoxedFunction()); +StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { + this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); return *this; } diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index fb45f2b357..9679921590 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -5,11 +5,14 @@ #include // used for DispatchKey, enum verified to be header-only #include +#include +#include + class StableLibrary final { private: class TorchLibraryOpaque; using TorchLibraryHandle = TorchLibraryOpaque*; - TorchLibraryHandle lib_; + TorchLibraryHandle lib_; // pimpl unique_ptr public: // a pointer to a real Library // a kind @@ -37,7 +40,7 @@ class StableLibrary final { StableLibrary& operator=(StableLibrary&&) = default; ~StableLibrary() = default; - StableLibrary& impl(std::string name, void* fn); + StableLibrary& impl(const char* name, void (*fn)(void **, int64_t, int64_t)); }; @@ -88,22 +91,23 @@ class StableTorchLibraryInit final { +/** +#define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) -// #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID) - -// #define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ -// static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ -// static const torch::detail::TorchLibraryInit \ -// TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ -// torch::Library::IMPL, \ -// (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ -// ? &TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ -// : [](torch::Library&) -> void {}), \ -// torchao, \ -// std::make_optional(c10::DispatchKey::CUDA), \ -// __FILE__, \ -// __LINE__); \ -// TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { +#define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ + static void TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library&); \ + static const torch::detail::TorchLibraryInit \ + TORCH_LIBRARY_IMPL_static_init_torchao_CUDA_uid( \ + torch::Library::IMPL, \ + (c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CUDA) \ + ? 
&TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid \ + : [](torch::Library&) -> void {}), \ + torchao, \ + std::make_optional(c10::DispatchKey::CUDA), \ + __FILE__, \ + __LINE__); \ + TORCH_LIBRARY_IMPL_init_torchao_CUDA_uid(torch::Library & m) { -// } + } +*/ diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu index f4c252ab37..3b3d96d912 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu @@ -1,7 +1,12 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere -#include -#include +#include "libtorch.h" + + // need to confirm or make the following includes header-only +#include +#include +#include +#include template constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { @@ -455,6 +460,13 @@ void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack, // here, void* is my StableIValue // function is going to take a stack of void*, cast them to our // schema values for now, and run the function and modify the void* stack + + + // check that num_args > 2 + // strict aliasing rule --> you can reinterpret_cast between pointers except for this rule + // which doesn't even apply in C. + // reinterpret_cast is not okay to interpreting floats+ints (c++20 lets you bitcast these) so you'll have to use memcpy + // c10 has a shim for bitcast (it is just a memcpy)!! so we should just use it int64_t innerKTiles = reinterpret_cast(stack[1]); AtenTensorHandle packed_w_ath = reinterpret_cast(stack[0]); From a0500e573e4d22bf5c83745a5dd182f33758b805 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 4 Feb 2025 10:36:57 -0800 Subject: [PATCH 10/12] Move commented out notes to the bottom to not distract --- .../tensor_core_tiled_layout/libtorch.cpp | 175 +++++++++--------- .../cuda/tensor_core_tiled_layout/libtorch.h | 13 +- .../tensor_core_tiled_layout.cu | 2 +- 3 files changed, 88 insertions(+), 102 deletions(-) diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp index 8ea824bbad..fd1557f2a7 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp @@ -15,10 +15,93 @@ #include #include -#include -#include +class StableLibrary::TorchLibraryOpaque { +public: + // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind + TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} + + TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; + TorchLibraryOpaque(TorchLibraryOpaque&&) = default; + TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; + ~TorchLibraryOpaque() = default; + void impl(const char* name, torch::CppFunction fn) { + library_.impl(name, std::move(fn)); + } +private: + torch::Library library_; // Actual Library object +}; + + +class VoidStarConverter: public c10::OperatorKernel { + public: + VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} + + void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const 
auto num_arguments = schema.arguments().size(); + // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow + void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); + // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); + + for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! + const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); + if (arg.isInt()) { + ministack[idx] = reinterpret_cast(arg.toInt()); + } else if (arg.isTensor()) { + const at::Tensor& tensor = arg.toTensor(); + AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); + ministack[idx] = reinterpret_cast(ath); + } else { + TORCH_CHECK(false, "Other types of IValues not yet handled!"); + } + } + // second function is going to take a stack of void*, cast them to our + // schema values for now, and run the function and modify the void* stack + fn_(ministack, num_arguments, num_returns); + + // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope + // before calling the function + torch::jit::drop(stack, num_arguments); + + // read the output from the end of the stack and wrap that back into + // IValue from void*? + for (size_t idx = 0; idx < num_returns; idx++) { + const c10::TypePtr& ret_type = schema.returns()[idx].type(); + if (*ret_type == *c10::getTypePtr()) { + AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); + at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); + torch::jit::push(stack, c10::IValue(out)); + } else { + TORCH_CHECK(false, "Only Tensor return types are currently supported!"); + } + } + + free(ministack); + } + + private: + void (*fn_)(void **, int64_t, int64_t); +}; + + +StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) + : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} + + +StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { + this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); + return *this; +} + + + + +// notes from trying to understand stuff + iteration /** // step 1: from here, call the ATH func // step 2: make ATH func also boxed and call it @@ -187,91 +270,3 @@ TORCH_LIBRARY_IMPL(torchao, CUDA, m) { } */ - - -class StableLibrary::TorchLibraryOpaque { -public: - // TODO: support other Kinds lol, you'll need to translate between StableLibrary::Kind and Library::Kind - TorchLibraryOpaque(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : library_(torch::Library::Kind::IMPL, std::move(ns), k, file, line) {} - - TorchLibraryOpaque(const TorchLibraryOpaque&) = delete; - TorchLibraryOpaque& operator=(const TorchLibraryOpaque&) = delete; - TorchLibraryOpaque(TorchLibraryOpaque&&) = default; - TorchLibraryOpaque& operator=(TorchLibraryOpaque&&) = default; - ~TorchLibraryOpaque() = default; - - void impl(const char* name, torch::CppFunction fn) { - library_.impl(name, std::move(fn)); - } -private: - torch::Library library_; // Actual Library object -}; - - -class VoidStarConverter: public c10::OperatorKernel { - public: - VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {} - - void operator()(const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* 
stack) { - // function pt1 here should take in IValues, pass a malloc'd stack into the - // second function - // need a translation from IValues to ATH to void*s! - - const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); - // to make this faster, you can make this a C array on the stack --> though this may cause a stackoverflow - void **ministack = (void**)malloc((num_arguments + num_returns) * sizeof(void *)); - // std::unique_ptr ministack = std::make_unique(num_arguments + num_returns); - - for (size_t idx = 0; idx < num_arguments; idx++) { // rbarnes will prefer a c10::irange instead of this loop! - const c10::IValue& arg = torch::jit::peek(stack, idx, num_arguments); - if (arg.isInt()) { - ministack[idx] = reinterpret_cast(arg.toInt()); - } else if (arg.isTensor()) { - const at::Tensor& tensor = arg.toTensor(); - AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor); - ministack[idx] = reinterpret_cast(ath); - } else { - TORCH_CHECK(false, "Other types of IValues not yet handled!"); - } - } - - // second function is going to take a stack of void*, cast them to our - // schema values for now, and run the function and modify the void* stack - fn_(ministack, num_arguments, num_returns); - - // now pop all inputs on stack. if we pop earlier, Tensors would go out of scope - // before calling the function - torch::jit::drop(stack, num_arguments); - - // read the output from the end of the stack and wrap that back into - // IValue from void*? - for (size_t idx = 0; idx < num_returns; idx++) { - const c10::TypePtr& ret_type = schema.returns()[idx].type(); - if (*ret_type == *c10::getTypePtr()) { - AtenTensorHandle ret_ath = reinterpret_cast(ministack[num_arguments + idx]); - at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath); - torch::jit::push(stack, c10::IValue(out)); - } else { - TORCH_CHECK(false, "Only Tensor return types are currently supported!"); - } - } - - free(ministack); - } - - private: - void (*fn_)(void **, int64_t, int64_t); -}; - - -StableLibrary::StableLibrary(StableLibrary::Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) - : lib_(new TorchLibraryOpaque(StableLibrary::Kind::IMPL, std::move(ns), k, file, line)) {} - - -StableLibrary& StableLibrary::impl(const char* name, void (*fn)(void **, int64_t, int64_t)) { - this->lib_->impl(name, torch::CppFunction::makeFromBoxedFunctor(std::move(std::make_unique(fn)))); - return *this; -} diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h index 9679921590..febffe10bc 100644 --- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h +++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.h @@ -25,7 +25,7 @@ class StableLibrary final { // constructor /// \private /// - /// Use TORCH_LIBRARY() or TORCH_LIBRARY_IMPL() instead of using these + /// Use STABLE_TORCH_LIBRARY or STABLE_TORCH_LIBRARY_IMPL() instead of using these /// constructors directly StableLibrary( Kind kind, @@ -43,15 +43,6 @@ class StableLibrary final { StableLibrary& impl(const char* name, void (*fn)(void **, int64_t, int64_t)); }; - -// _def function ==> IGNORE LIBRARY + just call these - -// stable_impl function (that takes in a string and a void** function pointer) -// _impl doesn't really need a Library object, try to avoid it for now -// just copy its implementation -// it'll give u a handle that needs to be kept 
alive, just assign to global for now
-
-
 class StableTorchLibraryInit final {
  private:
   using InitFn = void(StableLibrary&);
@@ -90,7 +81,7 @@ class StableTorchLibraryInit final {
 
 
 
-
+// notes while figuring out templating
 /**
 #define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID)
 
diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
index 3b3d96d912..a0eca94751 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
@@ -2,7 +2,7 @@
 
 #include "libtorch.h"
 
- // need to confirm or make the following includes header-only
+// need to confirm or make the following includes header-only
 #include 
 #include 
 #include 

From 5e2c2d052cc9a866b68bb164754526dd4aafacd4 Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Tue, 4 Feb 2025 10:43:48 -0800
Subject: [PATCH 11/12] Remove dependency on change in core

---
 torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
index fd1557f2a7..1ac4966da6 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
@@ -52,7 +52,7 @@ class VoidStarConverter: public c10::OperatorKernel {
       if (arg.isInt()) {
         ministack[idx] = reinterpret_cast<void*>(arg.toInt());
       } else if (arg.isTensor()) {
-        const at::Tensor& tensor = arg.toTensor();
+        at::Tensor& tensor = const_cast<at::Tensor&>(arg.toTensor());
         AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor);
         ministack[idx] = reinterpret_cast<void*>(ath);
       } else {

From cc4d0223c1ee2ff287c9ece99b6517e85ac5a2d6 Mon Sep 17 00:00:00 2001
From: Jane Xu
Date: Tue, 18 Feb 2025 11:27:40 -0800
Subject: [PATCH 12/12] Fix memory leak by using RAIIATH

---
 .../tensor_core_tiled_layout/libtorch.cpp          | 10 +++---
 .../tensor_core_tiled_layout.cu                    | 35 +++++++++++-------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
index 1ac4966da6..352ff1be22 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/libtorch.cpp
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -35,6 +36,8 @@ class StableLibrary::TorchLibraryOpaque {
 };
 
 
+using RAIIATH = torch::aot_inductor::RAIIAtenTensorHandle;
+
 class VoidStarConverter: public c10::OperatorKernel {
  public:
   VoidStarConverter(void (*fn)(void **, int64_t, int64_t)) : fn_(fn) {}
 
@@ -52,8 +55,7 @@ class VoidStarConverter: public c10::OperatorKernel {
       if (arg.isInt()) {
         ministack[idx] = reinterpret_cast<void*>(arg.toInt());
       } else if (arg.isTensor()) {
-        at::Tensor& tensor = const_cast<at::Tensor&>(arg.toTensor());
-        AtenTensorHandle ath = torch::aot_inductor::tensor_pointer_to_tensor_handle(&tensor);
+        AtenTensorHandle ath = torch::aot_inductor::new_tensor_handle(std::move(const_cast<at::Tensor&>(arg.toTensor())));
         ministack[idx] = reinterpret_cast<void*>(ath);
       } else {
         TORCH_CHECK(false, "Other types of IValues not yet handled!");
@@ -73,8 +75,8 @@ class VoidStarConverter: public c10::OperatorKernel {
     for (size_t idx = 0; idx < num_returns; idx++) {
       const c10::TypePtr& ret_type = schema.returns()[idx].type();
       if (*ret_type == *c10::getTypePtr<at::Tensor>()) {
-        AtenTensorHandle ret_ath =
reinterpret_cast<AtenTensorHandle>(ministack[num_arguments + idx]);
-        at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_ath);
+        auto ret_raiiath = RAIIATH(reinterpret_cast<AtenTensorHandle>(ministack[num_arguments + idx]));
+        at::Tensor out = *torch::aot_inductor::tensor_handle_to_tensor_pointer(ret_raiiath.get());
         torch::jit::push(stack, c10::IValue(out));
       } else {
         TORCH_CHECK(false, "Only Tensor return types are currently supported!");
diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
index a0eca94751..b41b0669f7 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
@@ -1,5 +1,6 @@
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere
 
+#include 
 #include "libtorch.h"
 
 // need to confirm or make the following includes header-only
@@ -8,6 +9,8 @@
 #include 
 #include 
 
+using RAIIATH = torch::aot_inductor::RAIIAtenTensorHandle;
+
 template <typename U, typename V>
 constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
   static_assert(std::is_integral<U>::value && std::is_integral<V>::value, "");
@@ -167,10 +170,13 @@ __global__ void _dequantize_int4_kernel(
 // - at::cuda::getCurrentCUDAStream();
 // - at::GenericPackedTensorAccessor
 // - at::RestrictPtrTraits
-AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout(
-    const AtenTensorHandle packed_w, const AtenTensorHandle scales_and_zeros,
+RAIIATH _ATH_dequantize_tensor_core_tiled_layout(
+    const RAIIATH packed_w_raiiath, const RAIIATH scales_and_zeros_raiiath,
     int64_t group_size, int64_t innerKTiles) {
 
+  auto packed_w = packed_w_raiiath.get();
+  auto scales_and_zeros = scales_and_zeros_raiiath.get();
+
   constexpr int32_t kNTileSize = 8;
   constexpr int32_t kKTileSize = 16;
 
@@ -334,7 +340,7 @@ AtenTensorHandle _ATH_dequantize_tensor_core_tiled_layout(
 #undef DISPATCH_Q_GROUP
 #undef RUN_DEQUANT
 
-  return out;
+  return RAIIATH(out);
 }
 
 
@@ -346,23 +352,24 @@ void voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout(void **stack,
   // schema values for now, and run the function and modify the void* stack
   int64_t innerKTiles = reinterpret_cast<int64_t>(stack[3]);
   int64_t group_size = reinterpret_cast<int64_t>(stack[2]);
-  AtenTensorHandle scales_and_zeros_ath =
-      reinterpret_cast<AtenTensorHandle>(stack[1]);
-  AtenTensorHandle packed_w_ath = reinterpret_cast<AtenTensorHandle>(stack[0]);
+  RAIIATH scales_and_zeros_ath(reinterpret_cast<AtenTensorHandle>(stack[1]));
+  RAIIATH packed_w_ath(reinterpret_cast<AtenTensorHandle>(stack[0]));
 
-  AtenTensorHandle ath_res = _ATH_dequantize_tensor_core_tiled_layout(
-      packed_w_ath, scales_and_zeros_ath, group_size, innerKTiles);
+  RAIIATH raiiath_res = _ATH_dequantize_tensor_core_tiled_layout(
+      std::move(packed_w_ath), std::move(scales_and_zeros_ath), group_size, innerKTiles);
 
-  void *out = reinterpret_cast<void*>(ath_res);
+  void *out = reinterpret_cast<void*>(raiiath_res.release());
   stack[num_args] = out;
 }
 
 // output is [n][k] (int32 dtype)
 // input is [n / 8][k / (InnerKTiles * 16)][32][innerKTiles / 2]
-AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle packed_w,
+RAIIATH _ATH_unpack_tensor_core_tiled_layout(const RAIIATH packed_w_raiiath,
                                              int64_t innerKTiles) {
 
+  auto packed_w = packed_w_raiiath.get();
+
   int32_t packed_w_device_index;
   aoti_torch_get_device_index(packed_w, &packed_w_device_index);
 
@@ -451,7 +458,7 @@ AtenTensorHandle _ATH_unpack_tensor_core_tiled_layout(const AtenTensorHandle pac
       <<>>(packed_w_pta32, out_pta32);
   }
 
-  return out;
+  return RAIIATH(out);
 }
 
 void
voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack,
@@ -470,10 +477,10 @@ void voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout(void **stack,
   int64_t innerKTiles = reinterpret_cast<int64_t>(stack[1]);
   AtenTensorHandle packed_w_ath = reinterpret_cast<AtenTensorHandle>(stack[0]);
 
-  AtenTensorHandle ath_res = _ATH_unpack_tensor_core_tiled_layout(
-      packed_w_ath, innerKTiles);
+  RAIIATH raiiath_res = _ATH_unpack_tensor_core_tiled_layout(
+      RAIIATH(packed_w_ath), innerKTiles);
 
-  void *out = reinterpret_cast<void*>(ath_res);
+  void *out = reinterpret_cast<void*>(raiiath_res.release());
   stack[num_args] = out;
 }
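
For reference, the boxed kernels above follow a simple void** convention: stack[0..num_args-1] hold the inputs (ints passed by value through reinterpret_cast, tensors as AtenTensorHandle), and each output is written to stack[num_args + i]. The sketch below shows how these kernels could be wired up through StableLibrary::impl. It is illustrative only, not part of the patches: it assumes the STABLE_TORCH_LIBRARY_IMPL macro declared in libtorch.h expands to a StableTorchLibraryInit that passes a StableLibrary& named m, and that the corresponding torchao operator schemas are def'ed elsewhere.

// Hypothetical registration sketch (assumes STABLE_TORCH_LIBRARY_IMPL expands
// to a StableTorchLibraryInit providing `StableLibrary& m`, and that the
// operator schemas already exist in the torchao namespace).
STABLE_TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
  // Each registered function uses the void** calling convention described
  // above: inputs at stack[0..num_args-1], outputs appended at stack[num_args + i].
  m.impl("unpack_tensor_core_tiled_layout",
         &voidyvoid_boxed_ATH_unpack_tensor_core_tiled_layout);
  m.impl("dequantize_tensor_core_tiled_layout",
         &voidyvoid_boxed_ATH_dequantize_tensor_core_tiled_layout);
}

Because VoidStarConverter is registered via makeFromBoxedFunctor, the dispatcher hands it the IValue stack, it marshals the arguments into the void** ministack, and the shim-level kernel never sees at::Tensor directly, which is the point of routing everything through shim.h handles.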