Skip to content

Commit 4738377

Browse files
fix bug in tl.store mask for kernel _to_fp8_row_major_t_and_non_t (#1516)
1 parent f86fda9 commit 4738377

File tree

2 files changed

+5
-3
lines changed

2 files changed

+5
-3
lines changed

torchao/prototype/float8nocompile/kernels/fp8_dynamic_tensorwise.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,8 @@ def _to_fp8_row_major_t_and_non_t(
         block_col_offs[:, None] * row_major_t_out_stride_row
         + block_row_offs[None, :] * row_major_t_out_stride_col
     )
-    mask = (block_row_offs[:, None] < row_major_t_num_rows) & (
-        block_col_offs[None, :] < row_major_t_num_cols
+    mask = (block_col_offs[:, None] < row_major_t_num_rows) & (
+        block_row_offs[None, :] < row_major_t_num_cols
     )
     tl.store(row_major_t_out_ptr + row_major_t_offs, fp8_vals.trans(1, 0), mask=mask)
382382

torchao/prototype/float8nocompile/test/train_test.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def model2():
     return TestModel()


-@pytest.mark.parametrize("input_shape", [(16, 32), (1, 16, 32), (2, 16, 32)])
+@pytest.mark.parametrize(
+    "input_shape", [(16, 32), (1, 16, 32), (2, 16, 32), (128, 8192, 32)]
+)
 def test_model_weights_and_gradients(model1, model2, input_shape: tuple[int, int]):
     assert torch.cuda.is_available()
     device = torch.device("cuda")

0 commit comments

Comments (0)