
Commit aab67c6

zrphercule authored and facebook-github-bot committed
Add native masked_softmax (pytorch#69268)
Summary:
Pull Request resolved: pytorch#69268

This diff adds a native masked softmax on CUDA and extends the existing warp_softmax to accept a mask. The mask must have the same shape as the input and must be contiguous. A follow-up diff will add the encoder mask layout, where the input is BHDD and the mask is BD.

Test Plan: buck build mode/opt -c fbcode.enable_gpu_sections=true caffe2/test:nn && buck-out/gen/caffe2/test/nn\#binary.par -r test_masked_softmax

Reviewed By: ngimel

Differential Revision: D32338419

fbshipit-source-id: 48c3fde793ad4535725d9dae712db42e2bdb8a49
1 parent a5996a6 commit aab67c6
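
A minimal usage sketch (not part of the commit), exercising the new private op the same way the added test does. The names scores and keep below are made up; in this version of the op a True mask value means the element takes part in the softmax, while False positions are written out as 0:

    import torch

    # Hypothetical tensors; the mask must be boolean, contiguous, and the same shape as the input.
    scores = torch.randn(2, 4, 8, 8, device="cuda")
    keep = torch.randint(0, 2, (2, 4, 8, 8), device="cuda").bool()

    # Softmax over the last dimension; positions where keep is False come out as 0.
    out = torch._masked_softmax(scores, keep)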

File tree

5 files changed (+162 -23 lines)


aten/src/ATen/native/SoftMax.cpp

Lines changed: 58 additions & 6 deletions
@@ -125,8 +125,12 @@ TORCH_META_FUNC(_log_softmax_backward_data)
 namespace native {
 namespace {

-template <typename scalar_t, bool LogSoftMax>
-void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
+template <typename scalar_t, bool LogSoftMax, bool MaskedSoftMax = false>
+void host_softmax(
+    Tensor output,
+    const Tensor& input,
+    const int64_t dim,
+    bool* mask = nullptr) {
   int64_t outer_size = 1;
   int64_t dim_size = input.size(dim);
   int64_t inner_size = 1;
@@ -140,6 +144,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
   int64_t outer_stride = dim_size * dim_stride;
   scalar_t* input_data_base = input.data_ptr<scalar_t>();
   scalar_t* output_data_base = output.data_ptr<scalar_t>();
+  bool* mask_data_base = mask;
   int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
   parallel_for(
       0, outer_size * inner_size, grain_size,
@@ -151,14 +156,38 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
               input_data_base + outer_idx * outer_stride + inner_idx;
           scalar_t* output_data =
               output_data_base + outer_idx * outer_stride + inner_idx;
+          bool* mask_data = nullptr;
+          if (MaskedSoftMax) {
+            mask_data = mask_data_base + outer_idx * outer_stride + inner_idx;
+          }
+
+          // Calc max in softmax dim
+          bool is_meaningful_max = false;
           scalar_t max_input = input_data[0];
-          for (const auto d : c10::irange(1, dim_size)) {
-            max_input = std::max(max_input, input_data[d * dim_stride]);
+          if (!MaskedSoftMax) {
+            for (const auto d : c10::irange(1, dim_size)) {
+              max_input = std::max(max_input, input_data[d * dim_stride]);
+            }
+          } else {
+            for (const auto d : c10::irange(0, dim_size)) {
+              if (mask_data[d * dim_stride]) {
+                max_input = is_meaningful_max
+                    ? std::max(max_input, input_data[d * dim_stride])
+                    : input_data[d * dim_stride];
+                is_meaningful_max = true;
+              }
+            }
           }

+          // Calc sum in softmax dim
           acc_type<scalar_t, false> tmpsum = 0;
           for (const auto d : c10::irange(dim_size)) {
-            scalar_t z = std::exp(input_data[d * dim_stride] - max_input);
+            scalar_t z{};
+            if (!MaskedSoftMax || mask_data[d * dim_stride]) {
+              z = std::exp(input_data[d * dim_stride] - max_input);
+            } else {
+              z = 0;
+            }
             if (!LogSoftMax) {
               output_data[d * dim_stride] = z;
             }
@@ -171,7 +200,9 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
             tmpsum = 1 / tmpsum;
           }

+          // update output
           for (const auto d : c10::irange(dim_size)) {
+            // LogSoftMax and MaskedSoftMax should not both be true
             if (LogSoftMax) {
               output_data[d * dim_stride] =
                   input_data[d * dim_stride] - max_input - tmpsum;
@@ -294,7 +325,10 @@ TORCH_IMPL_FUNC(log_softmax_cpu_out)
   } else {
     AT_DISPATCH_FLOATING_TYPES_AND(
         at::ScalarType::BFloat16, input_.scalar_type(), "log_softmax", [&] {
-          host_softmax<scalar_t, true>(output, input_, dim_);
+          host_softmax<
+              scalar_t,
+              true /* LogSoftMax */,
+              false /* MaskedSoftMax */>(output, input_, dim_);
         });
   }
 }
@@ -431,5 +465,23 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional<ScalarType> dtype)
   return at::log_softmax(self, dimname_to_position(self, dim), dtype);
 }

+Tensor masked_softmax_cpu(const Tensor& input, const Tensor& mask) {
+  Tensor output = at::empty_like(input, input.options());
+  TORCH_CHECK(
+      input.sizes() == mask.sizes(), "Mask shape should match input shape");
+  TORCH_CHECK(mask.is_contiguous(), "Mask should always be contiguous");
+  TORCH_CHECK(
+      mask.scalar_type() == ScalarType::Bool,
+      "Mask should be a boolean tensor");
+  AT_DISPATCH_FLOATING_TYPES_AND(
+      at::ScalarType::BFloat16, input.scalar_type(), "log_softmax", [&] {
+        host_softmax<
+            scalar_t,
+            false /* LogSoftMax */,
+            true /* MaskedSoftMax */>(
+            output, input, input.dim() - 1, mask.data_ptr<bool>());
+      });
+  return output;
+}
 }
 }
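
As a reading aid, a rough Python equivalent of the masked branch of host_softmax above (a sketch that assumes every row keeps at least one element; masked_softmax_reference, scores and keep are made-up names, not code from this commit):

    import torch

    def masked_softmax_reference(scores: torch.Tensor, keep: torch.Tensor) -> torch.Tensor:
        # keep is a boolean tensor with the same shape as scores; softmax is taken over the last dim.
        neg_fill = torch.finfo(scores.dtype).min
        row_max = scores.masked_fill(~keep, neg_fill).amax(dim=-1, keepdim=True)
        z = torch.exp(scores - row_max) * keep  # masked-out positions contribute exactly 0
        return z / z.sum(dim=-1, keepdim=True)

This mirrors the slow_masked_softmax reference used in the test below, with the usual max subtraction added for numerical stability.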

aten/src/ATen/native/cuda/PersistentSoftmax.cuh

Lines changed: 47 additions & 13 deletions
@@ -55,15 +55,17 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) {
 // CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
 // ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
 // is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
+// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed.
 // The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.
 // This allows SoftMax to be fused with a cast immediately following the SoftMax.
+// The mask should have the same shape as input, with a boolean indicating if the value is masked.
 // For instance:
 // input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor.
 // input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor.
 // input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor.

-template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax>
-__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count)
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked = false>
+__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr)
 {
     // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
     constexpr int next_power_of_two = 1 << log2_elements;
@@ -84,7 +86,9 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc

     src += first_batch * stride + local_idx;
     dst += first_batch * stride + local_idx;
-
+    if (is_masked) {
+        mask += first_batch * stride + local_idx;
+    }
     // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
     // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
     // the nested loops.
@@ -108,10 +112,23 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
     acc_t max_value[WARP_BATCH];
     #pragma unroll
     for (int i = 0; i < WARP_BATCH; ++i) {
+        bool is_meaningful_max = false;
         max_value[i] = elements[i][0];
         #pragma unroll
-        for (int it = 1; it < WARP_ITERATIONS; ++it) {
-            max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+        for (int it = 0; it < WARP_ITERATIONS; ++it) {
+            if (is_masked) {
+                if (mask[i*element_count+it*WARP_SIZE]) {
+                    max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+                    is_meaningful_max = true;
+                }
+            } else {
+                max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it];
+            }
+        }
+        if (is_masked) {
+            if (!is_meaningful_max) {
+                max_value[i] = -std::numeric_limits<acc_t>::infinity();
+            }
         }
     }
     warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
@@ -121,11 +138,22 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
     for (int i = 0; i < WARP_BATCH; ++i) {
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            if (is_log_softmax) {
-                sum[i] += std::exp(elements[i][it] - max_value[i]);
+            if (!is_masked) {
+                if (is_log_softmax) {
+                    sum[i] += std::exp(elements[i][it] - max_value[i]);
+                } else {
+                    elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                    sum[i] += elements[i][it];
+                }
             } else {
-                elements[i][it] = std::exp(elements[i][it] - max_value[i]);
-                sum[i] += elements[i][it];
+                if (mask[i*element_count+it*WARP_SIZE]) {
+                    if (is_log_softmax) {
+                        sum[i] += std::exp(elements[i][it] - max_value[i]);
+                    } else {
+                        elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                        sum[i] += elements[i][it];
+                    }
+                }
             }
         }
     }
@@ -141,6 +169,12 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
             int element_index = local_idx + it * WARP_SIZE;
             if (element_index < element_count) {
+                if (is_masked) {
+                    if (!mask[i*element_count+it*WARP_SIZE]) {
+                        dst[i*element_count+it*WARP_SIZE] = 0;
+                        continue;
+                    }
+                }
                 if (is_log_softmax) {
                     dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i];
                 } else {
@@ -234,8 +268,8 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad,

 } // end of anonymous namespace

-template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
-void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count)
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked = false>
+void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr)
 {
     TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
     if (softmax_elements == 0) {
@@ -260,9 +294,9 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele
         // Launch code would be more elegant if C++ supported FOR CONSTEXPR
         switch (log2_elements) {
             #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E:                    \
-            softmax_warp_forward<input_t, output_t, acc_t, L2E, is_log_softmax>   \
+            softmax_warp_forward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked>   \
                 <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst,   \
-                    src, batch_count, softmax_elements_stride, softmax_elements); \
+                    src, batch_count, softmax_elements_stride, softmax_elements, mask); \
                 C10_CUDA_KERNEL_LAUNCH_CHECK();                                   \
                 break;

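
The heart of the masked changes above is the max reduction: the running maximum can no longer be seeded from elements[i][0], because that element may itself be masked. A plain-Python restatement of the is_meaningful_max logic (an illustrative sketch, not code from this commit):

    import math

    def masked_row_max(values, keep):
        # Only unmasked elements participate in the max; if the whole row is
        # masked, the kernel falls back to -inf (and the masked positions are
        # later written out as 0 regardless).
        row_max = -math.inf
        seen = False
        for v, k in zip(values, keep):
            if k:
                row_max = v if not seen else max(row_max, v)
                seen = True
        return row_max

    print(masked_row_max([1.0, 5.0, 3.0], [True, False, True]))  # 3.0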

aten/src/ATen/native/cuda/SoftMax.cu

Lines changed: 31 additions & 4 deletions
@@ -713,8 +713,8 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
       int64_t remaining = outer_size;
       int64_t chunk_size = (1L << 30L) / dim_size;
       while(remaining > 0) {
-        dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax>(
-            output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size));
+        dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>(
+            output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr/* not masked */);
         input_ptr += chunk_size * dim_size;
         output_ptr += chunk_size * dim_size;
         remaining -= chunk_size;
@@ -734,8 +734,8 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
       int64_t remaining = outer_size;
       int64_t chunk_size = (1<<30) / dim_size;
       while(remaining > 0) {
-        dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax>(
-            output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size));
+        dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>(
+            output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr/* not masked */);
         input_ptr += chunk_size * dim_size;
         output_ptr += chunk_size * dim_size;
         remaining -= chunk_size;
@@ -941,5 +941,32 @@ TORCH_IMPL_FUNC(softmax_backward_cuda_out)
   Tensor tmp = grad * output;
   host_softmax_backward<SoftMaxBackwardEpilogue,false>(tmp, output, dim, half_to_float, grad_input);
 }
+
+Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) {
+  TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor");
+  TORCH_CHECK(mask.is_contiguous(), "Mask should always be contiguous");
+  // Always do masked softmax on last dim
+  int softmax_elements = input.size(input.dim() - 1);
+  TORCH_CHECK(softmax_elements <= 1024, "TODO: Masked softmax only support softmax elements <= 1024");
+  Tensor output = at::empty_like(input, input.options());
+  int batch_count = input.numel() / softmax_elements;
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+    ScalarType::Half,
+    ScalarType::BFloat16,
+    input.scalar_type(),
+    "masked_softmax",
+    [&] {
+      using accscalar_t = acc_type<scalar_t, true>;
+      dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, false, true>(
+        output.data_ptr<scalar_t>(),    // dst
+        input.data_ptr<scalar_t>(),     // src
+        softmax_elements,
+        softmax_elements,
+        batch_count,
+        mask.data_ptr<bool>()
+      );
+    });
+  return output;
+}
 }
 }
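
A rough Python restatement of the preconditions masked_softmax_cuda enforces above (a reader-side sketch; check_masked_softmax_args is a made-up helper, not part of the commit):

    import torch

    def check_masked_softmax_args(x: torch.Tensor, mask: torch.Tensor) -> int:
        # Mirrors the TORCH_CHECKs: boolean, contiguous mask; softmax over the
        # last dim; and the warp kernel only covers rows of up to 1024 elements.
        assert mask.dtype == torch.bool, "Mask should be a boolean tensor"
        assert mask.is_contiguous(), "Mask should always be contiguous"
        softmax_elements = x.size(-1)
        assert softmax_elements <= 1024, "masked softmax only supports rows of <= 1024 elements"
        return x.numel() // softmax_elements  # batch_count handed to dispatch_softmax_forward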

aten/src/ATen/native/native_functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -5892,6 +5892,11 @@
 - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
   variants: function, method

+- func: _masked_softmax(Tensor self, Tensor mask) -> Tensor
+  dispatch:
+    CUDA: masked_softmax_cuda
+    CPU: masked_softmax_cpu
+
 - func: view(Tensor(a) self, int[] size) -> Tensor(a)
   variants: method
   device_check: NoCheck

test/test_nn.py

Lines changed: 21 additions & 0 deletions
@@ -15560,6 +15560,27 @@ def embedding_bag_check(indices, weights, mode, sparse, padding_idx):
                 rtol = None
             self.assertEqual(grad, grad_check, msg=msg, atol=atol, rtol=rtol)

+    def test_masked_softmax(self, device):
+        B = 10
+        num_heads = 8
+        L = 512
+        input = torch.randn((B, num_heads, L, L))
+        mask = torch.randint(0, 2, (B, L))
+        if (self.device_type == "cuda"):
+            input = input.cuda()
+            mask = mask.cuda()
+        mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
+        native_res = torch._masked_softmax(input, mask)
+        mask = mask.float()
+
+        def slow_masked_softmax(input, mask):
+            exp = torch.exp(input)
+            exp = exp * mask
+            s = exp.sum(dim=3, keepdim=True).expand(exp.size())
+            return exp / s
+        pt_res = slow_masked_softmax(input, mask)
+        self.assertEqual(pt_res, native_res, exact_dtype=True)
+
     # Test fails on Vg20
     @skipCUDAIfRocm
     @dtypesIfCUDA(torch.half, torch.float)
