Commit e4cdc31

cyyever authored and pytorchmergebot committed
[14/N] Fix clang-tidy warnings in aten/src/ATen (pytorch#133988)
Follows pytorch#133807
Pull Request resolved: pytorch#133988
Approved by: https://github.com/ezyang
1 parent 9731ccb commit e4cdc31

18 files changed: +50 -45 lines

aten/src/ATen/cuda/tunable/TunableOp.h

+1 -1

@@ -278,7 +278,7 @@ class TunableOp {
 };
 
 struct OpParams {
-  OpParams() {}
+  OpParams() = default;
   virtual ~OpParams() = default;
   virtual std::string Signature() const = 0;
 };
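This is the modernize-use-equals-default pattern: an empty user-written constructor body becomes `= default`, which lets the compiler generate the constructor and keeps a type trivially default-constructible when nothing else prevents it. A minimal standalone sketch of the effect (illustrative only, not code from this commit; assumes C++17):

#include <type_traits>

struct Before { Before() {} int x; };         // user-provided ctor: not trivial
struct After  { After() = default; int x; };  // defaulted ctor: stays trivial

static_assert(!std::is_trivially_default_constructible_v<Before>);
static_assert(std::is_trivially_default_constructible_v<After>);

For a polymorphic type like OpParams the gain is mostly consistency with the defaulted destructor next to it, but the check is the same.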

aten/src/ATen/native/cuda/BinaryInternal.h

+2 -6

@@ -15,9 +15,7 @@
 
 #include <type_traits>
 
-namespace at {
-namespace native {
-namespace binary_internal {
+namespace at::native::binary_internal {
 
 template <typename scalar_t>
 struct DivFunctor {
@@ -43,6 +41,4 @@ struct MulFunctor<bool> {
 };
 void div_true_kernel_cuda(TensorIteratorBase& iter);
 void div_trunc_kernel_cuda(TensorIteratorBase& iter);
-} // namespace binary_internal
-} // namespace native
-} // namespace at
+} // namespace at::native::binary_internal
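Several headers in this commit get the same treatment, matching the modernize-concat-nested-namespaces check: nested namespace blocks are collapsed into the C++17 nested-namespace-definition syntax, so there is one opening line and one closing brace instead of three. A minimal sketch (illustrative; the helper function is hypothetical):

// Before: three blocks, three closing braces.
// namespace at { namespace native { namespace binary_internal { ... } } }

// After: one block declaring the same fully qualified scope.
namespace at::native::binary_internal {
inline int add_one(int x) { return x + 1; }  // hypothetical helper, for illustration
} // namespace at::native::binary_internal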

aten/src/ATen/native/cuda/Blas.cpp

+12 -3

@@ -95,7 +95,7 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
 
 struct cublasCommonArgs {
   cublasCommonArgs(const Tensor& mat1, const Tensor& mat2, Tensor& c) {
-    bool transpose_result, transpose_mat1, transpose_mat2;
+    bool transpose_result = false, transpose_mat1 = false, transpose_mat2 = false;
     result = prepare_matrix_for_cublas(c, transpose_result);
     mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result);
     matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result);
@@ -263,6 +263,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
     "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype()
   )
 
+  // NOLINTNEXTLINE(*c-array*)
   TensorArg targs[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}};
   checkAllSameGPU(__func__, targs);
 
@@ -483,9 +484,11 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   });
   switch (activation) {
     case Activation::RELU:
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       at::relu_(const_cast<Tensor&>(*args.result));
       break;
     case Activation::GELU:
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
       break;
     default: break;
@@ -542,8 +545,8 @@ const Tensor& baddbmm_out_cuda_impl(const Tensor& result, const Tensor& self, co
   int64_t n = result_sizes[leading_dim];
   int64_t k = (transpose_result ? batch2 : batch1).sizes()[leading_dim];
 
-  int64_t lda, ldb, ldc;
-  bool transpose_batch1, transpose_batch2;
+  int64_t lda = 0, ldb = 0, ldc = 0;
+  bool transpose_batch1 = false, transpose_batch2 = false;
   auto batch1_ = prepare_batch_matrix_for_cublas(transpose_result ? batch2 : batch1, transpose_batch1, lda, transpose_result, m, k);
   auto batch2_ = prepare_batch_matrix_for_cublas(transpose_result ? batch1 : batch2, transpose_batch2, ldb, transpose_result, k, n);
 
@@ -593,14 +596,17 @@ const Tensor& baddbmm_out_cuda_impl(const Tensor& result, const Tensor& self, co
 } // anonymous namespace
 
 TORCH_IMPL_FUNC(addmm_out_cuda)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   addmm_out_cuda_impl(const_cast<Tensor&>(result), self, mat1, mat2, beta, alpha);
 }
 
 TORCH_IMPL_FUNC(addmm_activation_out_cuda)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu, const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   addmm_out_cuda_impl(const_cast<Tensor&>(result), self, mat1, mat2, beta, alpha, use_gelu ? Activation::GELU : Activation::RELU);
 }
 
 TORCH_IMPL_FUNC(mm_out_cuda)(const Tensor& self, const Tensor& mat2, const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   addmm_out_cuda_impl(const_cast<Tensor&>(result), result, self, mat2, 0, 1);
 }
 
@@ -765,13 +771,15 @@ TORCH_IMPL_FUNC(addmv_out_cuda)(const Tensor &self, const Tensor &mat, const Ten
       result.zero_();
     } else {
       at::mul_out(
+          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
          const_cast<Tensor&>(result),
          self,
          at::native::scalar_tensor(
              beta_, self.scalar_type(), std::nullopt /* layout */, at::kCPU, std::nullopt /* pin_memory */));
     }
   } else {
     if (!result.is_same(*self_) && betaval != 0.0) { //if beta is 0, result contents will be zeroed later
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       at::native::copy_(const_cast<Tensor&>(result), *self_);
     }
     if (result.numel() != 0) {
@@ -1040,6 +1048,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   auto bias_ = bias.value_or(Tensor());
   auto scale_result_ = scale_result.value_or(Tensor());
 
+  // NOLINTNEXTLINE(*c-array*)
   TensorArg targs[]{{out, "out", 0}, {mat1, "mat1", 1}, {mat2, "mat2", 2},
                     {bias_, "bias", 3}, {scale_a, "scale_a", 4}, {scale_b, "scale_b", 5},
                     {scale_result_, "scale_result", 6}};
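Two recurring fixes in this file: locals that are only assigned through reference out-parameters now get explicit initializers (the cppcoreguidelines-init-variables check), and the const_cast and C-array uses that the structured-kernel plumbing forces are silenced with NOLINTNEXTLINE on exactly the offending line rather than file-wide. A small self-contained sketch of both patterns, with hypothetical names:

#include <cstdint>

// Hypothetical out-parameter API, standing in for prepare_matrix_for_cublas.
static void query_layout(bool& transpose, int64_t& ld) {
  transpose = false;
  ld = 16;
}

int main() {
  // Initialize even though query_layout() always assigns: the checker
  // cannot prove that across the call, so it flags uninitialized locals.
  bool transpose = false;
  int64_t ld = 0;
  query_layout(transpose, ld);

  // NOLINTNEXTLINE(*c-array*)  -- suppress the C-array check for this line only.
  int scratch[4]{};
  return transpose ? static_cast<int>(ld) : scratch[0];
}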

aten/src/ATen/native/cuda/Copy.h

+3 -2

@@ -5,6 +5,7 @@ struct TensorIteratorBase;
 
 namespace native {
 
-void direct_copy_kernel_cuda(TensorIteratorBase &iter);
+void direct_copy_kernel_cuda(TensorIteratorBase& iter);
 
-}} // namespace at::native
+}
+} // namespace at

aten/src/ATen/native/cuda/Distributions.cpp

+4

@@ -18,13 +18,15 @@
 
 namespace at::native {
 
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 Tensor _s_poisson_cuda(const Tensor& lambda, std::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(lambda.sizes(), lambda.options());
   launch_poisson_cuda_kernel(ret, lambda, gen);
   return ret;
 }
 
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, std::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(count.sizes(), count.options());
@@ -37,13 +39,15 @@ Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, std::optional<G
   return ret;
 }
 
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 Tensor _s_gamma_cuda(const Tensor& alpha, std::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(alpha.sizes(), alpha.options());
   launch_gamma_kernel(ret, alpha, gen);
   return ret;
 }
 
+// NOLINTNEXTLINE(performance-unnecessary-value-param)
 Tensor _s_dirichlet_cuda(const Tensor& alpha, std::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(alpha.sizes(), alpha.options());
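These std::optional<Generator> parameters are taken by value because the operator signatures come from the dispatcher codegen, so the performance-unnecessary-value-param warning is suppressed instead of fixed. Where a signature is free to change, the usual fixes are a const reference or a std::move of the consumed value, as in this hypothetical sketch:

#include <cstddef>
#include <optional>
#include <string>
#include <utility>

// Flagged form: the by-value parameter is copied but only read.
//   void log_name(std::optional<std::string> name);

// Fix 1: take a const reference when the value is only inspected.
static std::size_t name_length(const std::optional<std::string>& name) {
  return name ? name->size() : 0;
}

// Fix 2: keep by-value but move when the argument is actually consumed.
static std::string take_name(std::optional<std::string> name) {
  return std::move(name).value_or("unnamed");
}

int main() {
  return (name_length(std::string("gamma")) == 5 && take_name(std::nullopt) == "unnamed") ? 0 : 1;
}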

aten/src/ATen/native/cuda/Indexing.cu

+1 -1

@@ -1353,7 +1353,7 @@ void index_select_out_cuda_impl(
     uint64_t dim,
     const Tensor& index) {
   uint64_t numIndices = index.numel();
-  uint64_t selfDims = self.dim() == 0 ? 1 : self.dim();
+  auto selfDims = self.dim() == 0 ? 1 : self.dim();
 
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
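Tensor::dim() returns int64_t, so storing it in a uint64_t is the kind of implicit sign conversion clang-tidy flags; auto simply keeps the deduced signed type. A tiny sketch of the difference, with a hypothetical stand-in for dim():

#include <cstdint>

static int64_t dim() { return 3; }  // stand-in: Tensor::dim() returns int64_t

int main() {
  // uint64_t d = dim();            // implicit int64_t -> uint64_t sign conversion
  auto d = dim() == 0 ? 1 : dim();  // deduces int64_t, no conversion
  return d == 3 ? 0 : 1;
}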

+3 -5

@@ -1,8 +1,7 @@
 #pragma once
-#include<algorithm>
+#include <algorithm>
 
-namespace at {
-namespace native {
+namespace at::native {
 
 // returns 2**floor(log2(n))
 static int lastPow2(unsigned int n) {
@@ -14,5 +13,4 @@ static int lastPow2(unsigned int n) {
   return std::max<int>(1, n - (n >> 1));
 }
 
-} // namespace native
-} // namespace at
+} // namespace at::native

aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp

+3 -3

@@ -160,12 +160,12 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel);
 // Protect from infinite recursion by initializing dispatch to self and checking
 // that values are different after linalg library were loaded
 
-namespace cuda {
-namespace detail {
+
+namespace cuda::detail {
 void registerLinalgDispatch(const LinalgDispatch& disp_) {
   disp = disp_;
 }
-}} //namespace cuda::detail
+} //namespace cuda::detail
 
 Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
   getTorchLinalgLibrary();

aten/src/ATen/native/cuda/ReduceOps.cpp

+2 -2

@@ -28,9 +28,9 @@ namespace at::native {
 namespace {
 
 void norm_kernel_cuda(TensorIterator& iter, const Scalar& val) {
-  double p;
+  double p = 0;
   if (val.isIntegral(false)) {
-    p = val.to<int64_t>();
+    p = static_cast<double>(val.to<int64_t>());
   } else if (val.isFloatingPoint()) {
     p = val.to<double>();
   } else {
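Same two checks as above: the local gets an initializer even though every branch assigns it, and the int64_t-to-double assignment becomes an explicit static_cast so the conversion is visible. A minimal sketch of the pattern:

#include <cstdint>

static double as_double(int64_t v, bool is_integral) {
  double p = 0;  // initialized up front; the branches below overwrite it
  if (is_integral) {
    p = static_cast<double>(v);  // explicit about the integer-to-double conversion
  } else {
    p = 2.0;
  }
  return p;
}

int main() { return as_double(3, true) == 3.0 ? 0 : 1; }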

aten/src/ATen/native/cuda/Resize.cpp

+3 -3

@@ -54,8 +54,8 @@ const Tensor& resize_cuda_(
     return resize_named_tensor_(self, size, optional_memory_format);
   }
   auto* self_ = self.unsafeGetTensorImpl();
-  int64_t old_storage_nbytes = self_->unsafe_storage() ? self_->unsafe_storage().nbytes() : 0;
-  resize_impl_cuda_(self_, size, /*strides=*/std::nullopt);
+  auto old_storage_nbytes = self_->unsafe_storage() ? self_->unsafe_storage().nbytes() : 0;
+  resize_impl_cuda_(self_, size, /*stride=*/std::nullopt);
   if (optional_memory_format.has_value()) {
     auto memory_format =
         optional_memory_format.value();
@@ -67,7 +67,7 @@ const Tensor& resize_cuda_(
   }
   // See Note [Enabling Deterministic Operations]
   if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) {
-    at::native::fill_resize_deterministic_(self, old_storage_nbytes);
+    at::native::fill_resize_deterministic_(self, static_cast<int64_t>(old_storage_nbytes));
   }
   return self;
 }
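Besides the auto/static_cast changes, note the /*strides=*/ to /*stride=*/ rename: this looks like the bugprone-argument-comment check, which requires an argument comment to match the parameter name exactly, so a stale comment is treated as a bug. A hypothetical sketch:

#include <optional>

// The parameter is named 'stride', so an argument comment must say 'stride' too.
static int resize_stub(int size, std::optional<int> stride) {
  return size + stride.value_or(0);
}

int main() {
  // resize_stub(8, /*strides=*/std::nullopt);  // flagged: comment does not match the parameter
  return resize_stub(8, /*stride=*/std::nullopt) == 8 ? 0 : 1;
}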

aten/src/ATen/native/cuda/RowwiseScaledMM.h

+1 -2

@@ -2,7 +2,6 @@
 #include <ATen/core/TensorBase.h>
 #include <optional>
 
-
 namespace at::cuda::detail {
 TORCH_API void f8f8bf16_rowwise(
     at::Tensor XQ, // FP8
@@ -12,4 +11,4 @@ TORCH_API void f8f8bf16_rowwise(
     std::optional<at::Tensor> bias, // BF16
     bool use_fast_accum,
     at::Tensor& out);
-} // at::cuda::detail
+} // namespace at::cuda::detail

aten/src/ATen/native/cuda/Sort.cpp

+1 -1

@@ -25,7 +25,7 @@ namespace at::native {
 
 std::vector<int64_t> infer_dense_strides_dim_last(const Tensor & self, int64_t dim);
 
-void fillSliceWithIndex(const Tensor& t, int dim) {
+void fillSliceWithIndex(const Tensor& t, int64_t dim) {
   if (t.numel()) {
     auto sizes = DimVector(t.dim(), 1);
     sizes[dim] = t.sizes()[dim];

aten/src/ATen/native/cuda/Sort.cu

+2 -2

@@ -19,7 +19,7 @@ namespace at::native {
 template <typename T>
 static int minimum_grid_for_occupancy(T kernel, int max_block_size) {
   int minGridSize = 0;
-  int blockSize;
+  int blockSize = 0;
   C10_CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(
       &minGridSize,
       &blockSize,
@@ -361,7 +361,7 @@ void sortCommon(Sorter sorter, const TensorBase &key, const TensorBase &value,
 void sortKeyValueInplace(
     const TensorBase& key,
     const TensorBase& value,
-    int dim,
+    int64_t dim,
     bool descending,
     bool stable) {
   const auto sort_size = key.size(dim);

aten/src/ATen/native/cuda/Sort.h

+4 -4

@@ -3,15 +3,15 @@
 #include <ATen/core/TensorBase.h>
 #include <ATen/native/cuda/SortStable.h>
 
-namespace at {
-namespace native {
+
+namespace at::native {
 
 inline bool should_use_small_sort(const TensorBase &self, int64_t dim) {
   return self.size(dim) <= 4096;
 }
 
 void sortKeyValueInplace(
-    const TensorBase &key, const TensorBase &value, int dim,
+    const TensorBase &key, const TensorBase &value, int64_t dim,
     bool descending, bool stable=false);
 
-}} // namespace at::native
+} // namespace at::native
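Sort.cpp, Sort.cu, and Sort.h all widen the dim parameter from int to int64_t, matching how dimension indices are typed elsewhere in ATen and removing an implicit narrowing at the call sites. A small sketch of the narrowing that the wider signature avoids (hypothetical names):

#include <cstdint>

// Before: int64_t fill_slice(const char* name, int dim);
// Callers holding an int64_t dimension index would narrow implicitly.
static int64_t fill_slice(const char* name, int64_t dim) {
  (void)name;
  return dim;  // no conversion anywhere in the call chain
}

int main() {
  int64_t dim = 1;  // dimension indices in ATen are int64_t
  return fill_slice("values", dim) == 1 ? 0 : 1;
}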

aten/src/ATen/native/cuda/SortStable.h

+2 -4

@@ -2,8 +2,7 @@
 #include <ATen/core/TensorBase.h>
 #include <cstdint>
 
-namespace at {
-namespace native {
+namespace at::native {
 
 // Stable-sort self into values, and set indices to the
 // inverse-permutation from values back to self.
@@ -15,5 +14,4 @@ void launch_stable_sort_kernel(
     const TensorBase& values,
     const TensorBase& indices);
 
-} // namespace native
-} // namespace at
+} // namespace at::native

aten/src/ATen/native/cuda/TensorModeKernel.cpp

+1 -1

@@ -6,7 +6,7 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/TensorCompare.h>
 
-constexpr int MAX_BLOCK_SIZE = AT_ROCM_ENABLED() ? 256 : 1024;
+constexpr int64_t MAX_BLOCK_SIZE = AT_ROCM_ENABLED() ? 256 : 1024;
 
 // Maximum size per grid dimension that we assume (compute capability >= 2.0)
 constexpr int64_t MAX_GRID_SIZE = 65535LL;

aten/src/ATen/native/cuda/TensorShapeCUDA.cpp

+1 -1

@@ -29,7 +29,7 @@ Tensor& set_cuda_(Tensor& result) {
 
 // unify with cuda implementation? This is not done to avoid a dispatch in resize_impl_cpu_
 Tensor& set_storage_cuda_(Tensor& result, Storage storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) {
-  checkSetStorage(result, storage, storage_offset, size, stride);
+  checkSetStorage(result, std::move(storage), storage_offset, size, stride);
 
   result.unsafeGetTensorImpl()->set_storage_offset(storage_offset);
   at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ?
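set_storage_cuda_ receives Storage by value, and the diff passes it on with std::move at what appears to be its last use, so the refcounted handle is moved instead of copied (the usual fix-it when a by-value parameter is consumed). A sketch with a hypothetical refcounted handle:

#include <memory>
#include <utility>

using Handle = std::shared_ptr<int>;  // stand-in for a refcounted Storage handle

static void check_handle(Handle h) {  // consumes its argument by value
  (void)h;
}

static void set_handle(Handle h) {
  // 'h' is not read again below, so hand it over with a move
  // instead of bumping and then dropping the reference count.
  check_handle(std::move(h));
}

int main() {
  set_handle(std::make_shared<int>(7));
  return 0;
}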

aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu

+4 -4

@@ -579,7 +579,7 @@ inline std::tuple<dim3, dim3, StackArray<int64_t>> check_shape_and_partition_(
   const dim3 blocks(
       div_round_up(outer_dense_size * jagged_folded_size, threads_y));
 
-  StackArray<int64_t> jagged_dims_tensor;
+  StackArray<int64_t> jagged_dims_tensor{};
   const int num_jagged_dim = dense_tensor.dim() - 2;
   TORCH_CHECK(num_jagged_dim <= static_cast<int>(kStackArrayMaxDims));
   jagged_dims_tensor.ndim = num_jagged_dim;
@@ -845,7 +845,7 @@ __launch_bounds__(kMaxThreads) void jagged_dense_dense_elementwise_jagged_output
   }
   if (!truncated) {
     const int oidx = offset_temp;
-    int iidx;
+    int iidx = 0;
     for (iidx = threadIdx.x; iidx * 2 + 1 < inner_dense_size;
          iidx += blockDim.x) {
       output_values[offset][2 * iidx] =
@@ -1201,7 +1201,7 @@ inline bool jagged_dense_dense_elementwise_jagged_output_matches_opt(
   matches &= (y_0_reshaped.size(0) < INT_MAX);
   matches &= (y_0_reshaped.size(1) < INT_MAX);
 
-  int max_shared_bytes;
+  int max_shared_bytes = 0;
 #ifndef USE_ROCM
   C10_CUDA_CHECK(cudaDeviceGetAttribute(
       &max_shared_bytes,
@@ -1226,7 +1226,7 @@ inline bool jagged_dense_dense_elementwise_jagged_output_matches_opt(
   auto B = y_0_reshaped.size(0);
   // the default shared memory on V100/A100/H100 is 48 KB from
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
-  if ((B + 1) * sizeof(index_t) >= used_shared_bytes) {
+  if ((B + 1) * sizeof(index_t) >= static_cast<size_t>(used_shared_bytes)) {
     matches = false;
   }
 });
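The last file repeats the same themes: an aggregate local gets value-initialization with {}, out-params and loop counters get = 0, and a signed/unsigned comparison is made explicit with static_cast<size_t>. A compact sketch, with a hypothetical aggregate standing in for StackArray<int64_t>:

#include <cstddef>
#include <cstdint>

struct StackDims {   // hypothetical aggregate, standing in for StackArray<int64_t>
  int64_t vals[5];
  size_t ndim;
};

int main() {
  StackDims dims{};  // value-initialization zeroes every member
  dims.ndim = 2;

  int used_shared_bytes = 48 * 1024;  // signed, as the CUDA attribute query reports it
  size_t needed = (dims.ndim + 1) * sizeof(int64_t);
  // Compare like with like: cast the signed operand explicitly.
  return needed < static_cast<size_t>(used_shared_bytes) ? 0 : 1;
}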
