Commit 4b311a9

kshitij12345 authored and pytorchmergebot committed
[complex32] conj
Reference: pytorch#74537

Required for `complex32` support in the `fft` module.

Tested with `test_dtypes` and `test_complex_half_reference_testing` in test_ops.py.

Pull Request resolved: pytorch#76132
Approved by: https://github.com/anjali411
1 parent 25d5b63 commit 4b311a9

File tree: 5 files changed (+56, -10 lines)
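
For context, a minimal sketch of the behavior this commit enables (assumes a build that includes this change; `at::conj_physical` and the lazy `conj` view are existing ATen APIs):

```cpp
#include <ATen/ATen.h>

int main() {
  // A complex32 (ComplexHalf) tensor on CPU. conj() only flips the lazy
  // conjugate bit; conj_physical() eagerly negates the imaginary parts,
  // which dispatches into the conj_kernel extended below.
  auto t = at::empty({4}, at::dtype(at::kComplexHalf));
  auto lazy = t.conj();               // view; no kernel runs
  auto eager = at::conj_physical(t);  // runs "conj_cpu" for kComplexHalf
  return 0;
}
```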

aten/src/ATen/cpu/vec/vec_base.h (+1, -1)

```diff
@@ -133,7 +133,7 @@ struct Vectorized {
   static constexpr size_type size() {
     return VECTOR_WIDTH / sizeof(T);
   }
-  Vectorized() : values{0} {}
+  Vectorized() : values{static_cast<T>(0)} {}
   Vectorized(T val) {
     for (int i = 0; i != size(); i++) {
       values[i] = val;
```
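
Why this one-liner: with `T = c10::complex<at::Half>` now flowing through `Vectorized<T>`, brace-initializing the `values` array with a bare `0` presumably no longer has a usable int-to-`T` conversion, while `static_cast<T>(0)` requests the conversion explicitly. A contrived, self-contained sketch of that failure mode, using a hypothetical type `H` as a stand-in for `c10::complex<at::Half>`:

```cpp
// H models a type constructible from a number only explicitly
// (hypothetical, simplified; not the real c10::complex<at::Half>).
struct H {
  H() : v(0.0f) {}
  explicit H(float f) : v(f) {}
  float v;
};

template <typename T>
struct Vec {
  T values[4];
  // Vec() : values{0} {}               // fails for T = H: copy-list-init
  //                                     // cannot call an explicit ctor
  Vec() : values{static_cast<T>(0)} {}  // explicit cast works for both
};

int main() {
  Vec<float> a;  // fine before and after the change
  Vec<H> b;      // only compiles with the static_cast form
  (void)a;
  (void)b;
  return 0;
}
```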

aten/src/ATen/native/cpu/UnaryOpsKernel.cpp (+2, -2)

```diff
@@ -211,8 +211,8 @@ static void imag_kernel(TensorIteratorBase& iter) {
 
 // NB: Ignores the negative bit on tensors
 void conj_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
-      kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cpu", [&]() {
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      kBool, kBFloat16, kHalf, kComplexHalf, iter.common_dtype(), "conj_cpu", [&]() {
     cpu_kernel_vec(
         iter,
         [=](scalar_t a) -> scalar_t { return conj_impl(a); },
```
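
The only change here is swapping `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3` for the `_AND4` variant with `kComplexHalf` in the list, so the CPU kernel body is now also instantiated with `scalar_t = c10::complex<at::Half>`. Conceptually, the dispatch macros (defined in ATen/Dispatch.h) switch on the runtime dtype and invoke the body with `scalar_t` bound to the matching C++ type; a simplified, hypothetical model of that mechanism, with `std::complex` standing in for `c10::complex`:

```cpp
#include <complex>
#include <stdexcept>

// Hypothetical two-type stand-in for at::ScalarType.
enum class ScalarType { Float, ComplexHalf };

template <typename F>
void dispatch_sketch(ScalarType t, F&& body) {
  switch (t) {
    case ScalarType::Float:
      body(float{});                // body sees scalar_t = float
      break;
    case ScalarType::ComplexHalf:   // the case the _AND4 variant adds
      body(std::complex<float>{});
      break;
    default:
      throw std::runtime_error("\"conj_cpu\" not implemented for this dtype");
  }
}

int main() {
  dispatch_sketch(ScalarType::ComplexHalf, [](auto v) {
    using scalar_t = decltype(v);   // like the lambda in conj_kernel
    scalar_t a{};                   // e.g. conj_impl(a) in the real kernel
    (void)a;
  });
  return 0;
}
```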

aten/src/ATen/native/cpu/zmath.h (+5)

```diff
@@ -123,6 +123,11 @@ inline TYPE conj_impl (TYPE z) {
   return z; //No-Op
 }
 
+template<>
+inline c10::complex<at::Half> conj_impl <c10::complex<at::Half>> (c10::complex<at::Half> z) {
+  return c10::complex<at::Half>{z.real(), -z.imag()};
+}
+
 template<>
 inline c10::complex<float> conj_impl <c10::complex<float>> (c10::complex<float> z) {
   return c10::complex<float>(z.real(), -z.imag());
```
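
The new block mirrors the existing `c10::complex<float>` (and `c10::complex<double>`) specializations: the primary `conj_impl` template is a no-op for real types, and each complex element type gets a specialization that negates the imaginary part. A self-contained sketch of the same pattern, with `std::complex` in place of `c10::complex`:

```cpp
#include <cassert>
#include <complex>

// Primary template: conjugation of a real number is a no-op.
template <typename TYPE>
inline TYPE conj_impl(TYPE z) {
  return z;
}

// Per-type specialization, as this commit adds for c10::complex<at::Half>.
template <>
inline std::complex<float> conj_impl<std::complex<float>>(std::complex<float> z) {
  return {z.real(), -z.imag()};
}

int main() {
  assert(conj_impl(3.0f) == 3.0f);
  assert(conj_impl(std::complex<float>(1.0f, 2.0f)) == std::complex<float>(1.0f, -2.0f));
  return 0;
}
```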

aten/src/ATen/native/cuda/UnaryComplexKernels.cu (+22, -2)

```diff
@@ -2,6 +2,7 @@
 #include <limits>
 #include <ATen/native/UnaryOps.h>
 #include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/JitLoops.cuh>
 #include <ATen/Dispatch.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/DispatchStub.h>
@@ -81,13 +82,32 @@ __host__ __device__ static inline c10::complex<T> conj_wrapper(c10::complex<T> v
 }
 
 // NB: Ignores the negative bit on tensors
+const char conj_name[] = "conj_kernel";
 void conj_kernel_cuda(TensorIteratorBase& iter) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+  auto common_dtype = iter.common_dtype();
+  if (common_dtype == kComplexHalf) {
+    using scalar_t = c10::complex<at::Half>;
+#if AT_USE_JITERATOR()
+    static const auto conj_string = jiterator_stringify(
+        template <typename T>
+        T conj_kernel(T z) {
+          return std::conj(z);
+        }
+    );
+    jitted_gpu_kernel<conj_name, scalar_t, scalar_t, 1>(iter, conj_string);
+#else
+    gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t {
+      return conj_wrapper(a);
+    });
+#endif
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
       kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cuda", [&]() {
         gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t {
           return conj_wrapper(a);
         });
-      });
+      });
+  }
 }
 
 REGISTER_DISPATCH(angle_stub, &angle_kernel_cuda);
```
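
The CUDA kernel takes one of two paths for the new dtype: when the jiterator is enabled, the kernel body is carried as a string (`jiterator_stringify`) and JIT-compiled on first use via `jitted_gpu_kernel`, avoiding another ahead-of-time `gpu_kernel` instantiation for `complex32`; otherwise it falls back to the same `conj_wrapper` lambda used for the other dtypes. A quick way to exercise this path end to end (a sketch; assumes a CUDA build containing this commit):

```cpp
#include <ATen/ATen.h>

int main() {
  // complex32 tensor on the GPU; conj_physical dispatches into
  // conj_kernel_cuda, which now takes the jiterator (or fallback)
  // branch for kComplexHalf.
  auto t = at::empty({8}, at::device(at::kCUDA).dtype(at::kComplexHalf));
  auto out = at::conj_physical(t);
  (void)out;
  return 0;
}
```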

torch/testing/_internal/common_methods_invocations.py (+26, -5)

```diff
@@ -9418,16 +9418,21 @@ def generate_std_var_kwargs(t: torch.Tensor, **kwargs):
            ),
     UnaryUfuncInfo('conj',
                    ref=np.conj,
-                   dtypes=all_types_and_complex_and(torch.bool,
-                                                    torch.bfloat16, torch.half),
+                   dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16,
+                                                    torch.half, torch.chalf),
                    supports_sparse=True,
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
-                   supports_out=False),
+                   supports_out=False,
+                   skips=(
+                       # numpy() raises TypeError: Got unsupported ScalarType ComplexHalf
+                       DecorateInfo(unittest.expectedFailure, "TestUnaryUfuncs", "test_reference_numerics_normal",
+                                    dtypes=(torch.complex32,)),
+                   )),
     UnaryUfuncInfo('conj_physical',
                    ref=np.conj,
-                   dtypes=all_types_and_complex_and(torch.bool,
-                                                    torch.bfloat16, torch.half),
+                   dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16,
+                                                    torch.half, torch.chalf),
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
                    supports_sparse=True,
@@ -9439,6 +9444,22 @@ def generate_std_var_kwargs(t: torch.Tensor, **kwargs):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32, )),
                        DecorateInfo(unittest.skip("Skipped! conj_physical_ not implemented for sparse"),
                                     'TestSparseUnaryUfuncs', 'test_inplace'),
+                       # numpy() raises TypeError: Got unsupported ScalarType ComplexHalf
+                       DecorateInfo(unittest.expectedFailure, "TestUnaryUfuncs", "test_reference_numerics_normal",
+                                    dtypes=(torch.complex32,)),
+                       # RuntimeError: "nonzero_count_cpu" not implemented for 'ComplexHalf'
+                       DecorateInfo(unittest.expectedFailure, "TestSparseCSR", "test_sparse_csr_consistency",
+                                    dtypes=(torch.complex32,)),
+                       # RuntimeError: "nonzero_count_cpu" not implemented for 'ComplexHalf'
+                       DecorateInfo(unittest.expectedFailure, "TestSparseCSR", "test_sparse_csr_unary_inplace",
+                                    dtypes=(torch.complex32,)),
+                       # RuntimeError: "nonzero_count_cpu" not implemented for 'ComplexHalf'
+                       DecorateInfo(unittest.expectedFailure, "TestSparseCSR", "test_sparse_csr_unary_out",
+                                    dtypes=(torch.complex32,)),
+                       # RuntimeError: "add_out_op2_sparse_csr" not implemented for 'ComplexHalf'
+                       DecorateInfo(unittest.expectedFailure, "TestSparseCSR",
+                                    "test_zero_to_zero_correspondence_unary",
+                                    dtypes=(torch.complex32,)),
                    )),
     OpInfo('resolve_conj',
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
```
