
Commit de11fe0

weiyangfb authored and facebook-github-bot committed
migrate PReLU to ATen (pytorch#11758)
Summary:
- fixes pytorch#10723
- migrate PReLU to ATen and deprecate legacy PReLU
- performance:

CPU with weight.numel() = 1
```
>>> m = nn.PReLU()
>>> x = torch.randn(100, 100, 100, requires_grad=True)
>>> %timeit -r 100 y = m(x)
100 loops, best of 100: 9.43 ms per loop

>>> y = m(x).sum()
>>> %timeit -r 100 y.backward(retain_graph=True)
10 loops, best of 100: 24.4 ms per loop

>>> m = nn.PReLU()
>>> x = torch.randn(100, 100, 100, requires_grad=True)
>>> %timeit -r 100 y = m(x)
1000 loops, best of 100: 695 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 y.backward(retain_graph=True)
100 loops, best of 100: 2.47 ms per loop
```

CPU with weight.numel() = channels
```
>>> m = nn.PReLU(100)
>>> x = torch.randn(100, 100, 100, requires_grad=True)
>>> %timeit -r 100 y = m(x)
1000 loops, best of 100: 603 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 y.backward(retain_graph=True)
100 loops, best of 100: 13.3 ms per loop

>>> m = nn.PReLU(100)
>>> x = torch.randn(100, 100, 100, requires_grad=True)
>>> %timeit -r 100 y = m(x)
1000 loops, best of 100: 655 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 y.backward(retain_graph=True)
100 loops, best of 100: 2.45 ms per loop
```

CUDA with weight.numel() = 1
```
>>> m = nn.PReLU().cuda()
>>> x = torch.randn(100, 100, 100, requires_grad=True).cuda()
>>> %timeit -r 100 torch.cuda.synchronize(); y = m(x); torch.cuda.synchronize();
10000 loops, best of 100: 187 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 torch.cuda.synchronize(); y.backward(retain_graph=True); torch.cuda.synchronize();
100 loops, best of 100: 2.01 ms per loop

>>> m = nn.PReLU().cuda()
>>> x = torch.randn(100, 100, 100, requires_grad=True).cuda()
>>> %timeit -r 100 torch.cuda.synchronize(); y = m(x); torch.cuda.synchronize();
1000 loops, best of 100: 195 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 torch.cuda.synchronize(); y.backward(retain_graph=True); torch.cuda.synchronize();
100 loops, best of 100: 2.28 ms per loop
```

CUDA with weight.numel() = channel
```
>>> m = nn.PReLU(100).cuda()
>>> x = torch.randn(100, 100, 100, requires_grad=True).cuda()
>>> %timeit -r 100 torch.cuda.synchronize(); y = m(x); torch.cuda.synchronize();
1000 loops, best of 100: 174 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 torch.cuda.synchronize(); y.backward(retain_graph=True); torch.cuda.synchronize();
100 loops, best of 100: 2.27 ms per loop

>>> m = nn.PReLU(100).cuda()
>>> x = torch.randn(100, 100, 100, requires_grad=True).cuda()
>>> %timeit -r 100 torch.cuda.synchronize(); y = m(x); torch.cuda.synchronize();
10000 loops, best of 100: 181 µs per loop

>>> y = m(x).sum()
>>> %timeit -r 100 torch.cuda.synchronize(); y.backward(retain_graph=True); torch.cuda.synchronize();
100 loops, best of 100: 2.26 ms per loop
```

The huge CPU performance regression when weight.numel() = 1 is addressed by replacing at::CPU_tensor_apply* with parallelized kernels.

ezyang SsnL zou3519 soumith

Pull Request resolved: pytorch#11758

Differential Revision: D9995799

Pulled By: weiyangfb

fbshipit-source-id: d289937c78075f46a54dafbde92fab0cc4b5b86e
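For reference, the behavior benchmarked above is the elementwise PReLU definition that the migrated kernels implement: out = x where x > 0, and out = weight * x otherwise, with the per-channel weight broadcast over dim 1. A minimal sketch (not part of this commit; the tensor sizes mirror the benchmark setup) that checks nn.PReLU against that definition:

```python
# Sketch: nn.PReLU vs. the elementwise definition the migrated kernels implement.
import torch
import torch.nn as nn

x = torch.randn(100, 100, 100)

# shared weight: weight.numel() == 1
m = nn.PReLU()
assert torch.allclose(m(x), torch.where(x > 0, x, m.weight * x))

# per-channel weights: weight.numel() == channels (dim 1 of the input)
mc = nn.PReLU(100)
ref = torch.where(x > 0, x, mc.weight.view(1, 100, 1) * x)
assert torch.allclose(mc(x), ref)
```

These are the same two cases (one shared weight vs. one weight per channel) that the new CPU kernels below split into.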
1 parent 89d56ae commit de11fe0

File tree

20 files changed: +550, −556 lines


aten/src/ATen/core/Tensor.h (+2)

```diff
@@ -551,6 +551,8 @@ struct AT_API Tensor {
   Tensor & round_();
   Tensor relu() const;
   Tensor & relu_();
+  Tensor prelu(const Tensor & weight) const;
+  std::tuple<Tensor,Tensor> prelu_backward(const Tensor & grad_output, const Tensor & weight) const;
   Tensor hardshrink(Scalar lambd=0.5) const;
   Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const;
   Tensor rsqrt() const;
```

aten/src/ATen/core/TensorMethods.h (+6)

```diff
@@ -935,6 +935,12 @@ inline Tensor Tensor::relu() const {
 inline Tensor & Tensor::relu_() {
   return type().relu_(*this);
 }
+inline Tensor Tensor::prelu(const Tensor & weight) const {
+  return type().prelu(*this, weight);
+}
+inline std::tuple<Tensor,Tensor> Tensor::prelu_backward(const Tensor & grad_output, const Tensor & weight) const {
+  return type().prelu_backward(grad_output, *this, weight);
+}
 inline Tensor Tensor::hardshrink(Scalar lambd) const {
   return type().hardshrink(*this, lambd);
 }
```

aten/src/ATen/core/Type.h (+2)

```diff
@@ -505,6 +505,8 @@ struct AT_API Type {
   virtual Tensor & round_(Tensor & self) const = 0;
   virtual Tensor relu(const Tensor & self) const = 0;
   virtual Tensor & relu_(Tensor & self) const = 0;
+  virtual Tensor prelu(const Tensor & self, const Tensor & weight) const = 0;
+  virtual std::tuple<Tensor,Tensor> prelu_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight) const = 0;
   virtual Tensor hardshrink(const Tensor & self, Scalar lambd) const = 0;
   virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0;
   virtual Tensor rsqrt(const Tensor & self) const = 0;
```
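The three headers above expose the new native op on Tensor and Type. From Python it is reachable directly as torch.prelu, and nn.functional.prelu / nn.PReLU dispatch to the same entry point. A small usage sketch (assuming a build that includes this change):

```python
# Sketch: calling the native op directly and via the functional wrapper.
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 4)
w = torch.full((3,), 0.25)   # one weight per channel; channel dim is dim 1

y = torch.prelu(x, w)        # the native ATen op
assert torch.allclose(y, F.prelu(x, w))
```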

aten/src/ATen/native/Activation.cpp (+245)

```diff
@@ -4,6 +4,7 @@
 #include "ATen/NativeFunctions.h"
 #include "ATen/core/Half.h"
 
+
 namespace at { namespace native {
 
 static const double SELU_ALPHA = 1.6732632423543772848170429916717;
@@ -43,6 +44,250 @@ Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Genera
   return at::rrelu_with_noise_(self, self.type().tensor(), lower, upper, training, generator);
 }
 
+// -----------------------------------
+// prelu forward
+// -----------------------------------
+template <typename scalar_t>
+void inline prelu_cpu_kernel_share_weights(
+  Tensor& result,
+  const Tensor& input,
+  const Tensor& weight) {
+
+  int64_t i;
+  int64_t input_numel = input.numel();
+  auto result_data = result.data<scalar_t>();
+  auto input_data = input.data<scalar_t>();
+  auto weight_val = weight.data<scalar_t>()[0];
+
+  #pragma omp parallel for private(i) if (input_numel > 1000)
+  for (i = 0; i < input_numel; i++) {
+    scalar_t input_data_val = input_data[i];
+    // to allow for compiler optimization, here splitting into two lines:
+    scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val;
+    result_data[i] = r * input_data_val;
+  }
+}
+
+template <typename scalar_t>
+void inline prelu_cpu_kernel_multi_weights(
+  Tensor& result,
+  const Tensor& input,
+  const Tensor& weight,
+  int64_t input_dim0_size,
+  int64_t channel_size,
+  int64_t input_stride0,
+  int64_t input_stride1) {
+
+  int64_t i, j, k;
+  int64_t input_numel = input.numel();
+  scalar_t* result_data = result.data<scalar_t>();
+  scalar_t* input_data = input.data<scalar_t>();
+  scalar_t* weight_data = weight.data<scalar_t>();
+
+  #pragma omp parallel for private(i,j,k) if (input.numel() > 1000)
+  for (i = 0; i < input_dim0_size; ++i) {
+    int64_t offset = i * channel_size * input_stride1;
+    scalar_t* n_input_data = input_data + offset;
+    scalar_t* n_result_data = result_data + offset;
+    for (j = 0; j < channel_size; ++j) {
+      for (k = 0; k < input_stride1; ++k) {
+        // to allow for compiler optimization, here splitting into two lines:
+        scalar_t w = (n_input_data[k] > 0) ? scalar_t(1) : weight_data[j];
+        n_result_data[k] = w * n_input_data[k];
+      }
+      n_input_data += input_stride1;
+      n_result_data += input_stride1;
+    }
+  }
+}
+
+Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
+  auto input = self.contiguous();
+  auto weight = weight_.contiguous();
+
+  AT_CHECK(input.is_contiguous());
+  AT_CHECK(weight.is_contiguous());
+
+  int64_t weight_num = weight.numel();
+  Tensor result = at::empty_like(input);
+  auto strides = input.strides();
+
+  // case1: shared weight for all channels
+  if (weight_num == 1) {
+    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
+      prelu_cpu_kernel_share_weights<scalar_t>(result, input, weight);
+    });
+  }
+  else { // case2: multiple weights, one for each channel
+    int64_t input_ndim = input.dim();
+    AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor.");
+
+    int64_t channel_size = 1; // channel_size default to 1
+    int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1;
+
+    if (input_ndim > 1) {
+      channel_size = input.size(1); // channel is the 2nd dim of input
+      input_dim0_size = input.size(0);
+      input_stride0 = strides[0];
+      input_stride1 = strides[1];
+    }
+    AT_CHECK(channel_size == weight_num,
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
+      weight_num, channel_size);
+
+    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
+      prelu_cpu_kernel_multi_weights<scalar_t>(
+        result,
+        input,
+        weight,
+        input_dim0_size,
+        channel_size,
+        input_stride0,
+        input_stride1);
+    });
+  }
+  return result;
+}
+
+// -----------------------------------
+// prelu backward
+// -----------------------------------
+template <typename scalar_t>
+void inline prelu_cpu_backward_kernel_share_weights(
+  const Tensor& input,
+  const Tensor& weight,
+  const Tensor& grad_out,
+  Tensor& input_grad,
+  Tensor& weight_grad) {
+
+  int64_t i;
+  int64_t input_numel = input.numel();
+  scalar_t sum = 0;
+  auto input_data = input.data<scalar_t>();
+  auto weight_val = weight.data<scalar_t>()[0];
+  auto grad_out_data = grad_out.data<scalar_t>();
+  auto input_grad_data = input_grad.data<scalar_t>();
+  auto weight_grad_data = weight_grad.data<scalar_t>();
+
+  #pragma omp parallel for private(i) reduction(+:sum) if (input_numel > 1000)
+  for (i = 0; i < input_numel; i++) {
+    scalar_t input_data_val = input_data[i];
+    scalar_t grad_out_data_val = grad_out_data[i];
+    // to allow for compiler optimization, here splitting into two lines:
+    scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_val;
+    input_grad_data[i] = w * grad_out_data_val;
+    // to allow for compiler optimization, here splitting into two lines:
+    scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1);
+    sum += mask * input_data_val * grad_out_data_val;
+  }
+  weight_grad_data[0] = sum;
+}
+
+template <typename scalar_t>
+void inline prelu_cpu_backward_kernel_multi_weights(
+  const Tensor& input,
+  const Tensor& weight,
+  const Tensor& grad_out,
+  Tensor& input_grad,
+  Tensor& weight_grad_collector,
+  int64_t input_dim0_size,
+  int64_t channel_size,
+  int64_t input_stride0,
+  int64_t input_stride1) {
+
+  int64_t i, j, k;
+  int64_t input_numel = input.numel();
+  auto input_data = input.data<scalar_t>();
+  auto weight_data = weight.data<scalar_t>();
+  auto grad_out_data = grad_out.data<scalar_t>();
+  auto input_grad_data = input_grad.data<scalar_t>();
+  auto weight_grad_collector_data = weight_grad_collector.data<scalar_t>();
+
+  #pragma omp parallel for private(i, j, k) if (input.numel() > 1000)
+  for (i = 0; i < input_dim0_size; i++) {
+    for (j = 0; j < channel_size; j++) {
+      for (k = 0; k < input_stride1; k++) {
+        int64_t pos = i * input_stride0 + j * input_stride1 + k;
+        scalar_t weight_data_val = weight_data[j];
+        scalar_t input_data_val = input_data[pos];
+        scalar_t grad_out_data_val = grad_out_data[pos];
+        // to allow for compiler optimization, here splitting into two lines:
+        scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_data_val;
+        input_grad_data[pos] = w * grad_out_data_val;
+        // to allow for compiler optimization, here splitting into two lines:
+        scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1);
+        weight_grad_collector_data[pos] = mask * input_data_val * grad_out_data_val;
+      }
+    }
+  }
+}
+
+std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) {
+  auto input = self.contiguous();
+  auto grad_out = grad_out_.contiguous();
+  auto weight = weight_.contiguous();
+
+  AT_CHECK(input.is_contiguous());
+  AT_CHECK(grad_out.is_contiguous());
+  AT_CHECK(weight.is_contiguous());
+
+  int64_t weight_num = weight.numel();
+  auto strides = input.strides();
+  auto dims = input.dim();
+
+  Tensor input_grad = at::empty_like(input);
+  Tensor weight_grad = at::empty_like(weight);
+  Tensor weight_grad_collector = at::empty_like(input);
+
+  // case1: shared parameter for all channels
+  if (weight_num == 1) {
+    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
+      prelu_cpu_backward_kernel_share_weights<scalar_t>(input, weight, grad_out, input_grad, weight_grad);
+    });
+  }
+  else { // case2: multiple parameters, one for each channel
+    int64_t input_ndim = input.dim();
+    AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor.");
+
+    int64_t channel_size = 1; // channel_size default to 1
+    int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1;
+
+    if (input_ndim > 1) {
+      channel_size = input.size(1); // channel is the 2nd dim of input
+      input_dim0_size = input.size(0);
+      input_stride0 = strides[0];
+      input_stride1 = strides[1];
+    }
+    AT_CHECK(channel_size == weight_num,
+      "Mismatch of parameter numbers and input channel size. Found parameter numbers = %d, and channel size = %d.",
+      weight_num, channel_size);
+
+    AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
+      prelu_cpu_backward_kernel_multi_weights<scalar_t>(
+        input,
+        weight,
+        grad_out,
+        input_grad,
+        weight_grad_collector,
+        input_dim0_size,
+        channel_size,
+        input_stride0,
+        input_stride1);
+    });
+    // update weight_grad
+    std::vector<int64_t> reduce_dims;
+    reduce_dims.push_back(0);
+    if (dims > 2) {
+      for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i);
+    }
+    weight_grad = weight_grad_collector.sum(reduce_dims);
+  }
+  return std::tuple<Tensor, Tensor>{input_grad, weight_grad};
+}
+
+// -----------------------------------
+// hardshrink
+// -----------------------------------
 Tensor hardshrink_cpu(const Tensor & self, Scalar lambd) {
   auto out_tensor = at::empty_like(self);
   AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_cpu", [&] {
```
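For reference, prelu_backward_cpu above produces: input_grad equal to grad_out where the input is positive and weight * grad_out elsewhere, and weight_grad equal to grad_out * input accumulated over non-positive inputs and reduced over every dimension except the channel dimension (the weight_grad_collector.sum(reduce_dims) step). A hedged sketch checking these formulas against autograd (double precision and a 3-d input with channel dim 1 are assumptions of the check, not requirements of the op):

```python
# Sketch: the PReLU gradients the new CPU kernels compute, checked via autograd.
import torch

x = torch.randn(4, 3, 5, dtype=torch.double, requires_grad=True)
w = torch.rand(3, dtype=torch.double, requires_grad=True)

y = torch.prelu(x, w)
grad_out = torch.randn_like(y)
y.backward(grad_out)

with torch.no_grad():
    wb = w.view(1, 3, 1)                                  # broadcast over channel dim 1
    ref_dx = torch.where(x > 0, grad_out, wb * grad_out)  # factor is 1 if x > 0 else weight
    ref_dw = (grad_out * x * (x <= 0).to(x.dtype)).sum(dim=(0, 2))

assert torch.allclose(x.grad, ref_dx)
assert torch.allclose(w.grad, ref_dw)
```

The sum over dims (0, 2) mirrors reduce_dims in the diff, which always contains dim 0 plus every dim greater than 1.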
