
Commit bf1d411

Marat Dukhan authored and facebook-github-bot committed
Switch Int8Softmax, Int8Relu, and Int8LeakyRelu to QNNPACK (pytorch#14933)
Summary: Int8Softmax: 4x-5x speedup compared to previous implementation

Pull Request resolved: pytorch#14933
Differential Revision: D13406820
Pulled By: Maratyszcza
fbshipit-source-id: ea8cbe1b861ddb7ff1b851d06d52c6fd6d04ed01
1 parent a1ea7db commit bf1d411

File tree: 4 files changed (+201, -193 lines)
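Both files shown below follow the same QNNPACK operator lifecycle: create the operator object once and cache it, re-run setup on every invocation to bind the current batch size and buffers, execute it (optionally on a thread pool), and delete it when the owning operator is destroyed. The following is a minimal sketch of that pattern, assembled from the diffs below; the initQNNPACK() helper comes from the Caffe2 sources, the function and variable names here are illustrative, and error handling is reduced to comments.

#include <qnnpack.h>

#include <cstddef>
#include <cstdint>

void initQNNPACK(); // Caffe2 helper used in the diffs below (one-time library init)

// Sketch: create-once / setup-per-run / run lifecycle for the clamp operator.
// A function-local static stands in for the member-variable caching the real
// operators use; qnnp_delete_operator belongs in the owning destructor.
void clamp_lifecycle_sketch(
    const uint8_t* x, uint8_t* y, size_t batch, uint8_t zero_point) {
  static qnnp_operator_t op = nullptr;
  initQNNPACK();
  if (op == nullptr) {
    // Quantized ReLU is expressed as a clamp to [zero_point, 255].
    qnnp_create_clamp_nc_u8(1 /* channels */, zero_point, 255, &op);
  }
  // Rebinding shapes and buffers is cheap compared to operator creation.
  qnnp_setup_clamp_nc_u8(op, batch, x, 1 /* X stride */, y, 1 /* Y stride */);
  qnnp_run_operator(op, nullptr /* thread pool */);
}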

caffe2/operators/quantized/int8_leaky_relu_op.h

Lines changed: 75 additions & 29 deletions
@@ -1,6 +1,8 @@
 #ifndef CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_
 #define CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
@@ -13,48 +15,92 @@ namespace int8 {
 class Int8LeakyReluOp final : public Operator<CPUContext> {
  public:
   Int8LeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {
-    double alpha = this->template GetSingleArgument<float>("alpha", 0.01);
+      : Operator<CPUContext>(operator_def, ws),
+        ws_(ws) {
+    const float alpha = this->template GetSingleArgument<float>("alpha", 0.01);
     CAFFE_ENFORCE_GT(alpha, 0.0);
     CAFFE_ENFORCE_LT(alpha, 1.0);
-    QuantizeMultiplierSmallerThanOne(alpha, &multiplier_, &shift_);
+    this->alpha_ = alpha;
+  }
+
+  ~Int8LeakyReluOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
   }
 
   bool RunOnDevice() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
     auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
+    const int32_t Y_zero_point =
+        this->template GetSingleArgument<int>("Y_zero_point", 0);
+    const float Y_scale =
+        this->template GetSingleArgument<float>("Y_scale", 1);
+    CHECK_GE(Y_zero_point, std::numeric_limits<uint8_t>::min());
+    CHECK_LE(Y_zero_point, std::numeric_limits<uint8_t>::max());
+
+    /*
+     * Record quantization parameters for the input, because if the op is
+     * in-place, we may overwrite these parameters later, when we set
+     * quantization parameters for output tensor.
+     */
+    const uint8_t X_zero_point = X.zero_point;
+    const float X_scale = X.scale;
+
+    Y->scale = Y_scale;
+    Y->zero_point = Y_zero_point;
     Y->t.ResizeLike(X.t);
-    Y->scale = X.scale;
-    Y->zero_point = X.zero_point;
-    CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
-    CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
-    int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
-    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
-    CHECK_EQ(Y_offset, X.zero_point);
-    CHECK_EQ(Y_scale, X.scale);
-
-    const uint8_t* Xdata = X.t.data<uint8_t>();
-    uint8_t* Ydata = Y->t.mutable_data<uint8_t>();
-
-    // For x < zero_point:
-    // (y - zero_point) * scale = alpha * (x - zero_point) * scale
-    // y = alpha * (x - zeropoint) + zero_point
-    for (int i = 0; i < X.t.numel(); i++) {
-      if (Xdata[i] < X.zero_point) {
-        int32_t out = MultiplyByQuantizedMultiplierSmallerThanOne(
-                          Xdata[i] - X.zero_point, multiplier_, shift_) +
-            X.zero_point;
-        Ydata[i] = static_cast<uint8_t>(out);
-      } else {
-        Ydata[i] = Xdata[i];
-      }
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_leaky_relu_nc_q8(
+          1 /* channels */,
+          this->alpha_,
+          static_cast<uint8_t>(X_zero_point), X_scale,
+          static_cast<uint8_t>(Y_zero_point), Y_scale,
+          0 /* output min */,
+          255 /* output max */,
+          &qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK Leaky ReLU operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
     }
+
+    const qnnp_status setupStatus = qnnp_setup_leaky_relu_nc_q8(
+        this->qnnpackOperator_,
+        X.t.numel() /* batch size */,
+        X.t.template data<uint8_t>(),
+        1 /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        1 /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK Leaky ReLU operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK Leaky ReLU operator");
+
     return true;
   }
 
  private:
-  int32_t multiplier_;
-  int shift_;
+  float alpha_;
+  Workspace* ws_;
+  // QNNPACK Leaky ReLU operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
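For reference, the scalar loop removed above implemented the quantized leaky-ReLU identity stated in its comment: when input and output share scale and zero point (which the old CHECK_EQs guaranteed), (y - zero_point) * scale = alpha * (x - zero_point) * scale, so the scale cancels and y = alpha * (x - zero_point) + zero_point. Below is a floating-point sketch of that per-element math; the removed code used fixed-point arithmetic via MultiplyByQuantizedMultiplierSmallerThanOne, and this helper is illustrative, not part of the commit.

#include <cmath>
#include <cstdint>

// Per-element reference for quantized leaky ReLU with shared quantization
// parameters: identity at or above the zero point, alpha-scaled below it.
uint8_t leaky_relu_q8_reference(uint8_t x, uint8_t zero_point, float alpha) {
  if (x >= zero_point) {
    return x; // encodes a non-negative real value; passes through unchanged
  }
  // y = alpha * (x - zero_point) + zero_point; stays within [0, zero_point]
  // for 0 < alpha < 1, so no saturation is needed.
  const float y = alpha * (static_cast<float>(x) - zero_point) + zero_point;
  return static_cast<uint8_t>(std::lrintf(y));
}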

caffe2/operators/quantized/int8_relu_op.h

Lines changed: 56 additions & 5 deletions
@@ -1,19 +1,29 @@
 #ifndef CAFFE2_OPERATORS_INT8_RELU_OP_H_
 #define CAFFE2_OPERATORS_INT8_RELU_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
 #include "caffe2/operators/quantized/int8_utils.h"
-#include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
 
 namespace int8 {
 
 class Int8ReluOp final : public Operator<CPUContext> {
  public:
-  using Operator<CPUContext>::Operator;
+  Int8ReluOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws),
+        ws_(ws) {}
+
+  ~Int8ReluOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
+  }
 
   bool RunOnDevice() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
@@ -29,11 +39,52 @@ class Int8ReluOp final : public Operator<CPUContext> {
         this->template GetSingleArgument<float>("Y_scale", 1.0f);
     CHECK_EQ(Y_offset, X.zero_point);
     CHECK_EQ(Y_scale, X.scale);
-    EigenVectorMap<uint8_t>(Y->t.mutable_data<uint8_t>(), X.t.numel()) =
-        ConstEigenVectorMap<uint8_t>(X.t.data<uint8_t>(), X.t.numel())
-            .cwiseMax(QuantizeUint8(X.scale, X.zero_point, 0));
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_clamp_nc_u8(
+          1 /* channels */,
+          X.zero_point /* output min */,
+          255 /* output max */,
+          &qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK Clamp operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+    }
+
+    const qnnp_status setupStatus = qnnp_setup_clamp_nc_u8(
+        this->qnnpackOperator_,
+        X.t.numel() /* batch size */,
+        X.t.template data<uint8_t>(),
+        1 /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        1 /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK Clamp operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK Clamp operator");
+
     return true;
   }
+
+ private:
+  Workspace* ws_;
+  // QNNPACK Clamp operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
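Note how ReLU becomes a clamp here: under the affine scheme real = scale * (q - zero_point), any q below the zero point encodes a negative real value, so clamping the raw uint8 data to [zero_point, 255] computes exactly max(real, 0) without dequantizing. This matches the removed Eigen cwiseMax expression, which clamped at QuantizeUint8(X.scale, X.zero_point, 0). A one-line reference, illustrative only:

#include <algorithm>
#include <cstdint>

// Quantized ReLU as a clamp: saturating at the zero point zeroes out every
// encoded negative value, equivalent to clamp(q, zero_point, 255) for uint8.
uint8_t relu_q8_reference(uint8_t q, uint8_t zero_point) {
  return std::max(q, zero_point);
}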
