
Commit bf1d411

Marat Dukhan authored and facebook-github-bot committed
Switch Int8Softmax, Int8Relu, and Int8LeakyRelu to QNNPACK (pytorch#14933)
Summary: Int8Softmax: 4x-5x speedup compared to previous implementation

Pull Request resolved: pytorch#14933
Differential Revision: D13406820
Pulled By: Maratyszcza
fbshipit-source-id: ea8cbe1b861ddb7ff1b851d06d52c6fd6d04ed01
1 parent a1ea7db commit bf1d411

File tree: 4 files changed (+201, -193 lines)
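Both files shown below follow the same QNNPACK operator lifecycle: create the operator object once and cache it, re-run setup on every invocation to bind the current batch size and buffers, execute it (optionally on a thread pool), and delete it when the owning operator is destroyed. The following is a minimal sketch of that pattern, assembled from the diffs below; the initQNNPACK() helper comes from the Caffe2 sources, the function and variable names here are illustrative, and error handling is reduced to comments.

#include <qnnpack.h>

#include <cstddef>
#include <cstdint>

void initQNNPACK(); // Caffe2 helper used in the diffs below (one-time library init)

// Sketch: create-once / setup-per-run / run lifecycle for the clamp operator.
// A function-local static stands in for the member-variable caching the real
// operators use; qnnp_delete_operator belongs in the owning destructor.
void clamp_lifecycle_sketch(
    const uint8_t* x, uint8_t* y, size_t batch, uint8_t zero_point) {
  static qnnp_operator_t op = nullptr;
  initQNNPACK();
  if (op == nullptr) {
    // Quantized ReLU is expressed as a clamp to [zero_point, 255].
    qnnp_create_clamp_nc_u8(1 /* channels */, zero_point, 255, &op);
  }
  // Rebinding shapes and buffers is cheap compared to operator creation.
  qnnp_setup_clamp_nc_u8(op, batch, x, 1 /* X stride */, y, 1 /* Y stride */);
  qnnp_run_operator(op, nullptr /* thread pool */);
}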

caffe2/operators/quantized/int8_leaky_relu_op.h

Lines changed: 75 additions & 29 deletions
@@ -1,6 +1,8 @@
 #ifndef CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_
 #define CAFFE2_OPERATORS_INT8_LEAKY_RELU_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
@@ -13,48 +15,92 @@ namespace int8 {
 class Int8LeakyReluOp final : public Operator<CPUContext> {
  public:
   Int8LeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<CPUContext>(operator_def, ws) {
-    double alpha = this->template GetSingleArgument<float>("alpha", 0.01);
+      : Operator<CPUContext>(operator_def, ws),
+        ws_(ws) {
+    const float alpha = this->template GetSingleArgument<float>("alpha", 0.01);
     CAFFE_ENFORCE_GT(alpha, 0.0);
     CAFFE_ENFORCE_LT(alpha, 1.0);
-    QuantizeMultiplierSmallerThanOne(alpha, &multiplier_, &shift_);
+    this->alpha_ = alpha;
+  }
+
+  ~Int8LeakyReluOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
   }
 
   bool RunOnDevice() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
     auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
+    const int32_t Y_zero_point =
+        this->template GetSingleArgument<int>("Y_zero_point", 0);
+    const float Y_scale =
+        this->template GetSingleArgument<float>("Y_scale", 1);
+    CHECK_GE(Y_zero_point, std::numeric_limits<uint8_t>::min());
+    CHECK_LE(Y_zero_point, std::numeric_limits<uint8_t>::max());
+
+    /*
+     * Record quantization parameters for the input, because if the op is
+     * in-place, we may overwrite these parameters later, when we set
+     * quantization parameters for output tensor.
+     */
+    const uint8_t X_zero_point = X.zero_point;
+    const float X_scale = X.scale;
+
+    Y->scale = Y_scale;
+    Y->zero_point = Y_zero_point;
     Y->t.ResizeLike(X.t);
-    Y->scale = X.scale;
-    Y->zero_point = X.zero_point;
-    CHECK_GE(X.zero_point, std::numeric_limits<uint8_t>::min());
-    CHECK_LE(X.zero_point, std::numeric_limits<uint8_t>::max());
-    int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
-    auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
-    CHECK_EQ(Y_offset, X.zero_point);
-    CHECK_EQ(Y_scale, X.scale);
-
-    const uint8_t* Xdata = X.t.data<uint8_t>();
-    uint8_t* Ydata = Y->t.mutable_data<uint8_t>();
-
-    // For x < zero_point:
-    // (y - zero_point) * scale = alpha * (x - zero_point) * scale
-    // y = alpha * (x - zeropoint) + zero_point
-    for (int i = 0; i < X.t.numel(); i++) {
-      if (Xdata[i] < X.zero_point) {
-        int32_t out = MultiplyByQuantizedMultiplierSmallerThanOne(
-                          Xdata[i] - X.zero_point, multiplier_, shift_) +
-            X.zero_point;
-        Ydata[i] = static_cast<uint8_t>(out);
-      } else {
-        Ydata[i] = Xdata[i];
-      }
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_leaky_relu_nc_q8(
+          1 /* channels */,
+          this->alpha_,
+          static_cast<uint8_t>(X_zero_point), X_scale,
+          static_cast<uint8_t>(Y_zero_point), Y_scale,
+          0 /* output min */,
+          255 /* output max */,
+          &qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK Leaky ReLU operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
     }
+
+    const qnnp_status setupStatus = qnnp_setup_leaky_relu_nc_q8(
+        this->qnnpackOperator_,
+        X.t.numel() /* batch size */,
+        X.t.template data<uint8_t>(),
+        1 /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        1 /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK Leaky ReLU operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK Leaky ReLU operator");
+
     return true;
   }
 
  private:
-  int32_t multiplier_;
-  int shift_;
+  float alpha_;
+  Workspace* ws_;
+  // QNNPACK Leaky ReLU operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
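For reference, the scalar loop removed above implemented the quantized leaky-ReLU identity stated in its comment: when input and output share scale and zero point (which the old CHECK_EQs guaranteed), (y - zero_point) * scale = alpha * (x - zero_point) * scale, so the scale cancels and y = alpha * (x - zero_point) + zero_point. Below is a floating-point sketch of that per-element math; the removed code used fixed-point arithmetic via MultiplyByQuantizedMultiplierSmallerThanOne, and this helper is illustrative, not part of the commit.

#include <cmath>
#include <cstdint>

// Per-element reference for quantized leaky ReLU with shared quantization
// parameters: identity at or above the zero point, alpha-scaled below it.
uint8_t leaky_relu_q8_reference(uint8_t x, uint8_t zero_point, float alpha) {
  if (x >= zero_point) {
    return x; // encodes a non-negative real value; passes through unchanged
  }
  // y = alpha * (x - zero_point) + zero_point; stays within [0, zero_point]
  // for 0 < alpha < 1, so no saturation is needed.
  const float y = alpha * (static_cast<float>(x) - zero_point) + zero_point;
  return static_cast<uint8_t>(std::lrintf(y));
}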

caffe2/operators/quantized/int8_relu_op.h

Lines changed: 56 additions & 5 deletions
@@ -1,19 +1,29 @@
 #ifndef CAFFE2_OPERATORS_INT8_RELU_OP_H_
 #define CAFFE2_OPERATORS_INT8_RELU_OP_H_
 
+#include <qnnpack.h>
+
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor_int8.h"
 #include "caffe2/operators/quantized/int8_utils.h"
-#include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
 
 namespace int8 {
 
 class Int8ReluOp final : public Operator<CPUContext> {
  public:
-  using Operator<CPUContext>::Operator;
+  Int8ReluOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws),
+        ws_(ws) {}
+
+  ~Int8ReluOp() {
+    if (this->qnnpackOperator_ != nullptr) {
+      qnnp_delete_operator(this->qnnpackOperator_);
+      this->qnnpackOperator_ = nullptr;
+    }
+  }
 
   bool RunOnDevice() override {
     const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
@@ -29,11 +39,52 @@ class Int8ReluOp final : public Operator<CPUContext> {
         this->template GetSingleArgument<float>("Y_scale", 1.0f);
     CHECK_EQ(Y_offset, X.zero_point);
     CHECK_EQ(Y_scale, X.scale);
-    EigenVectorMap<uint8_t>(Y->t.mutable_data<uint8_t>(), X.t.numel()) =
-        ConstEigenVectorMap<uint8_t>(X.t.data<uint8_t>(), X.t.numel())
-            .cwiseMax(QuantizeUint8(X.scale, X.zero_point, 0));
+
+    initQNNPACK();
+
+    if (this->qnnpackOperator_ == nullptr) {
+      const qnnp_status createStatus = qnnp_create_clamp_nc_u8(
+          1 /* channels */,
+          X.zero_point /* output min */,
+          255 /* output max */,
+          &qnnpackOperator_);
+      CAFFE_ENFORCE(
+          createStatus == qnnp_status_success,
+          "failed to create QNNPACK Clamp operator");
+      CAFFE_ENFORCE(this->qnnpackOperator_ != nullptr);
+    }
+
+    const qnnp_status setupStatus = qnnp_setup_clamp_nc_u8(
+        this->qnnpackOperator_,
+        X.t.numel() /* batch size */,
+        X.t.template data<uint8_t>(),
+        1 /* X stride */,
+        Y->t.template mutable_data<uint8_t>(),
+        1 /* Y stride */);
+    CAFFE_ENFORCE(
+        setupStatus == qnnp_status_success,
+        "failed to setup QNNPACK Clamp operator");
+
+#ifdef FBCODE_CAFFE2
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */);
+#else
+    pthreadpool_t threadpool =
+        reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
+    const qnnp_status runStatus =
+        qnnp_run_operator(this->qnnpackOperator_, threadpool);
+#endif
+    CAFFE_ENFORCE(
+        runStatus == qnnp_status_success,
+        "failed to run QNNPACK Clamp operator");
+
     return true;
   }
+
+ private:
+  Workspace* ws_;
+  // QNNPACK Clamp operator
+  qnnp_operator_t qnnpackOperator_{nullptr};
 };
 
 } // namespace int8
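Note how ReLU becomes a clamp here: under the affine scheme real = scale * (q - zero_point), any q below the zero point encodes a negative real value, so clamping the raw uint8 data to [zero_point, 255] computes exactly max(real, 0) without dequantizing. This matches the removed Eigen cwiseMax expression, which clamped at QuantizeUint8(X.scale, X.zero_point, 0). A one-line reference, illustrative only:

#include <algorithm>
#include <cstdint>

// Quantized ReLU as a clamp: saturating at the zero point zeroes out every
// encoded negative value, equivalent to clamp(q, zero_point, 255) for uint8.
uint8_t relu_q8_reference(uint8_t q, uint8_t zero_point) {
  return std::max(q, zero_point);
}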
