Skip to content

Commit 7c7e09f

Browse files
Adding the Percentile op & UT
Reviewed By: MisterTea Differential Revision: D6879507 fbshipit-source-id: 7ca4165a42c073e384d3a6138ef033ca384afd49
1 parent 3f0a99d commit 7c7e09f

File tree

3 files changed

+262
-0
lines changed

3 files changed

+262
-0
lines changed

caffe2/operators/percentile_op.cc

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#include "caffe2/operators/percentile_op.h"

#include <algorithm>
#include <cstdint>
#include <numeric>
2+
3+
namespace caffe2 {
4+
5+
template <>
6+
bool PercentileOp<CPUContext>::RunOnDevice() {
7+
const auto& original_values = Input(X);
8+
CAFFE_ENFORCE_EQ(original_values.ndim(), 2);
9+
const auto num_examples = original_values.dim(0);
10+
const float* original_values_data = original_values.template data<float>();
11+
const auto num_features = original_values.dim(1);
12+
13+
const auto& value_pct_pairs = Input(VAL_PCT_PAIRS);
14+
CAFFE_ENFORCE_EQ(value_pct_pairs.ndim(), 2);
15+
CAFFE_ENFORCE_EQ(value_pct_pairs.dim(1), 2);
16+
const int num_values = value_pct_pairs.dim(0);
17+
const float* value_pct_data = value_pct_pairs.template data<float>();
18+
19+
const auto& lengths = Input(LENS);
20+
const int* lengths_data = lengths.template data<int>();
21+
CAFFE_ENFORCE_EQ(lengths.size(), num_features);
22+
23+
CAFFE_ENFORCE_EQ(
24+
std::accumulate(lengths_data, lengths_data + lengths.size(), 0),
25+
num_values,
26+
"Sum of lengths should be equal to the total number of samples");
27+
28+
values_tensor.Resize(num_values);
29+
percentiles_tensor.Resize(num_values);
30+
float* values_tensor_data = values_tensor.template mutable_data<float>();
31+
float* percentiles_tensor_data =
32+
percentiles_tensor.template mutable_data<float>();
33+
for (int ind = 0; ind < num_values; ind++) {
34+
values_tensor_data[ind] = value_pct_data[2 * ind];
35+
percentiles_tensor_data[ind] = value_pct_data[2 * ind + 1];
36+
}
37+
38+
auto* percentile_values = Output(PCT);
39+
percentile_values->ResizeLike(original_values);
40+
float* percentile_values_data =
41+
percentile_values->template mutable_data<float>();
42+
43+
int current_ind = 0;
44+
int current_dist_start = 0;
45+
int current_length;
46+
for (int i = 0; i < num_examples; i++) {
47+
current_dist_start = 0;
48+
49+
for (int j = 0; j < num_features; j++) {
50+
current_length = lengths_data[j];
51+
const auto lower_bound =
52+
std::lower_bound(
53+
values_tensor_data + current_dist_start,
54+
values_tensor_data + current_dist_start + current_length,
55+
original_values_data[current_ind]) -
56+
values_tensor_data;
57+
if (lower_bound == current_dist_start + current_length) {
58+
percentile_values_data[current_ind] = 1.0;
59+
} else if (
60+
original_values_data[current_ind] ==
61+
values_tensor_data[lower_bound]) {
62+
percentile_values_data[current_ind] =
63+
percentiles_tensor_data[lower_bound];
64+
} else if (lower_bound == current_dist_start) {
65+
percentile_values_data[current_ind] = 0.0;
66+
} else {
67+
float lower_pct = percentiles_tensor_data[lower_bound - 1];
68+
float upper_pct = percentiles_tensor_data[lower_bound];
69+
float interval_length = values_tensor_data[lower_bound] -
70+
values_tensor_data[lower_bound - 1];
71+
float normalized_dist_to_lower = (original_values_data[current_ind] -
72+
values_tensor_data[lower_bound - 1]) /
73+
interval_length;
74+
percentile_values_data[current_ind] =
75+
lower_pct + normalized_dist_to_lower * (upper_pct - lower_pct);
76+
}
77+
current_dist_start += current_length;
78+
current_ind++;
79+
}
80+
}
81+
return true;
82+
}
83+
84+
// Register the CPU implementation and describe the operator's interface.
// The schema text below is user-facing documentation; it is kept in sync with
// RunOnDevice(), which produces an output shaped like `original_values`.
REGISTER_CPU_OPERATOR(Percentile, PercentileOp<CPUContext>);
OPERATOR_SCHEMA(Percentile)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
This operator is used to find percentile representations for raw values, given a sample
set of raw values, labeled with their corresponding percentiles from the same distribution.
In particular, this operator takes as input a tensor of floats to find the percentile values
for, a 2D tensor of floats, where the first column of the tensor represents sampled values,
and the second column represents the percentile labels, and a tensor of integers lengths.

This lengths tensor is used because the operator works on multiple sets of raw values at the same time. For
example, for an input:
original_values=[[3, 5, 3],[5, 1, 6]], lengths = [2, 1, 1], value_to_pct = [[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]

Our operator expects that each column i of the input tensor is sampled from distribution i. Lengths tells
us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution two,
and the last is from distribution 3. We expect the output of our operator to give us [[0.2, 1.0, 0.6], [0.5, 0.3, 1.0]].

To calculate the percentile of an element, we check to see if its value is already mapped to
a percentile in value_to_pct. If so, we return that value. If not, we linearly interpolate between
the two closest values in value_to_pct. If the value is larger than all values in value_to_pct, we
return 1. If it's smaller than all the values, we return 0.

)DOC")
    .Input(
        0,
        "original_values",
        "Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.")
    .Input(
        1,
        "value_to_pct",
        "2D tensor with 2 columns, sorted by raw value within each distribution. Each element in the first column is a float representing the"
        " raw value of a sample. Its corresponding element in the next column represents the percentile it maps to.")
    .Input(
        2,
        "lengths",
        "1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor"
        " is equal to the total length of value_to_pct.")
    .Output(
        0,
        "percentile_values",
        "2D tensor of floats, with the same dimensions as the input tensor original_values. Each element "
        "of this tensor, percentile_values[i][j], corresponds to the percentile calculated "
        "for original_values[i][j].");

// The operator is a lookup/interpolation table and defines no gradient.
NO_GRADIENT(Percentile);
131+
132+
} // namespace caffe2

caffe2/operators/percentile_op.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// Operator to calculate percentile values for an input tensor of data,
2+
// given samples of data from the same distribution, labeled with their
3+
// percentile values.
4+
5+
#ifndef CAFFE2_OPERATORS_PERCENTILE_OP_H_
6+
#define CAFFE2_OPERATORS_PERCENTILE_OP_H_
7+
8+
#include "caffe2/core/context.h"
9+
#include "caffe2/core/logging.h"
10+
#include "caffe2/core/operator.h"
11+
#include "caffe2/core/tensor.h"
12+
#include "caffe2/utils/math.h"
13+
14+
namespace caffe2 {
15+
16+
template <class Context>
17+
class PercentileOp final : public Operator<Context> {
18+
public:
19+
USE_OPERATOR_CONTEXT_FUNCTIONS;
20+
PercentileOp(const OperatorDef& operator_def, Workspace* ws)
21+
: Operator<Context>(operator_def, ws) {}
22+
23+
bool RunOnDevice() override;
24+
25+
protected:
26+
INPUT_TAGS(X, VAL_PCT_PAIRS, LENS);
27+
OUTPUT_TAGS(PCT);
28+
Tensor<Context> values_tensor;
29+
Tensor<Context> percentiles_tensor;
30+
};
31+
32+
} // namespace caffe2
33+
34+
#endif // CAFFE2_OPERATORS_PERCENTILE_OP_H_
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
from __future__ import absolute_import
2+
from __future__ import division
3+
from __future__ import print_function
4+
from __future__ import unicode_literals
5+
6+
from caffe2.python import core, workspace, dyndep
7+
import caffe2.python.hypothesis_test_util as hu
8+
import numpy as np
9+
10+
class TestPercentileOp(hu.HypothesisTestCase):
    """Fixed-input checks of the Percentile operator against known outputs."""

    def _test_percentile_op(
        self,
        original_inp,
        value_to_pct_map,
        dist_lengths,
        expected_values
    ):
        """Run Percentile on the given inputs and compare with the expectation.

        original_inp and value_to_pct_map are fed as float32 blobs and
        dist_lengths as an int32 blob; the fetched output must match
        expected_values to 5 decimal places.
        """
        # (blob name, python data, dtype) in the operator's input order.
        feeds = [
            ('original_values', original_inp, np.float32),
            ('value_to_pct_map', value_to_pct_map, np.float32),
            ('dist_lengths', dist_lengths, np.int32),
        ]
        output_blob = 'percentile_values'

        percentile_op = core.CreateOperator(
            'Percentile',
            [blob_name for blob_name, _, _ in feeds],
            [output_blob]
        )
        for blob_name, data, dtype in feeds:
            workspace.FeedBlob(blob_name, np.array(data, dtype=dtype))
        workspace.RunOperatorOnce(percentile_op)

        actual_values = workspace.FetchBlob(output_blob)
        np.testing.assert_array_almost_equal(
            actual_values,
            np.array(expected_values),
            decimal=5
        )

    def test_percentile_op_with_only_one_dist(self):
        """A single value that is present in a single one-sample distribution."""
        self._test_percentile_op(
            original_inp=[[5]],
            value_to_pct_map=[[5, 0.4]],
            dist_lengths=[1],
            expected_values=[[0.4]]
        )

    def test_percentile_op_with_all_elements_in_map(self):
        """Every queried value has an exact entry in its distribution."""
        self._test_percentile_op(
            original_inp=[[3, 4], [10, 4]],
            value_to_pct_map=[[3, 0.3], [4, 0.6], [10, 0.8], [4, 0.5], [5, 0.6]],
            dist_lengths=[3, 2],
            expected_values=[[0.3, 0.5], [0.8, 0.5]],
        )

    def test_percentile_op_with_same_value(self):
        """The same raw value is resolved per column, not globally."""
        self._test_percentile_op(
            original_inp=[[1, 1], [1, 2]],
            value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.5]],
            dist_lengths=[2, 1],
            expected_values=[[0.1, 0.0], [0.1, 0.5]]
        )

    def test_percentile_op_with_elements_bigger_than_map_range(self):
        """Values above every sample of their distribution map to 1."""
        self._test_percentile_op(
            original_inp=[[1, 5], [3, 4]],
            value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.1], [3, 0.3]],
            dist_lengths=[2, 2],
            expected_values=[[0.1, 1.], [0.3, 1.0]]
        )

    def test_percentile_op_with_elements_smaller_than_map_range(self):
        """Values below every sample of their distribution map to 0."""
        self._test_percentile_op(
            original_inp=[[1], [5], [6]],
            value_to_pct_map=[[2, 0.2], [5, 0.5], [7, 0.5]],
            dist_lengths=[3],
            expected_values=[[0.0], [0.5], [0.5]]
        )

    def test_percentile_op_with_interpolation(self):
        """Values between two samples are linearly interpolated."""
        self._test_percentile_op(
            original_inp=[[3, 2, 5], [6, 7, 8]],
            value_to_pct_map=[[1, 0.1], [4, 0.7], [4.5, 0.8],
                              [6, 0.5], [8, 0.9],
                              [8, 0.6]],
            dist_lengths=[3, 2, 1],
            expected_values=[[0.5, 0.0, 0.0], [1.0, 0.7, 0.6]]
        )

    def test_percentile_op_with_large_sample_size_per_dist(self):
        """Distributions with several samples each."""
        self._test_percentile_op(
            original_inp=[[3, 1], [5, 7]],
            value_to_pct_map=[[3, 0.5], [4, 0.6], [5, 0.7],
                              [1, 0.2], [2, 0.3], [5, 0.8]],
            dist_lengths=[3, 3],
            expected_values=[[0.5, 0.2], [0.7, 1.0]]
        )
92+
93+
94+
if __name__ == "__main__":
    # Run the test cases above when this module is executed directly.
    from unittest import main
    main()

0 commit comments

Comments
 (0)