Adding the Percentile op & UT

eugene-kharitonov · facebook-github-bot · commit 7c7e09fe2dbf · 2018-02-05T16:08:00.000-08:00
Reviewed By: MisterTea

Differential Revision: D6879507

fbshipit-source-id: 7ca4165a42c073e384d3a6138ef033ca384afd49
diff --git a/caffe2/operators/percentile_op.cc b/caffe2/operators/percentile_op.cc
@@ -0,0 +1,132 @@
+#include "caffe2/operators/percentile_op.h"
+
+#include <algorithm>
+#include <numeric>
+
+namespace caffe2 {
+
+// CPU implementation of the Percentile operator.
+//
+// Inputs:
+//   X             - 2D float tensor (num_examples x num_features) of raw values.
+//   VAL_PCT_PAIRS - 2D float tensor (num_values x 2). Column 0 holds sampled
+//                   values, column 1 the percentile each sample maps to. Rows
+//                   are grouped per feature; each group is searched with
+//                   std::lower_bound, so its values must be in ascending order.
+//   LENS          - int tensor with one entry per feature: the number of rows
+//                   of VAL_PCT_PAIRS that belong to that feature.
+// Output:
+//   PCT           - float tensor shaped like X. Each element is the percentile
+//                   of the matching element of X within its feature's group:
+//                   the stored percentile on an exact match, 1 above the
+//                   group's range (or for an empty group), 0 below it, and a
+//                   linear interpolation between the two neighbours otherwise.
+//
+// Enforces the input ranks, that LENS has one entry per feature, that every
+// length is non-negative and that the lengths add up to num_values.
+template <>
+bool PercentileOp<CPUContext>::RunOnDevice() {
+  const auto& original_values = Input(X);
+  CAFFE_ENFORCE_EQ(original_values.ndim(), 2);
+  const auto num_examples = original_values.dim(0);
+  const float* original_values_data = original_values.template data<float>();
+  const auto num_features = original_values.dim(1);
+
+  const auto& value_pct_pairs = Input(VAL_PCT_PAIRS);
+  CAFFE_ENFORCE_EQ(value_pct_pairs.ndim(), 2);
+  CAFFE_ENFORCE_EQ(value_pct_pairs.dim(1), 2);
+  const int num_values = value_pct_pairs.dim(0);
+  const float* value_pct_data = value_pct_pairs.template data<float>();
+
+  const auto& lengths = Input(LENS);
+  const int* lengths_data = lengths.template data<int>();
+  CAFFE_ENFORCE_EQ(lengths.size(), num_features);
+  const int* lengths_end = lengths_data + lengths.size();
+
+  // A negative length can be compensated by larger ones and still satisfy the
+  // sum check below, which would make a group's search range run outside the
+  // value table. Reject it up front.
+  CAFFE_ENFORCE(
+      std::all_of(
+          lengths_data, lengths_end, [](int length) { return length >= 0; }),
+      "Lengths must be non-negative");
+
+  // Accumulate in a 64-bit type so that a large total cannot overflow int.
+  CAFFE_ENFORCE_EQ(
+      std::accumulate(lengths_data, lengths_end, TIndex{0}),
+      num_values,
+      "Sum of lengths should be equal to the total number of samples");
+
+  // De-interleave the (value, percentile) rows into two contiguous arrays so
+  // that the value column can be binary-searched directly.
+  values_tensor.Resize(num_values);
+  percentiles_tensor.Resize(num_values);
+  float* values_tensor_data = values_tensor.template mutable_data<float>();
+  float* percentiles_tensor_data =
+      percentiles_tensor.template mutable_data<float>();
+  for (int ind = 0; ind < num_values; ind++) {
+    values_tensor_data[ind] = value_pct_data[2 * ind];
+    percentiles_tensor_data[ind] = value_pct_data[2 * ind + 1];
+  }
+
+  auto* percentile_values = Output(PCT);
+  percentile_values->ResizeLike(original_values);
+  float* percentile_values_data =
+      percentile_values->template mutable_data<float>();
+
+  // Index of the current element in the row-major input/output buffers.
+  TIndex flat_index = 0;
+  for (TIndex example = 0; example < num_examples; ++example) {
+    // Every row walks the per-feature groups from the start of the table.
+    int dist_begin = 0;
+
+    for (TIndex feature = 0; feature < num_features;
+         ++feature, ++flat_index) {
+      const int dist_end = dist_begin + lengths_data[feature];
+      const float value = original_values_data[flat_index];
+
+      // Position (in the whole table) of the first sample of this group that
+      // is not less than `value`, or dist_end when there is none.
+      const auto pos = std::lower_bound(
+                           values_tensor_data + dist_begin,
+                           values_tensor_data + dist_end,
+                           value) -
+          values_tensor_data;
+
+      float percentile;
+      if (pos == dist_end) {
+        // Larger than every sample of the group (or the group is empty).
+        percentile = 1.0f;
+      } else if (value == values_tensor_data[pos]) {
+        // Exact match: use the percentile stored for that sample.
+        percentile = percentiles_tensor_data[pos];
+      } else if (pos == dist_begin) {
+        // Smaller than every sample of the group.
+        percentile = 0.0f;
+      } else {
+        // Strictly between samples pos - 1 and pos: interpolate linearly.
+        const float lower_pct = percentiles_tensor_data[pos - 1];
+        const float upper_pct = percentiles_tensor_data[pos];
+        const float interval_length =
+            values_tensor_data[pos] - values_tensor_data[pos - 1];
+        const float normalized_dist_to_lower =
+            (value - values_tensor_data[pos - 1]) / interval_length;
+        percentile =
+            lower_pct + normalized_dist_to_lower * (upper_pct - lower_pct);
+      }
+      percentile_values_data[flat_index] = percentile;
+
+      dist_begin = dist_end;
+    }
+  }
+  return true;
+}
+
+// Registers the CPU implementation of Percentile together with its schema
+// (three inputs, one output). The operator is declared to have no gradient.
+REGISTER_CPU_OPERATOR(Percentile, PercentileOp<CPUContext>);
+OPERATOR_SCHEMA(Percentile)
+    .NumInputs(3)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+    This operator is used to find percentile representations for raw values, given a sample
+    set of raw values, labeled with their corresponding percentiles from the same distribution.
+    In particular, this operator takes as input a tensor of floats to find the percentile values
+    for, a 2D tensor of floats, where the first column of the tensor represents sampled values,
+    and the second column represents the percentile labels, and a tensor of integer lengths.
+
+    This lengths tensor is used because the operator works on multiple sets of raw values at the same time. For
+    example, for an input:
+    original_values=[[3, 5, 3],[5, 1, 6]], lengths = [2, 1, 1], value_to_pct = [[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]
+
+    Our operator expects that each column i of the input tensor is sampled from distribution i. Lengths tells
+    us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution 2,
+    and the last is from distribution 3. We expect the output of our operator to give us [[0.2, 1.0, 0.6], [0.5, 0.3, 1.0]].
+
+    To calculate the percentile of an element, we check to see if its value is already mapped to
+    a percentile in value_to_pct. If so, we return that value. If not, we linearly interpolate between
+    the two closest values in value_to_pct. If the value is larger than all values in value_to_pct, we
+    return 1. If it's smaller than all the values, we return 0.
+
+)DOC")
+    .Input(
+        0,
+        "original_values",
+        "Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.")
+    .Input(
+        1,
+        "value_to_pct",
+        "2D tensor with 2 columns, sorted by raw value within each distribution. Each element in the first column"
+        " is a float representing the raw value of a sample. Its corresponding element in the next column"
+        " represents the percentile it maps to.")
+    .Input(
+        2,
+        "lengths",
+        "1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor"
+        " is equal to the total length of value_to_pct.")
+    .Output(
+        0,
+        "percentile_values",
+        "Tensor of floats, with the same shape as original_values. Each element "
+        "of this tensor, percentile_values[i][j], corresponds to the percentile calculated "
+        "for original_values[i][j].");
+
+NO_GRADIENT(Percentile);
+
+} // namespace caffe2
diff --git a/caffe2/operators/percentile_op.h b/caffe2/operators/percentile_op.h
@@ -0,0 +1,34 @@
+// Operator to calculate percentile values for an input tensor of data,
+// given samples of data from the same distribution, labeled with their
+// percentile values.
+
+#ifndef CAFFE2_OPERATORS_PERCENTILE_OP_H_
+#define CAFFE2_OPERATORS_PERCENTILE_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+// Operator with three inputs (X, VAL_PCT_PAIRS, LENS) and one output (PCT).
+// Only RunOnDevice is declared here; its definition lives outside this header.
+template <typename Context>
+class PercentileOp final : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  // Forwards the operator definition and workspace to the base Operator.
+  PercentileOp(const OperatorDef& def, Workspace* workspace)
+      : Operator<Context>(def, workspace) {}
+
+  bool RunOnDevice() override;
+
+ protected:
+  INPUT_TAGS(X, VAL_PCT_PAIRS, LENS);
+  OUTPUT_TAGS(PCT);
+
+  // Tensors owned by the operator instance, so they persist across runs.
+  Tensor<Context> values_tensor;
+  Tensor<Context> percentiles_tensor;
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_PERCENTILE_OP_H_
diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py
@@ -0,0 +1,96 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core, workspace, dyndep
+import caffe2.python.hypothesis_test_util as hu
+import numpy as np
+
+class TestPercentileOp(hu.HypothesisTestCase):
+    def _test_percentile_op(
+        self,
+        original_inp,
+        value_to_pct_map,
+        dist_lengths,
+        expected_values
+    ):
+        op = core.CreateOperator(
+            'Percentile',
+            ['original_values', 'value_to_pct_map', 'dist_lengths'],
+            ['percentile_values']
+        )
+        workspace.FeedBlob('original_values', np.array(original_inp, dtype=np.float32))
+        workspace.FeedBlob(
+            'value_to_pct_map', np.array(value_to_pct_map, dtype=np.float32))
+        workspace.FeedBlob('dist_lengths', np.array(dist_lengths, dtype=np.int32))
+        workspace.RunOperatorOnce(op)
+        np.testing.assert_array_almost_equal(
+            workspace.FetchBlob('percentile_values'),
+            np.array(expected_values),
+            decimal=5
+        )
+
+    def test_percentile_op_with_only_one_dist(self):
+        self._test_percentile_op(
+            original_inp=[[5]],
+            value_to_pct_map=[[5, 0.4]],
+            dist_lengths=[1],
+            expected_values=[[0.4]]
+        )
+
+    def test_percentile_op_with_all_elements_in_map(self):
+        self._test_percentile_op(
+            original_inp=[[3, 4], [10, 4]],
+            value_to_pct_map=[[3, 0.3], [4, 0.6], [10, 0.8], [4, 0.5], [5, 0.6]],
+            dist_lengths=[3, 2],
+            expected_values=[[0.3, 0.5], [0.8, 0.5]],
+        )
+
+    def test_percentile_op_with_same_value(self):
+        self._test_percentile_op(
+            original_inp=[[1, 1], [1, 2]],
+            value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.5]],
+            dist_lengths=[2, 1],
+            expected_values=[[0.1, 0.0], [0.1, 0.5]]
+        )
+
+    def test_percentile_op_with_elements_bigger_than_map_range(self):
+        self._test_percentile_op(
+            original_inp=[[1, 5], [3, 4]],
+            value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.1], [3, 0.3]],
+            dist_lengths=[2, 2],
+            expected_values=[[0.1, 1.], [0.3, 1.0]]
+        )
+
+    def test_percentile_op_with_elements_smaller_than_map_range(self):
+        self._test_percentile_op(
+            original_inp=[[1], [5], [6]],
+            value_to_pct_map=[[2, 0.2], [5, 0.5], [7, 0.5]],
+            dist_lengths=[3],
+            expected_values=[[0.0], [0.5], [0.5]]
+        )
+
+    def test_percentile_op_with_interpolation(self):
+        self._test_percentile_op(
+            original_inp=[[3, 2, 5], [6, 7, 8]],
+            value_to_pct_map=[[1, 0.1], [4, 0.7], [4.5, 0.8],
+                              [6, 0.5], [8, 0.9],
+                              [8, 0.6]],
+            dist_lengths=[3, 2, 1],
+            expected_values=[[0.5, 0.0, 0.0], [1.0, 0.7, 0.6]]
+        )
+
+    def test_percentile_op_with_large_sample_size_per_dist(self):
+        self._test_percentile_op(
+            original_inp=[[3, 1], [5, 7]],
+            value_to_pct_map=[[3, 0.5], [4, 0.6], [5, 0.7],
+                              [1, 0.2], [2, 0.3], [5, 0.8]],
+            dist_lengths=[3, 3],
+            expected_values=[[0.5, 0.2], [0.7, 1.0]]
+        )
+
+
+if __name__ == "__main__":  # Only run the suite when executed as a script.
+    import unittest
+    unittest.main()