|
| 1 | +#include "caffe2/operators/percentile_op.h" |
| 2 | + |
| 3 | +namespace caffe2 { |
| 4 | + |
| 5 | +template <> |
| 6 | +bool PercentileOp<CPUContext>::RunOnDevice() { |
| 7 | + const auto& original_values = Input(X); |
| 8 | + CAFFE_ENFORCE_EQ(original_values.ndim(), 2); |
| 9 | + const auto num_examples = original_values.dim(0); |
| 10 | + const float* original_values_data = original_values.template data<float>(); |
| 11 | + const auto num_features = original_values.dim(1); |
| 12 | + |
| 13 | + const auto& value_pct_pairs = Input(VAL_PCT_PAIRS); |
| 14 | + CAFFE_ENFORCE_EQ(value_pct_pairs.ndim(), 2); |
| 15 | + CAFFE_ENFORCE_EQ(value_pct_pairs.dim(1), 2); |
| 16 | + const int num_values = value_pct_pairs.dim(0); |
| 17 | + const float* value_pct_data = value_pct_pairs.template data<float>(); |
| 18 | + |
| 19 | + const auto& lengths = Input(LENS); |
| 20 | + const int* lengths_data = lengths.template data<int>(); |
| 21 | + CAFFE_ENFORCE_EQ(lengths.size(), num_features); |
| 22 | + |
| 23 | + CAFFE_ENFORCE_EQ( |
| 24 | + std::accumulate(lengths_data, lengths_data + lengths.size(), 0), |
| 25 | + num_values, |
| 26 | + "Sum of lengths should be equal to the total number of samples"); |
| 27 | + |
| 28 | + values_tensor.Resize(num_values); |
| 29 | + percentiles_tensor.Resize(num_values); |
| 30 | + float* values_tensor_data = values_tensor.template mutable_data<float>(); |
| 31 | + float* percentiles_tensor_data = |
| 32 | + percentiles_tensor.template mutable_data<float>(); |
| 33 | + for (int ind = 0; ind < num_values; ind++) { |
| 34 | + values_tensor_data[ind] = value_pct_data[2 * ind]; |
| 35 | + percentiles_tensor_data[ind] = value_pct_data[2 * ind + 1]; |
| 36 | + } |
| 37 | + |
| 38 | + auto* percentile_values = Output(PCT); |
| 39 | + percentile_values->ResizeLike(original_values); |
| 40 | + float* percentile_values_data = |
| 41 | + percentile_values->template mutable_data<float>(); |
| 42 | + |
| 43 | + int current_ind = 0; |
| 44 | + int current_dist_start = 0; |
| 45 | + int current_length; |
| 46 | + for (int i = 0; i < num_examples; i++) { |
| 47 | + current_dist_start = 0; |
| 48 | + |
| 49 | + for (int j = 0; j < num_features; j++) { |
| 50 | + current_length = lengths_data[j]; |
| 51 | + const auto lower_bound = |
| 52 | + std::lower_bound( |
| 53 | + values_tensor_data + current_dist_start, |
| 54 | + values_tensor_data + current_dist_start + current_length, |
| 55 | + original_values_data[current_ind]) - |
| 56 | + values_tensor_data; |
| 57 | + if (lower_bound == current_dist_start + current_length) { |
| 58 | + percentile_values_data[current_ind] = 1.0; |
| 59 | + } else if ( |
| 60 | + original_values_data[current_ind] == |
| 61 | + values_tensor_data[lower_bound]) { |
| 62 | + percentile_values_data[current_ind] = |
| 63 | + percentiles_tensor_data[lower_bound]; |
| 64 | + } else if (lower_bound == current_dist_start) { |
| 65 | + percentile_values_data[current_ind] = 0.0; |
| 66 | + } else { |
| 67 | + float lower_pct = percentiles_tensor_data[lower_bound - 1]; |
| 68 | + float upper_pct = percentiles_tensor_data[lower_bound]; |
| 69 | + float interval_length = values_tensor_data[lower_bound] - |
| 70 | + values_tensor_data[lower_bound - 1]; |
| 71 | + float normalized_dist_to_lower = (original_values_data[current_ind] - |
| 72 | + values_tensor_data[lower_bound - 1]) / |
| 73 | + interval_length; |
| 74 | + percentile_values_data[current_ind] = |
| 75 | + lower_pct + normalized_dist_to_lower * (upper_pct - lower_pct); |
| 76 | + } |
| 77 | + current_dist_start += current_length; |
| 78 | + current_ind++; |
| 79 | + } |
| 80 | + } |
| 81 | + return true; |
| 82 | +} |
| 83 | + |
| 84 | +REGISTER_CPU_OPERATOR(Percentile, PercentileOp<CPUContext>); |
| 85 | +OPERATOR_SCHEMA(Percentile) |
| 86 | + .NumInputs(3) |
| 87 | + .NumOutputs(1) |
| 88 | + .SetDoc(R"DOC( |
| 89 | + This operator is used to find percentile representations for raw values, given a sample |
| 90 | + set of raw values, labeled with their corresponding percentiles from the same distribution. |
| 91 | + In particular, this operator takes as input a tensor of floats to find the percentile values |
| 92 | + for, a 2D tensor of floats, where the first column of the tensor represents sampled values, |
| 93 | + and the second column represents the percentile labels, and a tensor of integers lengths. |
| 94 | +
|
| 95 | + This lengths tensor is used because the operator works on multiple sets of raw values at the same time. For |
| 96 | + example, for an input: |
| 97 | + original_values=[[3, 5, 3],[5, 1, 6]], lengths = [2, 1, 1], value_to_pct = [[3, 0.2], [5, 0.5], [1, 0.3], [3. 0.6]] |
| 98 | +
|
| 99 | + Our operator expects that each column i of the input tensor is sampled from distribution i. Lengths tells |
| 100 | + us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution two, |
| 101 | + and the last is from distribution 3. We expect the output of our operator to give us [[0.2, 1.0, 0.6], [0.5, 0.3, 1.0]]. |
| 102 | +
|
| 103 | + To calculate the percentile of an element, we check to see if its value is already mapped to |
| 104 | + a percentile in value_to_pct. If so, we return that value. If not, we linearly interpolate between |
| 105 | + the two closest values in value_to_pct. If the value is larger than all values in value_to_pct, we |
| 106 | + return 1. If it's smaller than all the values, we return 0. |
| 107 | +
|
| 108 | +)DOC") |
| 109 | + .Input( |
| 110 | + 0, |
| 111 | + "original_values", |
| 112 | + "Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.") |
| 113 | + .Input( |
| 114 | + 1, |
| 115 | + "value_to_pct", |
| 116 | + "Sorted 2D tensor, with 2 columns. Each element in the first column is a float representing the" |
| 117 | + " raw value of a sample. Its corresponding element in the next column represents the percentile it maps to.") |
| 118 | + .Input( |
| 119 | + 2, |
| 120 | + "lengths", |
| 121 | + "1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor" |
| 122 | + " is equal to the total length of value_to_pct.") |
| 123 | + .Output( |
| 124 | + 0, |
| 125 | + "percentile_values", |
| 126 | + "1D tensor of floats, with the same dimensions as the flattened input tensor. Each element " |
| 127 | + "of this tensor, percentile_values[i], corresponds to the percentile calculated " |
| 128 | + "for original_values[i]."); |
| 129 | + |
| 130 | +NO_GRADIENT(Percentile); |
| 131 | + |
| 132 | +} // namespace caffe2 |
0 commit comments