Commit da73d70

Yangqing Jia authored and facebook-github-bot committed
Remove UnsafeCoalesce op (pytorch#12897)
Summary:
Pull Request resolved: pytorch#12897

The UnsafeCoalesce op dates from the memonger days, when we coalesced operators to get more efficient computation kernels. It creates a somewhat unsafe underlying memory storage pattern: the per-tensor outputs alias a single coalesced blob. With the new tensor unification I am not sure it is still safe for us to do this, so I propose we delete the op for the sake of safety.

Reviewed By: bddppq, ilia-cher

Differential Revision: D10475980

fbshipit-source-id: b1a838c9f47d681c309ee8e2f961b432236e157e
1 parent c774cb8 commit da73d70
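
For readers skimming the diff: the "unsafe underlying memory storage pattern" mentioned above is that every per-tensor output of UnsafeCoalesce aliases a slice of one coalesced buffer. A minimal NumPy sketch of why such aliases are fragile once the owning buffer can be reallocated (illustrative only; NumPy views stand in for caffe2 tensors, and none of these names are caffe2 API):

import numpy as np

# Illustrative only: NumPy views stand in for the aliased caffe2 tensors;
# none of these names are caffe2 API.
coalesced = np.zeros(16, dtype=np.uint8)   # the single coalesced blob
out0 = coalesced[0:8]                      # Output(0) aliases offset 0
out1 = coalesced[8:16]                     # Output(1) aliases offset 8
assert out0.base is coalesced and out1.base is coalesced

out0[:] = 1
assert coalesced[0] == 1                   # writes go through the alias

# If the blob's storage is ever reallocated (resized, moved, reclaimed),
# the aliases silently keep pointing at the old storage:
coalesced = np.zeros(32, dtype=np.uint8)   # simulate a reallocation
out0[:] = 2
assert coalesced[0] == 0                   # out0 no longer aliases the blob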

File tree

4 files changed: +0 −117 lines

caffe2/operators/utility_ops.cc

Lines changed: 0 additions & 28 deletions
@@ -72,8 +72,6 @@ OPERATOR_SCHEMA(WallClockTime)
     .SetDoc("Time since epoch in nanoseconds.")
     .Output(0, "time", "The time in nanoseconds.");

-REGISTER_CPU_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CPUContext>);
-
 OPERATOR_SCHEMA(Print)
     .NumInputs(1)
     .NumOutputs(0)
@@ -649,31 +647,6 @@ weights derived by lengths. i.e 1/pow(length, power)

 SHOULD_NOT_DO_GRADIENT(WallClockTime);

-OPERATOR_SCHEMA(UnsafeCoalesce)
-    .NumInputsOutputs([](int inputs, int outputs) {
-      return inputs + 1 == outputs;
-    })
-    .AllowInplace([](int input, int output) { return input == output; })
-    .SetDoc(R"DOC(
-Coalesce the N inputs into N outputs and a single coalesced output blob.
-
-This allows operations that operate over multiple small kernels (e.g.
-biases in a deep CNN) to be coalesced into a single larger operation,
-amortizing the kernel launch overhead, synchronization costs for
-distributed computation, etc.
-
-The operator:
-
-- computes the total size of the coalesced blob by summing the input sizes
-- allocates the coalesced output blob as the total size
-- copies the input vectors into the coalesced blob, at the correct offset.
-- aliases each Output(i) to point into the coalesced blob, at the corresponding offset for Input(i).
-
-This is 'unsafe' as the output vectors are aliased, so use with
-caution.
-
-)DOC");
-
 OPERATOR_SCHEMA(EnsureDense)
     .NumInputs(1)
     .NumOutputs(1)
@@ -739,7 +712,6 @@ SHOULD_NOT_DO_GRADIENT(Print);
 SHOULD_NOT_DO_GRADIENT(HasElements);
 SHOULD_NOT_DO_GRADIENT(IsEmpty);
 SHOULD_NOT_DO_GRADIENT(LengthsToShape);
-SHOULD_NOT_DO_GRADIENT(UnsafeCoalesce);

 class GetAliasGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
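
The removed schema doc above lists the operator's four steps: sum the input sizes, allocate the coalesced blob, copy each input in at its offset, and alias each output into the blob. A self-contained NumPy sketch of those steps (hypothetical helper names, not caffe2 API; the 64-byte alignment is assumed from the implementation and test removed below):

import numpy as np

ALIGNMENT = 64  # assumed value of gCaffe2Alignment (the removed test uses 64)

def round_to_alignment(nbytes):
    # same arithmetic as the removed roundToAlignment lambda in utility_ops.h
    return ((nbytes + ALIGNMENT - 1) // ALIGNMENT) * ALIGNMENT

def unsafe_coalesce(inputs):
    # 1. total size of the coalesced blob = sum of (aligned) input sizes
    total = sum(round_to_alignment(x.nbytes) for x in inputs)
    # 2. allocate the coalesced output blob at that size, zero-filled
    coalesced = np.zeros(total, dtype=np.uint8)
    # 3. copy each input into the blob at its offset, and
    # 4. alias each output as a view into the blob at the same offset
    outputs, offset = [], 0
    for x in inputs:
        coalesced[offset:offset + x.nbytes] = np.frombuffer(x.tobytes(), np.uint8)
        view = coalesced[offset:offset + x.nbytes].view(x.dtype).reshape(x.shape)
        outputs.append(view)   # aliases the coalesced storage, hence "unsafe"
        offset += round_to_alignment(x.nbytes)
    return outputs + [coalesced]

outs = unsafe_coalesce([np.arange(3, dtype=np.float32), np.ones(5)])
assert outs[0].base is not None   # each output is a view into the blob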

caffe2/operators/utility_ops.cu

Lines changed: 0 additions & 2 deletions
@@ -44,8 +44,6 @@ REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(Sum, SumOp<CUDAContext>);
 REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp<CUDAContext>);

-REGISTER_CUDA_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CUDAContext>);
-
 CAFFE_KNOWN_TYPE(const float*);

 REGISTER_CUDA_OPERATOR(EnsureDense, EnsureDenseOp<CUDAContext>);

caffe2/operators/utility_ops.h

Lines changed: 0 additions & 57 deletions
@@ -1144,63 +1144,6 @@ class LengthsGatherOp : public Operator<Context> {
   INPUT_TAGS(ITEMS, LENGTHS, INDICES);
 };

-template <class Context>
-class UnsafeCoalesceOp final : public Operator<Context> {
- public:
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-  using Operator<Context>::Operator;
-
-  bool RunOnDevice() override {
-    size_t coalesced_size = 0;
-    for (int i = 0; i < InputSize(); ++i) {
-      CAFFE_ENFORCE(
-          !Input(i).meta().placementNew(),
-          "Must only coalesce fundamental types, error at input: ",
-          i);
-    }
-
-    auto roundToAlignment = [](size_t bytes) -> size_t {
-      return ((bytes + gCaffe2Alignment - 1) / gCaffe2Alignment) *
-          gCaffe2Alignment;
-    };
-
-    for (int i = 0; i < InputSize(); ++i) {
-      coalesced_size += roundToAlignment(Input(i).nbytes());
-    }
-
-    auto* coalesced = Output(OutputSize() - 1);
-    coalesced->Resize(coalesced_size);
-    math::Set<uint8_t, Context>(
-        coalesced_size,
-        0.0,
-        coalesced->template mutable_data<uint8_t>(),
-        &context_);
-
-    size_t coalesced_offset = 0;
-    for (auto i = 0; i < InputSize(); ++i) {
-      const auto input_nbytes = Input(i).nbytes();
-      context_.CopyBytesSameDevice(
-          input_nbytes,
-          (const uint8_t*)Input(i).raw_data(),
-          coalesced->template mutable_data<uint8_t>() + coalesced_offset);
-
-      // Note: this could cause Input(i) to free its data if
-      // Output(i) and Input(i) alias each other. This is safe on a
-      // GPU (as the copy will happen-before the free), but it's
-      // worth mentioning.
-
-      Output(i)->ResizeLike(Input(i));
-      Output(i)->ShareExternalPointer(
-          static_cast<void*>(
-              coalesced->template mutable_data<uint8_t>() + coalesced_offset),
-          Input(i).meta(),
-          input_nbytes);
-      coalesced_offset += roundToAlignment(input_nbytes);
-    }
-    return true;
-  }
-};
-
 template <typename T, class Context>
 class AccumulateHistogramOp : public Operator<Context> {
  public:
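
The roundToAlignment lambda removed above pads each input's byte count up to the next multiple of gCaffe2Alignment, so every output's offset into the coalesced blob stays aligned. A worked example of that integer arithmetic, assuming the 64-byte value the removed test below uses:

ALIGNMENT = 64  # stand-in for gCaffe2Alignment; the removed test uses 64

def round_to_alignment(nbytes):
    # identical arithmetic to the removed C++ lambda
    return ((nbytes + ALIGNMENT - 1) // ALIGNMENT) * ALIGNMENT

assert round_to_alignment(1) == 64    # small buffers pad up to one unit
assert round_to_alignment(64) == 64   # exact multiples are unchanged
assert round_to_alignment(65) == 128  # one byte over spills to the next unit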

caffe2/python/hypothesis_test.py

Lines changed: 0 additions & 30 deletions
@@ -2223,36 +2223,6 @@ def ref_nhwc(x, scale, bias):
         for blob, arr in feeds:
             np.testing.assert_array_equal(ws.blobs[blob].fetch(), arr)

-    @given(sizes=st.lists(st.integers(1, 100), min_size=1),
-           in_place=st.booleans(),
-           **hu.gcs)
-    def test_unsafe_coalesce(self, sizes, in_place, gc, dc):
-        gAlignment = 64
-        Xs = [np.random.randn(size)
-              .astype(np.random.choice([np.float32, np.float64, np.uint8]))
-              for size in sizes]
-        op = core.CreateOperator(
-            "UnsafeCoalesce",
-            ["X_{}".format(i) for i, _ in enumerate(sizes)],
-            [("X_{}" if in_place else "Y_{}").format(i)
-             for i, _ in enumerate(sizes)] + ["coalesced"])
-        self.assertDeviceChecks(dc, op, Xs, list(range(len(sizes) + 1)))
-
-        def unsafe_coalesce(*xs):
-            def to_uint8(x):
-                x_aligned_bytes = ((x.nbytes + gAlignment - 1) // gAlignment) \
-                    * gAlignment
-                x_aligned = np.zeros(
-                    shape=(x_aligned_bytes // x.dtype.itemsize, ),
-                    dtype=x.dtype)
-                x_aligned[:x.size] = x
-                x_cast = np.fromstring(x_aligned.tobytes(), dtype='<u1')
-                return x_cast
-            flat = [to_uint8(x) for x in xs]
-            coalesced = np.concatenate(flat)
-            return list(xs) + [coalesced]
-        self.assertReferenceChecks(gc, op, Xs, unsafe_coalesce)
-
     @given(inp=_dtypes().flatmap(lambda dt: _tensor_and_indices(
         elements=st.floats(min_value=0, max_value=1), dtype=dt)),
            **hu.gcs)
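
As an aside on the removed reference implementation: np.fromstring is deprecated in current NumPy in favor of np.frombuffer. A sketch of the removed to_uint8 helper in present-day NumPy (not part of the commit):

import numpy as np

def to_uint8(x, alignment=64):
    # pad x out to the next multiple of `alignment` bytes, then
    # reinterpret the padded buffer as little-endian uint8
    aligned_bytes = ((x.nbytes + alignment - 1) // alignment) * alignment
    x_aligned = np.zeros(aligned_bytes // x.dtype.itemsize, dtype=x.dtype)
    x_aligned[:x.size] = x
    return np.frombuffer(x_aligned.tobytes(), dtype='<u1')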
