
Commit aebf3b4

jerryzh168 authored and facebook-github-bot committed
Remove template parameter from Tensor (pytorch#9939)
Summary:
Pull Request resolved: pytorch#9939
Pull Request resolved: https://github.com/facebookresearch/weakly-supervised-action-detection/pull/13
Pull Request resolved: pytorch/translate#166
Pull Request resolved: pytorch#9125
Closes pytorch#9125

Use inheritance for polymorphism, and remove the template parameter. This diff changes the templating at call sites; the core implementations will change later.

Before, the Caffe2 Tensor class was fixed at compile time to bind to a particular device/context. With this change, the device becomes a runtime property (stored inside the tensor), but the semantics are preserved. For example, one still has to specify a device type in order to create a Tensor - there are no uninitialized tensors.

More specifically, the changes are:

1. An extra *DeviceType* argument is added to most of the Tensor constructors, e.g. Tensor(DeviceType type).
2. The semantics of the constructor Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) have changed. The second context is passed in to let us call the templated Copy function. Previously it could be for a different device than the source and target; now, if it is provided, we enforce that it has the same device type as src.
3. To preserve the get-or-construct semantics of Blob, a specialized getter Blob::GetMutableTensor is added that verifies both that the Blob contains a Tensor and that the Tensor has the correct device type.
4. Tensor is no longer default-constructible (as there are no unknown-device tensors), so some of the code handling STL containers needs to change.

Note: Some changes are postponed just to keep this diff a bit smaller. Please see `TODO`s.

Reviewed By: ezyang, houseroad

Differential Revision: D9024330

fbshipit-source-id: e0b8295d2dc6ebe2963383ded5af799ad17164ba
1 parent 94439d7 commit aebf3b4
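Editor's note: to make the call-site migration concrete, here is a minimal sketch of the old pattern against the new one, assuming this commit's headers and API; the function `migration_sketch` and its blob argument are hypothetical, and the pre-commit lines are kept as comments.

#include "caffe2/core/blob.h"
#include "caffe2/core/tensor.h"

using namespace caffe2;

void migration_sketch(Blob* blob) {
  // Before: device bound at compile time by the template parameter.
  //   Tensor<CPUContext> t;                        // was default-constructible
  //   auto* tp = blob->GetMutable<TensorCPU>();    // old blob getter

  // After: device is a runtime property and must be supplied.
  Tensor t(CPU);                                    // no default constructor any more
  Tensor t_cuda(vector<TIndex>{1, 2, 3, 4}, CUDA);  // dims plus device
  auto* tp = blob->GetMutableTensor(CPU);           // get-or-construct with device check
  tp->Resize(32, 32);
}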

File tree

365 files changed: +4187 -3515 lines


binaries/benchmark_helper.cc

+3 -3

@@ -160,7 +160,7 @@ void loadInput(
       CAFFE_THROW("Not support GPU on mobile.");
 #endif
     } else {
-      caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+      caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
       CHECK_NOTNULL(tensor);
       tensor->Resize(input_dims);
       if (input_type_list[i] == "uint8_t") {
@@ -197,7 +197,7 @@ void fillInputBlob(
     int protos_size = tensor_kv.second.protos_size();
     caffe2::TensorProto* tensor_proto =
         tensor_kv.second.mutable_protos(iteration % protos_size);
-    caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
     tensor->Resize(std::vector<caffe2::TIndex>());
     if (tensor_proto->data_type() == caffe2::TensorProto::STRING) {
       (tensor->mutable_data<std::string>())[0] = tensor_proto->string_data(0);
@@ -290,7 +290,7 @@ void writeOutput(
 #endif
   } else {
     writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
-        workspace->GetBlob(name)->GetMutable<caffe2::TensorCPU>(),
+        workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU),
         output_prefix,
         name);
   }

binaries/benchmark_helper.h

+1 -1

@@ -35,7 +35,7 @@ void writeTextOutput(
     const string& output_prefix,
     const string& name) {
   string output_name = output_prefix + "/" + name + ".txt";
-  caffe2::TensorSerializer<ContextType> ser;
+  caffe2::TensorSerializer ser;
   caffe2::BlobProto blob_proto;
   ser.Serialize(
       *tensor, output_name, blob_proto.mutable_tensor(), 0, tensor->size());

binaries/core_overhead_benchmark.cc

+3 -3

@@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);

 static void BM_CudaPointerAffinity(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
+  Tensor tensor(vector<TIndex>{1, 2, 3, 4}, CUDA);
   float* ptr = tensor.mutable_data<float>();
   while (state.KeepRunning()) {
     volatile int id = GetGPUIDForPointer(ptr);
@@ -198,7 +198,7 @@ static void BM_RawAllocDeallocCPU(benchmark::State& state) {
 BENCHMARK(BM_RawAllocDeallocCPU);

 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
-  Tensor<CPUContext> tensor;
+  Tensor tensor(CPU);
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {
@@ -210,7 +210,7 @@ BENCHMARK(BM_TensorAllocDeallocCPU);

 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
   CAFFE2_SKIP_IF_NO_GPU;
-  Tensor<CUDAContext> tensor;
+  Tensor tensor(CUDA);
   // small allocation
   tensor.Resize(32, 32);
   while (state.KeepRunning()) {

binaries/print_core_object_sizes.cc

+1 -2

@@ -28,8 +28,7 @@

 int main(int /* unused */, char** /* unused */) {
   PRINT_SIZE(caffe2::Blob);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CPUContext>);
-  PRINT_SIZE(caffe2::Tensor<caffe2::CUDAContext>);
+  PRINT_SIZE(caffe2::Tensor);
   PRINT_SIZE(caffe2::CPUContext);
   PRINT_SIZE(caffe2::CUDAContext);
   PRINT_SIZE(caffe2::OperatorBase);

binaries/speed_benchmark.cc

+1 -1

@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
     if (blob == nullptr) {
       blob = workspace->CreateBlob(input_names[i]);
     }
-    caffe2::TensorCPU* tensor = blob->GetMutable<caffe2::TensorCPU>();
+    caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU);
     CHECK_NOTNULL(tensor);
     tensor->Resize(input_dims);
     if (input_type_list[i] == "uint8_t") {

caffe2/contrib/aten/aten_op_template.h

+7 -7

@@ -54,11 +54,11 @@ class ATenOp : public Operator<Context> {
 #undef DEFINE_CASE
   }

-  at::Type & typeFor(const Tensor<Context> & ten) {
+  at::Type& typeFor(const Tensor& ten) {
     return at::getType(backend(), atScalarTypeFor(ten.meta()));
   }
-  at::Tensor tensorWrapping(const Tensor<Context>& ten_) {
-    auto& ten = const_cast<Tensor<Context>&>(ten_);
+  at::Tensor tensorWrapping(const Tensor& ten_) {
+    auto& ten = const_cast<Tensor&>(ten_);
     return typeFor(ten).tensorFromBlob(ten.raw_mutable_data(), ten.dims());
   }

@@ -88,7 +88,7 @@ class ATenOp : public Operator<Context> {
    }
    CAFFE_THROW("Unknown type meta"); // TODO: improve error message...
  }
-  void assignTo(Tensor<Context> * dst, const at::Tensor & src_) {
+  void assignTo(Tensor* dst, const at::Tensor& src_) {
    at::Tensor src = src_.contiguous();
    auto at_sizes = src.sizes();
    std::vector<int64_t> dims(at_sizes.begin(),at_sizes.end());
@@ -121,7 +121,7 @@ class ATenOp : public Operator<Context> {
     return s.toLong();
   }

-  void assignTo(Tensor<Context> * dst, at::Type & inferred_type, at::Scalar scalar) {
+  void assignTo(Tensor* dst, at::Type& inferred_type, at::Scalar scalar) {
     switch(inferred_type.scalarType()) {
 #define DEFINE_CASE(ctype,aten_name,native) \
       case at::k##aten_name: { \
@@ -134,8 +134,8 @@ class ATenOp : public Operator<Context> {
       CAFFE_THROW("Unknown ATen Type");
     }
   }
-  template<typename T>
-  void assignToValue(Tensor<Context> * dst, T v) {
+  template <typename T>
+  void assignToValue(Tensor* dst, T v) {
     dst->Resize(std::vector<TIndex>());
     math::Set(1, v, dst->template mutable_data<T>(), &context_);
   }

caffe2/contrib/gloo/common.cc

+1 -1

@@ -12,7 +12,7 @@ namespace caffe2 {
 namespace gloo {

 void signalFailure(Blob* status_blob, std::exception& /* unused */) {
-  auto* res = status_blob->GetMutable<TensorCPU>();
+  auto* res = status_blob->GetMutableTensor(CPU);
   res->Resize(1);
   res->template mutable_data<int32_t>()[0] = 1;
 }

caffe2/contrib/nccl/cuda_nccl_op_gpu.cc

+5 -5

@@ -17,17 +17,17 @@ nccl::NCCLExecution getNCCLElements(
   ex.elements.resize(op->InputSize());
   for (auto i = 0; i < op->InputSize(); ++i) {
     auto& el = ex.elements[i];
-    el.src = &(op->Input<TensorCUDA>(i));
+    el.src = &(op->Input<Tensor>(i, CUDA));
     if (op->OutputSize() == 1) {
       // Reduce op
       if (i == ex.root) {
-        el.dst = op->Output<TensorCUDA>(0);
+        el.dst = op->Output<Tensor>(0, CUDA);
       }
     } else if (i < op->OutputSize()) {
-      el.dst = op->Output<TensorCUDA>(i);
+      el.dst = op->Output<Tensor>(i, CUDA);
     }
     // TODO - expensive (>1ms) - cache these.
-    el.device = GetGPUIDForPointer(op->Input<TensorCUDA>(i).raw_data());
+    el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
   }

   return ex;
@@ -38,7 +38,7 @@ namespace {
 template <typename T>
 bool AllInputsAre(OperatorBase* op) {
   for (auto i = 0; i < op->InputSize(); ++i) {
-    if (op->Input<TensorCUDA>(i).IsType<T>()) {
+    if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
       continue;
     } else {
       return false;

caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc

+3 -3

@@ -22,7 +22,7 @@ static void AddConstInput(const std::vector<int>& shape, const float value,
   option.set_device_type(CUDA);
   CUDAContext context(option);
   Blob* blob = ws->CreateBlob(name);
-  auto* tensor = blob->GetMutable<Tensor<CUDAContext>>();
+  auto* tensor = blob->GetMutableTensor(CUDA);
   tensor->Resize(shape);
   math::Set<float, CUDAContext>(tensor->size(), value,
                                 tensor->mutable_data<float>(),
@@ -54,8 +54,8 @@ TEST(NervanaFullyConnectedTest, Test) {
   EXPECT_TRUE(op->Run());
   Blob* Yblob = ws.GetBlob("Y");
   EXPECT_NE(nullptr, Yblob);
-  auto& Y = Yblob->Get<Tensor<CUDAContext>>();
-  TensorCPU Y_cpu(Y);
+  auto& Y = Yblob->Get<Tensor>();
+  Tensor Y_cpu(Y, CPU);
  EXPECT_EQ(Y.size(), 5 * 6);
   for (int i = 0; i < Y.size(); ++i) {
     CHECK_LT(Y_cpu.data<float>()[i], 10.11);
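Editor's note: the test above also shows the new cross-device copy construction: `Tensor Y_cpu(Y, CPU)` replaces `TensorCPU Y_cpu(Y)`, so the destination device is now passed at runtime instead of being encoded in the TensorCPU type. A hedged sketch, assuming this commit's API; the function and tensor names are hypothetical.

#include "caffe2/core/tensor.h"

using namespace caffe2;

void copy_sketch() {
  Tensor src(vector<TIndex>{2, 3}, CUDA); // device passed at runtime
  src.mutable_data<float>();              // allocate storage on the GPU
  // Before: TensorCPU dst(src);          // destination device in the type
  Tensor dst(src, CPU);                   // destination device as an argument
}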

caffe2/contrib/warpctc/ctc_op.h

+7 -7

@@ -47,26 +47,26 @@ class CTCOp final : public Operator<Context> {
     const auto& inputs = Input(INPUTS);
     const auto minibatchSize = inputs.dim(1);
     const auto alphabetSize = inputs.dim(2);
-    const auto& labels = OperatorBase::template Input<TensorCPU>(LABELS);
+    const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
     const auto& labelLengths =
-        OperatorBase::template Input<TensorCPU>(LABEL_LENGTHS);
+        OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
     const auto& inputLengths =
-        OperatorBase::template Input<TensorCPU>(INPUT_LENGTHS);
+        OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);

     // outputs
-    Tensor<Context>* gradients = nullptr;
+    Tensor* gradients = nullptr;
     TensorCPU* costs;
-    Tensor<Context>* workspace;
+    Tensor* workspace;
     if (!is_test_) {
       // [grads, costs, workspace] to maintain backward compatibility
       gradients = Output(0);
       gradients->ResizeLike(inputs);
-      costs = OperatorBase::template Output<TensorCPU>(1);
+      costs = OperatorBase::template Output<Tensor>(1, CPU);
       costs->ResizeLike(labelLengths);
       workspace = Output(2);
     } else {
       // [costs, workspace]
-      costs = OperatorBase::template Output<TensorCPU>(0);
+      costs = OperatorBase::template Output<Tensor>(0, CPU);
       costs->ResizeLike(labelLengths);
       workspace = Output(1);
     }
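Editor's note: the accessor migration above recurs in most operator code in this commit: Input/Output calls templated on TensorCPU or TensorCUDA become Input<Tensor>/Output<Tensor> with an explicit DeviceType argument. A hedged sketch inside a hypothetical operator; `ExampleOp` and its input/output layout are made up for illustration.

#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical operator, for illustration only.
template <class Context>
class ExampleOp final : public Operator<Context> {
 public:
  using Operator<Context>::Operator;
  bool RunOnDevice() override {
    // Before: OperatorBase::template Input<TensorCPU>(0)
    const auto& in = OperatorBase::template Input<Tensor>(0, CPU);
    // Before: OperatorBase::template Output<TensorCPU>(0)
    auto* out = OperatorBase::template Output<Tensor>(0, CPU);
    out->ResizeLike(in);
    return true;
  }
};

} // namespace caffe2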

caffe2/core/allocator.cc

+1 -1

@@ -26,7 +26,7 @@ void SetCPUAllocator(CPUAllocator* alloc) {
   g_cpu_allocator.reset(alloc);
 }

-MemoryAllocationReporter CPUContext::reporter_;
+MemoryAllocationReporter CPUStaticContext::reporter_;

 void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
   std::lock_guard<std::mutex> guard(mutex_);

caffe2/core/blob.h

+28 -1

@@ -9,8 +9,9 @@

 #include "caffe2/core/blob_serializer_base.h"
 #include "caffe2/core/common.h"
-#include "caffe2/core/typeid.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/tensor.h"
+#include "caffe2/core/typeid.h"
 #include "caffe2/proto/caffe2.pb.h"

 namespace caffe2 {
@@ -60,6 +61,20 @@ class Blob {
   template <class T>
   bool IsType() const { return meta_.Match<T>(); }

+  // TODO(jerryzh): Remove template
+  template <class T>
+  bool IsType(DeviceType device_type) const {
+    static_assert(
+        std::is_same<T, Tensor>::value,
+        "IsType(DeviceType) only available on "
+        "Tensor types.");
+    auto* tensor = static_cast<Tensor*>(pointer_);
+    if (tensor && tensor->GetDeviceType() == device_type) {
+      return true;
+    }
+    return false;
+  }
+
   /**
    * Returns the meta info of the blob.
    */
@@ -74,6 +89,7 @@ class Blob {
    * @brief Gets the const reference of the stored object. The code checks if
    * the stored object is of the desired type.
    */
+  // TODO(jerryzh): add a Get(DeviceType) function?
   template <class T>
   const T& Get() const {
     CAFFE_ENFORCE(
@@ -123,6 +139,17 @@ class Blob {
     }
   }

+  inline Tensor* GetMutableTensor(DeviceType device_type) {
+    if (IsType<Tensor>() &&
+        static_cast<Tensor*>(pointer_)->GetDeviceType() == device_type) {
+      return static_cast<Tensor*>(pointer_);
+    } else {
+      VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
+              << " DeviceType:" << device_type;
+      return Reset<Tensor>(new Tensor(device_type));
+    }
+  }
+
   /**
    * Sets the underlying object to the allocated one. The Blob then takes over
    * the ownership of the passed in pointer. If there is already an object in
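Editor's note: taken together, the two additions above give Blob a device-aware get-or-construct path. A minimal usage sketch, assuming this commit's API; the workspace and blob name are hypothetical.

#include "caffe2/core/workspace.h"

using namespace caffe2;

void blob_api_sketch(Workspace* ws) {
  Blob* blob = ws->CreateBlob("example");

  // Get-or-construct: returns the stored Tensor if it already lives on CPU,
  // otherwise resets the blob to a fresh Tensor(CPU).
  Tensor* t = blob->GetMutableTensor(CPU);
  t->Resize(4, 4);
  t->mutable_data<float>();

  // The device-aware type check added in this commit.
  bool is_cpu_tensor = blob->IsType<Tensor>(CPU);   // true here
  bool is_cuda_tensor = blob->IsType<Tensor>(CUDA); // false here
}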
