
Commit 2d38d37

r-barnes authored and facebook-github-bot committed
use irange for loops (pytorch#69533)
Summary:
Pull Request resolved: pytorch#69533

Modified loops in files under fbsource/fbcode/caffe2/ from the format

```
for (TYPE var = x0; var < x_max; x++)
```

to the format

```
for (const auto var : irange(xmax))
```

This was achieved by running r-barnes's loop upgrader script (D28874212), with some modifications to exclude all files under /torch/jit, plus a number of reversions and unused-variable warning suppressions added by hand.

Test Plan: Sandcastle

Reviewed By: malfet

Differential Revision: D32837942

fbshipit-source-id: 8663037a38ade8f81bd5e983a614d197ea11f0d1
1 parent 8a975c0 commit 2d38d37

14 files changed: +75 −71 lines
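For readers unfamiliar with `c10::irange`, here is a minimal sketch of the post-change loop shape; the helper function, buffer, and values are hypothetical and not part of this commit. The point of the rewrite is that the loop index is `const` and its type is deduced from the loop bound, so an `int64_t` or `size_t` bound is no longer compared against a plain `int` index.

```cpp
#include <c10/util/irange.h>

#include <cstdint>
#include <vector>

// Hypothetical helper, for illustration only (not part of the diffs below).
void scale_in_place(std::vector<float>& data, float alpha) {
  const int64_t N = static_cast<int64_t>(data.size());
  // Pre-commit style:  for (auto i = 0; i < N; ++i) { ... }
  //   - `i` is deduced as `int` and compared against an `int64_t` bound.
  // Post-commit style: `i` is a `const int64_t`, matching the bound's type.
  for (const auto i : c10::irange(N)) {
    data[i] *= alpha;
  }
}
```

Calling `scale_in_place(v, 2.0f)` doubles every element of `v`; only the loop header differs from the pre-commit form, which is why the upgrader script could apply the change mechanically across the files listed below.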

Diff for: caffe2/sgd/adagrad_op.h

+6 −6

@@ -39,7 +39,7 @@ void adagrad_update_output_effective_lr(
     const float* lr,
     Context* /*context*/,
     float weight_decay = 0.f) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = std::fma(weight_decay, paramIn[i], gradIn[i]);
     float moment = momentOut[i] = decay * momentIn[i] + grad * grad;
     float effective_lr = effectiveLROut[i] =
@@ -63,7 +63,7 @@ void adagrad_update_output_effective_lr_and_update(
     const float* lr,
     Context* /*context*/,
     float weight_decay = 0.f) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = std::fma(weight_decay, paramIn[i], gradIn[i]);
     float moment = momentOut[i] = decay * momentIn[i] + grad * grad;
     float effective_lr = effectiveLROut[i] =
@@ -300,7 +300,7 @@ class SparseAdagradOp final : public Operator<CPUContext> {
     const auto* momentIn = Input(MOMENT_1).template data<float>();

     std::vector<float> grad(block_size);
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       auto offsetI = i * block_size;
       auto offsetIdx = idx * block_size;
@@ -504,7 +504,7 @@ class RowWiseSparseAdagradOp final : public Operator<Context> {
 #else
     VLOG(1) << "using plain adagrad updates in RowWiseSparseAdagradOp";

-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       float freq = (counter_halflife_ > 0 && count[idx] > 0)
           ? counter_halflife_ / count[idx]
@@ -542,13 +542,13 @@ class RowWiseSparseAdagradOp final : public Operator<Context> {
       const float* g = gradIn + offsetI;
       float* h = moment + idx;
       float hs = 0.;
-      for (auto j = 0; j < block_size; ++j) {
+      for (const auto j : c10::irange(block_size)) {
        float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
        hs += gj * gj;
      }
      float hi = h[0] = h[0] + hs / block_size;
      float step = lr[0] / (std::sqrt(hi) + epsilon_);
-      for (auto j = 0; j < block_size; ++j) {
+      for (const auto j : c10::irange(block_size)) {
        float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
        w[j] = w[j] + gj * step;
      }

Diff for: caffe2/sgd/adam_op.h

+16 −16

@@ -21,7 +21,7 @@ void adam_update(
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -45,7 +45,7 @@ void adam_compute(
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -74,7 +74,7 @@ void adam_compute_smart_decay(
     Context* /*context*/) {
   float k = (float)(t - lastSeenIn[0]);
   lastSeenOut[0] = t;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     // The number of steps since this param was last seen.
     // We don't need integer precision for k. Float is fine and it's faster to convert here.
@@ -107,7 +107,7 @@ void adam_compute_output_grad(
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -135,7 +135,7 @@ void radam_update(
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -169,7 +169,7 @@ void radam_compute(
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -204,7 +204,7 @@ void radam_compute_output_grad(
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -350,7 +350,7 @@ class SparseAdamOp final : public Operator<Context> {
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();

     if (OutputSize() == 3) {
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];

         if (block_size == 1) {
@@ -444,7 +444,7 @@ class SparseAdamOp final : public Operator<Context> {
     } else {
       Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
       auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];

         if (block_size == 1) {
@@ -593,7 +593,7 @@ class SmartDecaySparseAdamOp final : public Operator<Context> {
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();
     int64_t* lastSeenOut = Output(OUTPUT_LAST_SEEN)->template mutable_data<int64_t>();

-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       auto offsetI = i * block_size;
       auto offsetIdx = idx * block_size;
@@ -673,7 +673,7 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();

     if (OutputSize() == 3) {
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];

         if (block_size == 1) {
@@ -719,13 +719,13 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
         float* nm2 = moment2Out + idx;

         float m2_sum = 0.;
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float gj = g[j];
           m2_sum += gj * gj;
         }
         float vi = nm2[0] =
             m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_);
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_);
           nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_);
         }
@@ -734,7 +734,7 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
     } else {
       Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
       auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];

         if (block_size == 1) {
@@ -781,13 +781,13 @@ class RowWiseSparseAdamOp final : public Operator<Context> {
         float* ng = gradOut + offsetI;

         float m2_sum = 0.;
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float gj = g[j];
           m2_sum += gj * gj;
         }
         float vi = nm2[0] =
             m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_);
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_);
           float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_);
           nw[j] = w[j] + lr[0] * ngi;

Diff for: caffe2/sgd/learning_rate_adaption_op.h

+1 −1

@@ -21,7 +21,7 @@ void lr_update(
   float x = 0;
   float y = 0, z = 0;
   const float kEps = 1e-12f;
-  for (auto i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     x += grad[i] * effgrad[i];
     if (normalized_lr_adaption) {
       y += grad[i] * grad[i];

Diff for: caffe2/sgd/learning_rate_op.h

+2 −1

@@ -5,6 +5,7 @@
 #include <cmath>
 #include "caffe2/core/context.h"
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/operator.h"
 #include "caffe2/sgd/learning_rate_functors.h"

@@ -162,7 +163,7 @@ class LearningRateOp final : public Operator<Context> {
         sub_policy_num_iters.size(),
         0,
         "Must specify at least one sub learning rate policy.");
-    for (size_t i = 0; i < sub_policy_num_iters.size(); ++i) {
+    for (const auto i : c10::irange(sub_policy_num_iters.size())) {
       CAFFE_ENFORCE_GT(
           sub_policy_num_iters[i],
           0,

Diff for: caffe2/sgd/momentum_sgd_op.h

+2 −2

@@ -17,7 +17,7 @@ void momentum_sgd_update(
     float* param,
     Context* /*context*/) {
   const float LR = lr[0];
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     if (!nesterov) {
       const float adjusted_gradient = LR * g[i] + momentum * m[i];
       nm[i] = adjusted_gradient;
@@ -154,7 +154,7 @@ class SparseMomentumSGDUpdateOp final : public Operator<Context> {
     auto* momentumOut = Output(OUTPUT_MOMENTUM)->template mutable_data<T>();
     auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<T>();

-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       auto offsetI = i * block_size;
       auto offsetIdx = idx * block_size;

Diff for: caffe2/sgd/rowwise_adagrad_fused.h

+8 −8

@@ -217,8 +217,8 @@ class RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final
     auto* grad_buffer_data =
         is_mean ? grad_buffer_.template mutable_data<T>() : NULL;
     if (is_mean) {
-      for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
-        for (auto tmpIndex = 0; tmpIndex < block_size; ++tmpIndex) {
+      for (const auto rangeIndex : c10::irange(numSegments)) {
+        for (const auto tmpIndex : c10::irange(block_size)) {
           auto offsetI = rangeIndex * block_size;
           grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0
               ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex]
@@ -269,7 +269,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final
       T counter_halflife,
       rowWiseAdagradT& kernel) {
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;

@@ -557,7 +557,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final
     // ignores this dependency and fuses these two loops.
     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -591,7 +591,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final
     CAFFE_ENFORCE_EQ(dataIndex, n);

     dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;

@@ -606,7 +606,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final
         auto offsetIdx = idx * block_size;
         auto localOffset = dataIndex - start;

-        for (int i = 0; i < block_size; ++i) {
+        for (const auto i : c10::irange(block_size)) {
           temp_grad[i] = auxParamIn[localOffset] * g[i];
         }

@@ -839,7 +839,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp

     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;

@@ -902,7 +902,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp

     alignas(64) float temp[VLEN];
     _mm256_store_ps(temp, acc_v);
-    for (int j = 0; j < VLEN; ++j) {
+    for (const auto j : c10::irange(VLEN)) {
       acc += temp[j];
     }
 #endif

Diff for: caffe2/sgd/rowwise_counter.h

+1 −1

@@ -40,7 +40,7 @@ class RowWiseCounterOp final : public Operator<CPUContext> {
       return true;
     }

-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
      const std::size_t idx = indices[i];
      CAFFE_ENFORCE_GE(
          Input(COUNTER).numel(),

Diff for: caffe2/sgd/storm_op.h

+5 −5

@@ -19,15 +19,15 @@ void storm_update(
     const float beta,
     Context* /*context*/) {
   float gradSqSumTmp = 0.0;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     const float gi = gradIn[i];
     gradSqSumTmp += gi * gi;
   }
   gradSqSumOut[0] = gradSqSumIn[0] + gradSqSumTmp;

   const float nlr = lr[0] * std::pow(beta + gradSqSumOut[0], -1.0 / 3.0);
   const float alpha = momentum * nlr * nlr;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     const float gi = gradIn[i];
     const float mi = momentIn[i];
     float new_mi = momentOut[i] = gi + (1.0 - alpha) * (mi - gi);
@@ -120,7 +120,7 @@ class SparseStormOp final : public Operator<Context> {
     }

     float gradSqSumTmp = 0.0;
-    for (auto i = 0; i < Input(GRAD).numel(); ++i) {
+    for (const auto i : c10::irange(Input(GRAD).numel())) {
       const float gi = gradIn[i];
       gradSqSumTmp += gi * gi;
     }
@@ -130,7 +130,7 @@ class SparseStormOp final : public Operator<Context> {
     const float alpha = momentum_ * nlr * nlr;
     const auto block_size = Input(GRAD).numel() / n;

-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       if (block_size == 1) {
         const float gi = gradIn[i];
@@ -162,7 +162,7 @@ class SparseStormOp final : public Operator<Context> {
             i);
 #endif

-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           const float gi = gradIn[offsetI + j];
           const float mi = momentIn[offsetIdx + j];
           float new_mi = momentOut[offsetIdx + j] =
