Skip to content

Commit 859ce5e

Browse files
TimotheeMathieu and rth authored
Enhance Huber robust mean estimator (#121)
* add stopping criterion and add test for huber * add c is None iqr heuristic and correct heuristic in robust_weighted_estimator * Apply suggestions from code review Co-authored-by: Roman Yurchak <[email protected]> * change name c and update doc * change forgotten name c * Revert "change forgotten name c" This reverts commit c5a59ac. * Revert "change name c and update doc" This reverts commit 8dd0cf9. * change c_ to c_numeric * add chegelog Co-authored-by: Roman Yurchak <[email protected]>
1 parent 75dbaa4 commit 859ce5e

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

doc/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ Changelog
44
Unreleased
55
----------
66

7+
- Add a stopping criterion and parameter tuning heuristic for Huber robust mean
8+
estimator.
79
- Add `CLARA` (Clustering for Large Applications) which extends k-medoids to
810
be more scalable using a sampling approach.
911
[`#83 <https://github.com/scikit-learn-contrib/scikit-learn-extra/pull/83>`_].

sklearn_extra/robust/mean_estimators.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def median_of_means(X, k, random_state=np.random.RandomState(42)):
8888
return median_of_means_blocked(x, blocks)[0]
8989

9090

91-
def huber(X, c=1.35, T=20):
91+
def huber(X, c=None, T=20, tol=1e-3):
9292
"""Compute the Huber estimator of location of X with parameter c
9393
9494
Parameters
@@ -97,14 +97,19 @@ def huber(X, c=1.35, T=20):
9797
X : array like, length = n_sample
9898
sample from which we want an estimator of the mean
9999
100-
c : float >0, default = 1.35
100+
c : float >0, default = None
101101
parameter that control the robustness of the estimator.
102102
c going to zero gives a behavior close to the median.
103103
c going to infinity gives a behavior close to sample mean.
104+
if c is None, the interquartile range (IQR) is used
105+
as heuristic.
104106
105107
T : int, default = 20
106108
Number of iterations of the algorithm.
107109
110+
tol : float, default=1e-3
111+
Tolerance on stopping criterion.
112+
108113
Return
109114
------
110115
@@ -116,23 +121,38 @@ def huber(X, c=1.35, T=20):
116121
# Initialize the algorithm with a robust first-guess : the median.
117122
mu = np.median(x)
118123

124+
if c is None:
125+
c_numeric = iqr(x)
126+
else:
127+
c_numeric = c
128+
119129
def psisx(x, c):
120130
# Huber weight function.
121131
res = np.zeros(len(x))
122-
mask = np.abs(x) <= c
132+
mask = np.abs(x) <= c_numeric
123133
res[mask] = 1
124-
res[~mask] = c / np.abs(x[~mask])
134+
res[~mask] = c_numeric / np.abs(x[~mask])
125135
return res
126136

137+
# Create a list to keep the ten last values of mu
138+
last_mu = mu
139+
127140
# Run the iterative reweighting algorithm to compute M-estimator.
128141
for t in range(T):
129142
# Compute the weights
130-
w = psisx(x - mu, c)
143+
w = psisx(x - mu, c_numeric)
131144

132145
# Infinite coordinates in x gives zero weight, we take them out.
133146
ind_pos = w > 0
134147

135148
# Update the value of the estimate with the new estimate using the
136149
# new weights.
137150
mu = np.sum(np.array(w[ind_pos]) * x[ind_pos]) / np.sum(w[ind_pos])
151+
152+
# Stopping criterion. The error is decreasing at each iteration
153+
if np.abs(mu - last_mu) < tol:
154+
break
155+
else:
156+
last_mu = mu
157+
138158
return mu

sklearn_extra/robust/robust_weighted_estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def _get_weights(self, loss_values, random_state):
460460
if self.weighting == "huber":
461461
if self.c is None:
462462
# If no c parameter given, estimate using inter quartile range.
463-
c = iqr(np.abs(loss_values - np.median(loss_values))) / 2
463+
c = iqr(loss_values) / 2
464464
if c == 0:
465465
warnings.warn(
466466
"Too many samples are parfectly predicted "

sklearn_extra/robust/tests/test_mean_estimators.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,6 @@ def test_mom():
2727
def test_huber():
2828
X = np.hstack([np.zeros(90), np.ones(10)])
2929
with pytest.warns(None) as record:
30-
huber(X)
30+
mu = huber(X, c=0.5)
3131
assert len(record) == 0
32+
assert np.abs(mu) < 0.1

0 commit comments

Comments
 (0)