[MRG] MNT Fixes for PCA with n_components='mle' (scikit-learn#16841)

NicolasHug · web-flow · commit a655de515e24 · 2020-04-07T22:12:40.000+02:00
* Fixed off by one in MLE and better handling of small eigenvalues

* light update tests

* pep8

* Added test + threshold on small log
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
@@ -139,10 +139,12 @@ Changelog
 - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will
    exclusively choose the components that explain the variance greater than
    `n_components`. :pr:`15669` by :user:`Krishna Chaitanya <krishnachaitanya9>`
-- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small
-   eigenvalues. :pr: `4441` by :user:`Lisa Schwetlick <lschwetlick>`, and
-   :user:`Gelavizh Ahmadi <gelavizh1>` and
-   :user:`Marija Vlajic Wheeler <marijavlajic>`.
+
+- |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly
+  handles small eigenvalues, and does not infer 0 as the correct number of
+  components. :pr:`4441` by :user:`Lisa Schwetlick <lschwetlick>`, and
+  :user:`Gelavizh Ahmadi <gelavizh1>` and :user:`Marija Vlajic Wheeler
+  <marijavlajic>` and :pr:`16841` by `Nicolas Hug`_.
 
 - |Enhancement| :class:`decomposition.NMF` and
   :func:`decomposition.non_negative_factorization` now preserves float32 dtype.
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
@@ -28,22 +28,22 @@
 from ..utils.validation import _deprecate_positional_args
 
 
-def _assess_dimension(spectrum, rank, n_samples, n_features):
-    """Compute the likelihood of a rank ``rank`` dataset.
+def _assess_dimension(spectrum, rank, n_samples):
+    """Compute the log-likelihood of a rank ``rank`` dataset.
 
     The dataset is assumed to be embedded in gaussian noise of shape(n,
     dimf) having spectrum ``spectrum``.
 
     Parameters
     ----------
-    spectrum : array of shape (n)
+    spectrum : array of shape (n_features)
         Data spectrum.
     rank : int
-        Tested rank value.
+        Tested rank value. It should be strictly lower than n_features,
+        otherwise the method isn't specified (division by zero in equation
+        (31) from the paper).
     n_samples : int
         Number of samples.
-    n_features : int
-        Number of features.
 
     Returns
     -------
@@ -55,45 +55,39 @@ def _assess_dimension(spectrum, rank, n_samples, n_features):
     This implements the method of `Thomas P. Minka:
     Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`
     """
-    if rank > len(spectrum):
-        raise ValueError("The tested rank cannot exceed the rank of the"
-                         " dataset")
 
-    spectrum_threshold = np.finfo(type(spectrum[0])).eps
+    n_features = spectrum.shape[0]
+    if not 1 <= rank < n_features:
+        raise ValueError("the tested rank should be in [1, n_features - 1]")
+
+    eps = 1e-15
+
+    if spectrum[rank - 1] < eps:
+        # When the tested rank is associated with a small eigenvalue, there's
+        # no point in computing the log-likelihood: it's going to be very
+        # small and won't be the max anyway. Also, it can lead to numerical
+        # issues below when computing pa, in particular in log(spectrum[i] -
+        # spectrum[j]) because this will take the log of something very small.
+        return -np.inf
 
     pu = -rank * log(2.)
-    for i in range(rank):
-        pu += (gammaln((n_features - i) / 2.) -
-               log(np.pi) * (n_features - i) / 2.)
+    for i in range(1, rank + 1):
+        pu += (gammaln((n_features - i + 1) / 2.) -
+               log(np.pi) * (n_features - i + 1) / 2.)
 
     pl = np.sum(np.log(spectrum[:rank]))
     pl = -pl * n_samples / 2.
 
-    if rank == n_features:
-        # TODO: this line is never executed because _infer_dimension's
-        # for loop is off by one
-        pv = 0
-        v = 1
-    else:
-        v = np.sum(spectrum[rank:]) / (n_features - rank)
-        if spectrum_threshold > v:
-            return -np.inf
-        pv = -np.log(v) * n_samples * (n_features - rank) / 2.
+    v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))
+    pv = -np.log(v) * n_samples * (n_features - rank) / 2.
 
     m = n_features * rank - rank * (rank + 1.) / 2.
-    pp = log(2. * np.pi) * (m + rank + 1.) / 2.
+    pp = log(2. * np.pi) * (m + rank) / 2.
 
     pa = 0.
     spectrum_ = spectrum.copy()
     spectrum_[rank:n_features] = v
     for i in range(rank):
-        if spectrum_[i] < spectrum_threshold:
-            # TODO: this line is never executed
-            # (off by one in _infer_dimension)
-            # this break only happens when rank == n_features and
-            # spectrum_[i] < spectrum_threshold, otherwise the early return
-            # above catches this case.
-            break
         for j in range(i + 1, len(spectrum)):
             pa += log((spectrum[i] - spectrum[j]) *
                       (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
@@ -103,15 +97,15 @@ def _assess_dimension(spectrum, rank, n_samples, n_features):
     return ll
 
 
-def _infer_dimension(spectrum, n_samples, n_features):
-    """Infers the dimension of a dataset of shape (n_samples, n_features)
+def _infer_dimension(spectrum, n_samples):
+    """Infers the dimension of a dataset with a given spectrum.
 
-    The dataset is described by its spectrum `spectrum`.
+    The returned value will be in [1, n_features - 1].
     """
-    n_spectrum = len(spectrum)
-    ll = np.empty(n_spectrum)
-    for rank in range(n_spectrum):
-        ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features)
+    ll = np.empty_like(spectrum)
+    ll[0] = -np.inf  # we don't want to return n_components = 0
+    for rank in range(1, spectrum.shape[0]):
+        ll[rank] = _assess_dimension(spectrum, rank, n_samples)
     return ll.argmax()
 
 
@@ -472,7 +466,7 @@ def _fit_full(self, X, n_components):
         # Postprocess the number of components required
         if n_components == 'mle':
             n_components = \
-                _infer_dimension(explained_variance_, n_samples, n_features)
+                _infer_dimension(explained_variance_, n_samples)
         elif 0 < n_components < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
@@ -295,7 +295,7 @@ def test_n_components_mle(svd_solver):
     X = rng.randn(n_samples, n_features)
     pca = PCA(n_components='mle', svd_solver=svd_solver)
     pca.fit(X)
-    assert pca.n_components_ == 0
+    assert pca.n_components_ == 1
 
 
 @pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
@@ -333,7 +333,7 @@ def test_infer_dim_1():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)])
+    ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)])
     assert ll[1] > ll.max() - .01 * n
 
 
@@ -348,7 +348,7 @@ def test_infer_dim_2():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension(spect, n, p) > 1
+    assert _infer_dimension(spect, n) > 1
 
 
 def test_infer_dim_3():
@@ -361,7 +361,7 @@ def test_infer_dim_3():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension(spect, n, p) > 2
+    assert _infer_dimension(spect, n) > 2
 
 
 @pytest.mark.parametrize(
@@ -570,51 +570,43 @@ def test_pca_n_components_mostly_explained_variance_ratio():
     assert pca2.n_components_ == X.shape[1]
 
 
-def test_infer_dim_bad_spec():
-    # Test a spectrum that drops to near zero for PR #16224
+def test_assess_dimension_bad_rank():
+    # Test error when tested rank not in [1, n_features - 1]
     spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
     n_samples = 10
-    n_features = 5
-    ret = _infer_dimension(spectrum, n_samples, n_features)
-    assert ret == 0
+    for rank in (0, 5):
+        with pytest.raises(ValueError,
+                           match=r"should be in \[1, n_features - 1\]"):
+            _assess_dimension(spectrum, rank, n_samples)
 
 
-def test_assess_dimension_error_rank_greater_than_features():
-    # Test error when tested rank is greater than the number of features
-    # for PR #16224
+def test_small_eigenvalues_mle():
+    # Test that ranks associated with tiny eigenvalues are given a
+    # log-likelihood of -inf. The inferred rank will be 1
     spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
-    n_samples = 10
-    n_features = 4
-    rank = 5
-    with pytest.raises(ValueError, match="The tested rank cannot exceed "
-                                         "the rank of the dataset"):
-        _assess_dimension(spectrum, rank, n_samples, n_features)
 
+    assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf
 
-def test_assess_dimension_small_eigenvalues():
-    # Test tiny eigenvalues appropriately when using 'mle'
-    # for  PR #16224
-    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
-    n_samples = 10
-    n_features = 5
-    rank = 3
-    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
-    assert ret == -np.inf
+    for rank in (2, 3):
+        assert _assess_dimension(spectrum, rank, 10) == -np.inf
+
+    assert _infer_dimension(spectrum, 10) == 1
 
 
-def test_infer_dim_mle():
-    # Test small eigenvalues when 'mle' with pathological 'X' dataset
-    # for PR #16224
-    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+def test_mle_redundant_data():
+    # Test 'mle' with pathological X: only one relevant feature should give a
+    # rank of 1
+    X, _ = datasets.make_classification(n_features=20,
+                                        n_informative=1, n_repeated=18,
                                         n_redundant=1, n_clusters_per_class=1,
                                         random_state=42)
     pca = PCA(n_components='mle').fit(X)
-    assert pca.n_components_ == 0
+    assert pca.n_components_ == 1
 
 
 def test_fit_mle_too_few_samples():
     # Tests that an error is raised when the number of samples is smaller
-    # than the number of features during an mle fit for PR #16224
+    # than the number of features during an mle fit
     X, _ = datasets.make_classification(n_samples=20, n_features=21,
                                         random_state=42)
 
@@ -623,3 +615,26 @@ def test_fit_mle_too_few_samples():
                                          "supported if "
                                          "n_samples >= n_features"):
         pca.fit(X)
+
+
+def test_mle_simple_case():
+    # non-regression test for issue
+    # https://github.com/scikit-learn/scikit-learn/issues/16730
+    n_samples, n_dim = 1000, 10
+    X = np.random.RandomState(0).randn(n_samples, n_dim)
+    X[:, -1] = np.mean(X[:, :-1], axis=-1)  # true X dim is ndim - 1
+    pca_skl = PCA('mle', svd_solver='full')
+    pca_skl.fit(X)
+    assert pca_skl.n_components_ == n_dim - 1
+
+
+def test_assess_dimesion_rank_one():
+    # Make sure assess_dimension works properly on a matrix of rank 1
+    n_samples, n_features = 9, 6
+    X = np.ones((n_samples, n_features))  # rank 1 matrix
+    _, s, _ = np.linalg.svd(X, full_matrices=True)
+    assert sum(s[1:]) == 0  # except for rank 1, all eigenvalues are 0
+
+    assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples))
+    for rank in range(2, n_features):
+        assert _assess_dimension(s, rank, n_samples) == -np.inf