@@ -295,7 +295,7 @@ def test_n_components_mle(svd_solver):
     X = rng.randn(n_samples, n_features)
     pca = PCA(n_components='mle', svd_solver=svd_solver)
     pca.fit(X)
-    assert pca.n_components_ == 0
+    assert pca.n_components_ == 1


 @pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
@@ -333,7 +333,7 @@ def test_infer_dim_1():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)])
+    ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)])
     assert ll[1] > ll.max() - .01 * n

@@ -348,7 +348,7 @@ def test_infer_dim_2():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension(spect, n, p) > 1
+    assert _infer_dimension(spect, n) > 1


 def test_infer_dim_3():
@@ -361,7 +361,7 @@ def test_infer_dim_3():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension(spect, n, p) > 2
+    assert _infer_dimension(spect, n) > 2


 @pytest.mark.parametrize(
@@ -570,51 +570,43 @@ def test_pca_n_components_mostly_explained_variance_ratio():
     assert pca2.n_components_ == X.shape[1]


-def test_infer_dim_bad_spec():
-    # Test a spectrum that drops to near zero for PR #16224
+def test_assess_dimension_bad_rank():
+    # Test error when the tested rank is not in [1, n_features - 1]
     spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
     n_samples = 10
-    n_features = 5
-    ret = _infer_dimension(spectrum, n_samples, n_features)
-    assert ret == 0
+    for rank in (0, 5):
+        with pytest.raises(ValueError,
+                           match=r"should be in \[1, n_features - 1\]"):
+            _assess_dimension(spectrum, rank, n_samples)


-def test_assess_dimension_error_rank_greater_than_features():
-    # Test error when tested rank is greater than the number of features
-    # for PR #16224
+def test_small_eigenvalues_mle():
+    # Test that ranks associated with tiny eigenvalues are given a
+    # log-likelihood of -inf. The inferred rank will be 1
     spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
-    n_samples = 10
-    n_features = 4
-    rank = 5
-    with pytest.raises(ValueError, match="The tested rank cannot exceed "
-                                         "the rank of the dataset"):
-        _assess_dimension(spectrum, rank, n_samples, n_features)

+    assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf

-def test_assess_dimension_small_eigenvalues():
-    # Test tiny eigenvalues appropriately when using 'mle'
-    # for PR #16224
-    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
-    n_samples = 10
-    n_features = 5
-    rank = 3
-    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
-    assert ret == -np.inf
+    for rank in (2, 3):
+        assert _assess_dimension(spectrum, rank, 10) == -np.inf
+
+    assert _infer_dimension(spectrum, 10) == 1


-def test_infer_dim_mle():
-    # Test small eigenvalues when 'mle' with pathological 'X' dataset
-    # for PR #16224
-    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+def test_mle_redundant_data():
+    # Test 'mle' with pathological X: only one relevant feature should give a
+    # rank of 1
+    X, _ = datasets.make_classification(n_features=20,
+                                        n_informative=1, n_repeated=18,
                                         n_redundant=1, n_clusters_per_class=1,
                                         random_state=42)
     pca = PCA(n_components='mle').fit(X)
-    assert pca.n_components_ == 0
+    assert pca.n_components_ == 1


 def test_fit_mle_too_few_samples():
     # Tests that an error is raised when the number of samples is smaller
-    # than the number of features during an mle fit for PR #16224
+    # than the number of features during an mle fit
     X, _ = datasets.make_classification(n_samples=20, n_features=21,
                                         random_state=42)

@@ -623,3 +615,26 @@ def test_fit_mle_too_few_samples():
                                          "supported if "
                                          "n_samples >= n_features"):
         pca.fit(X)
+
+
+def test_mle_simple_case():
+    # non-regression test for issue
+    # https://github.com/scikit-learn/scikit-learn/issues/16730
+    n_samples, n_dim = 1000, 10
+    X = np.random.RandomState(0).randn(n_samples, n_dim)
+    X[:, -1] = np.mean(X[:, :-1], axis=-1)  # true X dim is n_dim - 1
+    pca_skl = PCA('mle', svd_solver='full')
+    pca_skl.fit(X)
+    assert pca_skl.n_components_ == n_dim - 1
+
+
+def test_assess_dimension_rank_one():
+    # Make sure _assess_dimension works properly on a matrix of rank 1
+    n_samples, n_features = 9, 6
+    X = np.ones((n_samples, n_features))  # rank-1 matrix
+    _, s, _ = np.linalg.svd(X, full_matrices=True)
+    assert sum(s[1:]) == 0  # except for rank 1, all singular values are 0
+
+    assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples))
+    for rank in range(2, n_features):
+        assert _assess_dimension(s, rank, n_samples) == -np.inf
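For reference, a minimal usage sketch of the updated helper signatures exercised above, _assess_dimension(spectrum, rank, n_samples) and _infer_dimension(spectrum, n_samples). It assumes a scikit-learn version where these private helpers live in sklearn.decomposition._pca with exactly these signatures (roughly 0.23 and later); being private, they may change without notice.

import numpy as np
from sklearn.decomposition._pca import _assess_dimension, _infer_dimension

rng = np.random.RandomState(0)
n_samples, n_features = 200, 5
X = rng.randn(n_samples, n_features)
# Eigenvalues of the sample covariance, i.e. what PCA exposes as explained_variance_
spectrum = np.linalg.svd(X - X.mean(axis=0), compute_uv=False) ** 2 / (n_samples - 1)

# Log-likelihood of each candidate rank in [1, n_features - 1] ...
ll = [_assess_dimension(spectrum, rank, n_samples) for rank in range(1, n_features)]
# ... and the rank that maximizes it, which is what PCA(n_components='mle') selects
print(_infer_dimension(spectrum, n_samples))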