Merge pull request #18 from ATISLabs/feature/generate_low_rank_matrix_2

filipebraida · web-flow · commit f1818722b73f · 2020-09-05T14:24:39.000-03:00
Feature/Generate_Low_Rank_Matrix_2
diff --git a/src/sklearn.jl b/src/sklearn.jl
@@ -240,6 +240,35 @@ function generate_classification(;  n_samples::Int = 100,
     return convert(features, labels)
 end
 
+"""
+function generate_low_rank_matrix(; n_samples::Int = 100,
+                                    n_features::Int = 100,
+                                    effective_rank::Int = 10,
+                                    tail_strength::Float64 = 0.5,
+                                    random_state::Union{Int, Nothing} = nothing)
+Generate a mostly low-rank matrix with bell-shaped singular values.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `n_features::Int = 100`: The number of features (the second dimension of the generated matrix).
+- `effective_rank::Int = 10`: The approximate number of singular vectors required to explain most of the data by linear combinations.
+- `tail_strength::Float64 = 0.5`: The relative importance of the fat noisy tail of the singular values profile.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. See the `random_state` entry in the scikit-learn glossary.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
+"""
+function generate_low_rank_matrix(; n_samples::Int = 100,
+                                    n_features::Int = 100,
+                                    effective_rank::Int = 10,
+                                    tail_strength::Float64 = 0.5,
+                                    random_state::Union{Int, Nothing} = nothing)
+    # Keywords pass through unchanged; the result is returned as-is (no `convert` call here).
+    features = datasets.make_low_rank_matrix(n_samples = n_samples,
+                                                       n_features = n_features,
+                                                       effective_rank = effective_rank,
+                                                       tail_strength = tail_strength,
+                                                       random_state = random_state)
+   return features
+end
+
 """
 function generate_swiss_roll(;  n_samples::Int = 100,
                                noise::Float64 = 0.0,
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -47,6 +47,15 @@ using Test
     @test size(data)[1] == samples
     @test size(data)[2] == features + 1
 
+    data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
+                                                    n_features = features,
+                                                    effective_rank = 10,
+                                                    tail_strength = 0.5,
+                                                    random_state = 5)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == features
+    
     data = SyntheticDatasets.generate_swiss_roll(n_samples =samples,
                                                  noise = 2.2,
                                                  random_state = 5)