Feature/generate_low_rank_matrix #16

Closed
wants to merge 2 commits into from
127 changes: 79 additions & 48 deletions src/sklearn.jl
@@ -1,40 +1,40 @@
"""
    generate_moons(; n_samples::Union{Tuple{Int, Int}, Int} = 100,
                     shuffle = true,
                     noise = nothing,
                     random_state = nothing)::DataFrame
Make two interleaving half circles. Sklearn interface to make_moons.
# Arguments
- `n_samples::Union{Tuple{Int, Int}, Int} = 100`: If int, the total number of points generated. If two-element tuple, number of points in each of two moons.
- `shuffle::Bool = true`: Whether to shuffle the samples.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)
"""
function generate_moons(;n_samples::Union{Tuple{Int, Int}, Int} = 100,
                         shuffle::Bool = true,
                         noise::Union{Nothing, Float64} = nothing,
                         random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_moons(n_samples = n_samples,
                                             shuffle = shuffle,
                                             noise = noise,
                                             random_state = random_state)

return convert(features, labels)
end
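
A minimal usage sketch for the function above (illustrative only, not part of the diff; it assumes the package is loaded and, as the package's tests do for the other generators, that the returned DataFrame holds the two feature columns plus a label column):

    using SyntheticDatasets
    # 100 points in the first moon and 50 in the second, with mild Gaussian noise
    df = SyntheticDatasets.generate_moons(n_samples = (100, 50), noise = 0.05, random_state = 42)
    size(df)    # expected (150, 3): two features plus the label column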

"""
    generate_blobs(; n_samples::Union{Int, Array{Int, 1}} = 100,
                     n_features::Int = 2,
                     centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing,
                     cluster_std::Union{Float64, Array{Float64, 1}} = 1.0,
                     center_box = (-10.0, 10.0),
                     shuffle::Bool = true,
                     random_state::Union{Int, Nothing} = nothing)::DataFrame
Generate isotropic Gaussian blobs for clustering. Sklearn interface to make_blobs.
# Arguments
- `n_samples = 100`: If int, it is the total number of points equally divided among clusters. If array-like, each element of the sequence indicates the number of samples per cluster.
- `n_features = 2`: The number of features for each sample.
- `centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing`: The number of centers to generate, or the fixed center locations. If n_samples is an int and centers is None, 3 centers are generated. If n_samples is array-like, centers must be either None or an array of length equal to the length of n_samples.
- `cluster_std::Union{Float64, Array{Float64, 1}} = 1.0`: The standard deviation of the clusters.
@@ -43,72 +43,72 @@ Generate isotropic Gaussian blobs for clustering. Sklearn interface to make_blob
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)
"""
function generate_blobs(;n_samples::Union{Int, Array{Int, 1}} = 100,
                         n_features::Int = 2,
                         centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing,
                         cluster_std::Union{Float64, Array{Float64, 1}} = 1.0,
                         center_box::Tuple{Float64, Float64} = (-10.0, 10.0),
                         shuffle::Bool = true,
                         random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_blobs(n_samples = n_samples,
                                             n_features = n_features,
                                             centers = centers,
                                             cluster_std = cluster_std,
                                             center_box = center_box,
                                             shuffle = shuffle,
                                             random_state = random_state,
                                             return_centers = false)

return convert(features, labels)
end
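
A usage sketch for generate_blobs (illustrative; the centers matrix and the expected column count mirror the existing test in test/runtests.jl):

    using SyntheticDatasets
    # two fixed 2-D centers with tight clusters
    df = SyntheticDatasets.generate_blobs(centers = [-1.0 1.0; -0.5 0.75],
                                          cluster_std = 0.225,
                                          n_samples = 500,
                                          center_box = (-1.5, 1.5))
    size(df)    # expected (500, 3): two features plus the label column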

"""
    generate_s_curve(; n_samples::Int = 100,
                       noise = nothing,
                       random_state = nothing)::DataFrame
Generate an S curve dataset. Sklearn interface to make_s_curve.
# Arguments
- `n_samples::Int = 100`: The number of sample points on the S curve.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html)
"""
function generate_s_curve(; n_samples::Int = 100,
                            noise::Float64 = 0.0,
                            random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_s_curve(n_samples = n_samples,
                                               noise = noise,
                                               random_state = random_state)

return convert(features, labels)
end
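
A usage sketch for generate_s_curve (illustrative; make_s_curve yields three spatial coordinates plus the position along the curve, which the package's convert helper is assumed to place in the label column):

    using SyntheticDatasets
    df = SyntheticDatasets.generate_s_curve(n_samples = 200, noise = 0.1, random_state = 7)
    size(df)    # expected (200, 4): x, y, z plus the curve position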

"""
    generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
                       shuffle::Bool = true,
                       noise::Union{Nothing, Float64} = nothing,
                       random_state::Union{Int, Nothing} = nothing,
                       factor::Float64 = 0.8)::DataFrame
Make a large circle containing a smaller circle in 2d. Sklearn interface to make_circles.
# Arguments
- `n_samples::Union{Int, Tuple{Int, Int}} = 100`: If int, it is the total number of points generated. For odd numbers, the inner circle will have one point more than the outer circle. If two-element tuple, number of points in outer circle and inner circle.
- `shuffle::Bool = true`: Whether to shuffle the samples.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise. Pass an int for reproducible output across multiple function calls.
- `factor::Float64 = 0.8`: Scale factor between inner and outer circle.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html)

"""
function generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
                            shuffle::Bool = true,
                            noise::Union{Nothing, Float64} = nothing,
                            random_state::Union{Int, Nothing} = nothing,
                            factor::Float64 = 0.8)::DataFrame

    (features, labels) = datasets.make_circles(n_samples = n_samples,
                                               shuffle = shuffle,
                                               noise = noise,
                                               random_state = random_state,
                                               factor = factor)
@@ -122,11 +122,11 @@ end
n_informative::Int = 10,
n_targets::Int = 1,
bias::Float64 = 0.0,
effective_rank::Union{Int, Nothing} = nothing,
tail_strength::Float64 = 0.5,
noise::Float64 = 0.0,
shuffle::Bool = true,
coef::Bool = false,
random_state::Union{Int, Nothing} = nothing)
Generate a random regression problem. Sklearn interface to make_regression.
# Arguments
@@ -157,7 +157,7 @@ function generate_regression(; n_samples::Int = 100,

(features, labels) = datasets.make_regression( n_samples = n_samples,
n_features = n_features,
n_informative = n_informative,
n_targets = n_targets,
bias = bias,
effective_rank = effective_rank,
@@ -181,10 +181,10 @@ function generate_classification(; n_samples::Int = 100,
weights::Union{Nothing, Array{Float64,1}} = nothing,
flip_y::Float64 = 0.01,
class_sep::Float64 = 1.0,
hypercube::Bool = true,
shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0,
scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0,
shuffle::Bool = true,
random_state::Union{Int, Nothing} = nothing)
Generate a random n-class classification problem. Sklearn interface to make_classification.
# Arguments
@@ -195,7 +195,7 @@ Generate a random n-class classification problem. Sklearn interface to make_clas
- `n_repeated::Int = 0`: The number of duplicated features, drawn randomly from the informative and the redundant features.
- `n_classes::Int = 2`: The number of classes (or labels) of the classification problem.
- `n_clusters_per_class::Int = 2`: The number of clusters per class.
- `weights::Union{Nothing, Array{Float64,1}} = nothing`: The proportions of samples assigned to each class. If nothing, classes are balanced.
- `flip_y::Float64 = 0.01`: The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification task harder. Note that the default setting flip_y > 0 might lead to less than n_classes in y in some cases.
- `class_sep::Float64 = 1.0`: The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task easier.
- `hypercube::Bool = true`: If True, the clusters are put on the vertices of a hypercube. If False, the clusters are put on the vertices of a random polytope.
@@ -215,10 +215,10 @@ function generate_classification(; n_samples::Int = 100,
weights::Union{Nothing, Array{Float64,1}} = nothing,
flip_y::Float64 = 0.01,
class_sep::Float64 = 1.0,
hypercube::Bool = true,
shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0,
scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0,
shuffle::Bool = true,
random_state::Union{Int, Nothing} = nothing)

(features, labels) = datasets.make_classification( n_samples = n_samples,
@@ -238,4 +238,35 @@ function generate_classification(; n_samples::Int = 100,
random_state = random_state)

return convert(features, labels)
end
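
A usage sketch for generate_classification (illustrative; the column count follows the pattern asserted in this PR's tests, i.e. n_features feature columns plus one label column):

    using SyntheticDatasets
    # binary problem with 5 features (defaults: 2 informative, 2 redundant)
    df = SyntheticDatasets.generate_classification(n_samples = 300,
                                                   n_features = 5,
                                                   n_classes = 2,
                                                   random_state = 1)
    size(df)    # expected (300, 6)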

"""
    generate_low_rank_matrix(; n_samples::Int = 100,
                               n_features::Int = 100,
                               effective_rank::Int = 10,
                               tail_strength::Float64 = 0.5,
                               random_state::Union{Int, Nothing} = nothing)::DataFrame
Generate a mostly low rank matrix with bell-shaped singular values. Sklearn interface to make_low_rank_matrix.
# Arguments
- `n_samples::Int = 100`: The number of samples.
- `n_features::Int = 100`: The number of features.
- `effective_rank::Int = 10`: The approximate number of singular vectors required to explain most of the data by linear combinations.
- `tail_strength::Float64 = 0.5`: The relative importance of the fat noisy tail of the singular values profile.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
"""

function generate_low_rank_matrix(; n_samples::Int = 100,
                                    n_features::Int = 100,
                                    effective_rank::Int = 10,
                                    tail_strength::Float64 = 0.5,
                                    random_state::Union{Int, Nothing} = nothing)::DataFrame

    # make_low_rank_matrix returns only the feature matrix; there are no labels to unpack.
    features = datasets.make_low_rank_matrix(n_samples = n_samples,
                                              n_features = n_features,
                                              effective_rank = effective_rank,
                                              tail_strength = tail_strength,
                                              random_state = random_state)

    # Wrap the matrix in a DataFrame with auto-generated column names
    # (assumes a DataFrames.jl version that supports the `:auto` constructor).
    return DataFrame(features, :auto)
end
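
A usage sketch for the new generator (illustrative; unlike the other generators it produces no label column, matching the test added below, so the result has exactly n_features columns):

    using SyntheticDatasets
    df = SyntheticDatasets.generate_low_rank_matrix(n_samples = 50,
                                                    n_features = 20,
                                                    effective_rank = 5,
                                                    random_state = 3)
    size(df)    # expected (50, 20)
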
19 changes: 14 additions & 5 deletions test/runtests.jl
@@ -6,11 +6,11 @@ using Test
samples = 20000
features = 20

    data = SyntheticDatasets.generate_blobs(centers = [-1 1;-0.5 0.75],
                                            cluster_std = 0.225,
                                            n_samples = 20000,
                                            center_box = (-1.5, 1.5))

@test size(data)[1] == samples
@test size(data)[2] == 3

@@ -44,8 +44,17 @@ using Test
n_features = features,
n_classes = 1)


@test size(data)[1] == samples
@test size(data)[2] == features + 1

data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
n_features = features,
effective_rank = 10,
tail_strength = 0.5,
random_state = 5)

@test size(data)[1] == samples
@test size(data)[2] == features

end