Merge remote-tracking branch 'origin/feature/generate_twospirals_function' into feature/generate_twospirals_function

Sergiorezende22 · Sergiorezende22 · commit eedc5174f249 · 2020-09-10T15:43:29.000-03:00
diff --git a/README.md b/README.md
@@ -25,13 +25,21 @@ The package has an interface for the dataset generator of the [ScikitLearn](http
 ### ScikitLearn
 List of package datasets:
 
-Dataset         | Title                                                                  | Reference
-----------------|------------------------------------------------------------------------|--------------------------------------------------
-make_blobs      | Generate isotropic Gaussian blobs for clustering.                      | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)
-make_moons      | Make two interleaving half circles                                     | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)
-make_s_curve    | Generate an S curve dataset.                                           | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html)
-make_regression | Generate a random regression problem.                                  | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html])
-make_classification | Generate a random n-class classification problem.                                  | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html])
+Dataset              | Title                                                                   | Reference
+---------------------|-------------------------------------------------------------------------|--------------------------------------------------
+make_blobs           | Generate isotropic Gaussian blobs for clustering.                       | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)
+make_moons           | Make two interleaving half circles                                      | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)
+make_s_curve         | Generate an S curve dataset.                                            | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html)
+make_regression      | Generate a random regression problem.                                   | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html)
+make_classification  | Generate a random n-class classification problem.                       | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html)
+make_friedman1       | Generate the “Friedman #1” regression problem.                          | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman1.html)
+make_friedman2       | Generate the “Friedman #2” regression problem.                          | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman2.html)
+make_friedman3       | Generate the “Friedman #3” regression problem.                          | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman3.html)
+make_circles         | Make a large circle containing a smaller circle in 2d                   | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html)
+make_regression      | Generate a random regression problem.                                   | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html)
+make_classification  | Generate a random n-class classification problem.                       | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html)
+make_low_rank_matrix | Generate a mostly low rank matrix with bell-shaped singular values.     | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
+make_swiss_roll      | Generate a swiss roll dataset.                                          | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_swiss_roll.html)
 
 **Disclaimer**: SyntheticDatasets.jl borrows code and documentation from
 [scikit-learn](https://scikit-learn.org/stable/modules/classes.html#samples-generator) in the dataset module, but *it is not an official part
diff --git a/src/sklearn.jl b/src/sklearn.jl
@@ -85,6 +85,37 @@ function generate_s_curve(; n_samples::Int = 100,
     return convert(features, labels)
 end
 
+"""
+    generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
+                       shuffle::Bool = true,
+                       noise::Union{Nothing, Float64} = nothing,
+                       random_state::Union{Int, Nothing} = nothing,
+                       factor::Float64 = 0.8)::DataFrame
+
+Make a large circle containing a smaller circle in 2d. Sklearn interface to make_circles.
+# Arguments
+- `n_samples::Union{Int, Tuple{Int, Int}} = 100`: If int, it is the total number of points generated. For odd numbers, the inner circle will have one point more than the outer circle. If two-element tuple, number of points in outer circle and inner circle.
+- `shuffle::Bool = true`: Whether to shuffle the samples.
+- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise. Pass an int for reproducible output across multiple function calls.
+- `factor::Float64 = 0.8`: Scale factor between inner and outer circle.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html)
+"""
+function generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
+                            shuffle::Bool = true,
+                            noise::Union{Nothing, Float64} = nothing, 
+                            random_state::Union{Int, Nothing} = nothing,
+                            factor::Float64 = 0.8)::DataFrame
+
+    (features, labels) = datasets.make_circles( n_samples = n_samples,
+                            shuffle = shuffle, 
+                            noise = noise,
+                            random_state = random_state,
+                            factor = factor)
+
+    return convert(features, labels)
+end
+
 """
     generate_regression(;   n_samples::Int = 100,
                             n_features::Int = 100,
@@ -124,7 +155,6 @@ function generate_regression(;  n_samples::Int = 100,
                                 coef::Bool = false,
                                 random_state::Union{Int, Nothing}= nothing)
 
-    
     (features, labels) = datasets.make_regression(  n_samples = n_samples,
                                                     n_features = n_features,
                                                     n_informative = n_informative, 
@@ -136,10 +166,8 @@ function generate_regression(;  n_samples::Int = 100,
                                                     shuffle = shuffle,
                                                     coef = coef,
                                                     random_state = random_state)
-    
 
     return convert(features, labels)
-
 end
 
 """
@@ -193,7 +221,6 @@ function generate_classification(;  n_samples::Int = 100,
                                     shuffle::Bool = true, 
                                     random_state::Union{Int, Nothing} = nothing)
 
-
     (features, labels) = datasets.make_classification(  n_samples = n_samples,
                                                         n_features = n_features,
                                                         n_informative = n_informative,
@@ -211,4 +238,126 @@ function generate_classification(;  n_samples::Int = 100,
                                                         random_state = random_state)
 
     return convert(features, labels)
-end
+end
+
+"""
+    generate_friedman1(; n_samples::Int = 100,
+                         n_features::Int = 10,
+                         noise::Float64 = 0.0,
+                         random_state::Union{Int, Nothing} = nothing)::DataFrame
+Generate the “Friedman #1” regression problem. Sklearn interface to make_friedman1.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `n_features::Int = 10`: The number of features. Should be at least 5.
+- `noise::Float64 = 0.0`: The standard deviation of the gaussian noise applied to the output.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman1.html)
+"""
+function generate_friedman1(;   n_samples::Int = 100,
+                                n_features::Int = 10,
+                                noise::Float64 = 0.0, 
+                                random_state::Union{Int, Nothing} = nothing)::DataFrame
+
+    (features, labels) = datasets.make_friedman1(   n_samples = n_samples,
+                                                    n_features = n_features,
+                                                    noise = noise, 
+                                                    random_state = random_state)
+
+    return convert(features, labels)
+end
+
+"""
+    generate_friedman2(; n_samples::Int = 100,
+                         noise::Float64 = 0.0,
+                         random_state::Union{Int, Nothing} = nothing)::DataFrame
+
+Generate the “Friedman #2” regression problem. Sklearn interface to make_friedman2.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `noise::Float64 = 0.0`: The standard deviation of the gaussian noise applied to the output.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman2.html)
+"""
+function generate_friedman2(;   n_samples::Int = 100,
+                                noise::Float64 = 0.0, 
+                                random_state::Union{Int, Nothing} = nothing)::DataFrame
+
+    (features, labels) = datasets.make_friedman2(   n_samples = n_samples,
+                                                    noise = noise, 
+                                                    random_state = random_state)
+
+    return convert(features, labels)
+end
+
+"""
+    generate_friedman3(; n_samples::Int = 100,
+                         noise::Float64 = 0.0,
+                         random_state::Union{Int, Nothing} = nothing)::DataFrame
+Generate the “Friedman #3” regression problem. Sklearn interface to make_friedman3.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `noise::Float64 = 0.0`: The standard deviation of the gaussian noise applied to the output.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset noise. Pass an int for reproducible output across multiple function calls.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_friedman3.html)
+"""
+function generate_friedman3(;   n_samples::Int = 100,
+                                noise::Float64 = 0.0, 
+                                random_state::Union{Int, Nothing} = nothing)::DataFrame
+
+    (features, labels) = datasets.make_friedman3(   n_samples = n_samples,
+                                                    noise = noise, 
+                                                    random_state = random_state)
+
+    return convert(features, labels)
+end
+
+"""
+    generate_low_rank_matrix(; n_samples::Int = 100,
+                               n_features::Int = 100,
+                               effective_rank::Int = 10,
+                               tail_strength::Float64 = 0.5,
+                               random_state::Union{Int, Nothing} = nothing)
+Generate a mostly low rank matrix with bell-shaped singular values. Sklearn interface to make_low_rank_matrix; the result is returned as produced by that call, without the `convert` step.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `n_features::Int = 100`: The number of features.
+- `effective_rank::Int = 10`: The approximate number of singular vectors required to explain most of the data by linear combinations.
+- `tail_strength::Float64 = 0.5`: The relative importance of the fat noisy tail of the singular values profile.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
+"""
+function generate_low_rank_matrix(; n_samples::Int = 100,
+                                    n_features::Int = 100,
+                                    effective_rank::Int = 10,
+                                    tail_strength::Float64 = 0.5,
+                                    random_state::Union{Int, Nothing} = nothing)
+
+    features = datasets.make_low_rank_matrix(n_samples = n_samples,
+                                                       n_features = n_features,
+                                                       effective_rank = effective_rank,
+                                                       tail_strength = tail_strength,
+                                                       random_state = random_state)
+   return features
+end
+
+"""
+    generate_swiss_roll(; n_samples::Int = 100,
+                          noise::Float64 = 0.0,
+                          random_state::Union{Int, Nothing} = nothing)
+Generate a swiss roll dataset. Sklearn interface to make_swiss_roll.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `noise::Float64 = 0.0`: Standard deviation of Gaussian noise added to the data.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_swiss_roll.html)
+"""
+function generate_swiss_roll(;  n_samples::Int = 100,
+                               noise::Float64 = 0.0,
+                               random_state::Union{Int,Nothing} = nothing)
+
+   (features, labels) = datasets.make_swiss_roll(  n_samples = n_samples,
+                                                   noise = noise,
+                                                   random_state = random_state)
+
+   return convert(features, labels)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -27,6 +27,10 @@ using Test
     @test size(data)[1] == samples
     @test size(data)[2] == 4
 
+    data = SyntheticDatasets.generate_circles(n_samples = samples)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == 3
 
     data = SyntheticDatasets.generate_regression(n_samples = samples,
                                                  n_features = features,
@@ -40,10 +44,43 @@ using Test
                                                     n_features = features,
                                                     n_classes = 1)
 
+    @test size(data)[1] == samples
+    @test size(data)[2] == features + 1
 
     @test size(data)[1] == samples
     @test size(data)[2] == features + 1
 
+    data = SyntheticDatasets.generate_friedman1(n_samples = samples,
+                                                    n_features = features)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == features + 1
+
+    data = SyntheticDatasets.generate_friedman2(n_samples = samples)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == 5
+
+    data = SyntheticDatasets.generate_friedman3(n_samples = samples)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == 5
+
+    data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
+                                                    n_features = features,
+                                                    effective_rank = 10,
+                                                    tail_strength = 0.5,
+                                                    random_state = 5)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == features
+    
+    data = SyntheticDatasets.generate_swiss_roll(n_samples =samples,
+                                                 noise = 2.2,
+                                                 random_state = 5)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == 4
 end
 
 @testset "Matlab Generators" begin
@@ -52,6 +89,5 @@ end
     data = SyntheticDatasets.generate_twospirals(n_samples = samples,
                                                  noise = 2.2)
 
-
     @test size(data)[1] == samples
 end