Merge pull request #11 from ATISLabs/feature/generate_regression_function

filipebraida · web-flow · commit e7f15d56d994 · 2020-09-02T01:01:39.000-03:00
[#1] - Feature/generate regression function
diff --git a/README.md b/README.md
@@ -30,6 +30,7 @@ Dataset         | Title
 make_blobs      | Generate isotropic Gaussian blobs for clustering.                      | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)
 make_moons      | Make two interleaving half circles                                     | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)
 make_s_curve    | Generate an S curve dataset.                                           | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html)
+make_regression | Generate a random regression problem.                                  | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html)
 
 **Disclaimer**: SyntheticDatasets.jl borrows code and documentation from
 [scikit-learn](https://scikit-learn.org/stable/modules/classes.html#samples-generator) in the dataset module, but *it is not an official part
diff --git a/src/sklearn.jl b/src/sklearn.jl
@@ -83,4 +83,61 @@ function generate_s_curve(; n_samples::Int = 100,
                                                 random_state = random_state)
     
     return convert(features, labels)
+end
+
+"""
+    generate_regression(;   n_samples::Int = 100,
+                            n_features::Int = 100,
+                            n_informative::Int = 10,
+                            n_targets::Int = 1,
+                            bias::Float64 = 0.0,
+                            effective_rank::Union{Int, Nothing} = nothing, 
+                            tail_strength::Float64 = 0.5, 
+                            noise::Float64 = 0.0, 
+                            shuffle::Bool = true, 
+                            coef::Bool = false, 
+                            random_state::Union{Int, Nothing}= nothing)
+Generate a random regression problem. Sklearn interface to `make_regression`: every keyword is forwarded unchanged and the first two returned values are passed to `convert`.
+# Arguments
+- `n_samples::Int = 100`: The number of samples.
+- `n_features::Int = 100`: The number of features.
+- `n_informative::Int = 10`: The number of informative features, i.e., the number of features used to build the linear model used to generate the output.
+- `n_targets::Int = 1`: The number of regression targets, i.e., the dimension of the y output vector associated with a sample. By default, the output is a scalar.
+- `bias::Float64 = 0.0`: The bias term in the underlying linear model.
+- `effective_rank::Union{Int, Nothing} = nothing`: If not `nothing`, the approximate number of singular vectors required to explain most of the input data by linear combinations. Using this kind of singular spectrum in the input allows the generator to reproduce the correlations often observed in practice. If `nothing`, the input set is well conditioned, centered and gaussian with unit variance.
+- `tail_strength::Float64 = 0.5`: The relative importance of the fat noisy tail of the singular values profile if `effective_rank` is not `nothing`.
+- `noise::Float64 = 0.0`: Standard deviation of Gaussian noise added to the data.
+- `shuffle::Bool = true`: Shuffle the samples and the features.
+- `coef::Bool = false`: Forwarded to `make_regression` as `coef`. NOTE(review): the call result is destructured into only two values (`features`, `labels`), so coefficients requested with `true` do not appear to reach the caller — confirm.
+- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation; pass an `Int` for reproducible output.
+Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html)
+"""
+function generate_regression(;  n_samples::Int = 100,
+                                n_features::Int = 100,
+                                n_informative::Int = 10,
+                                n_targets::Int = 1,
+                                bias::Float64 = 0.0,
+                                effective_rank::Union{Int, Nothing} = nothing,
+                                tail_strength::Float64 = 0.5,
+                                noise::Float64 = 0.0,
+                                shuffle::Bool = true,
+                                coef::Bool = false,
+                                random_state::Union{Int, Nothing}= nothing)
+
+    
+    (features, labels) = datasets.make_regression(  n_samples = n_samples,
+                                                    n_features = n_features,
+                                                    n_informative = n_informative, 
+                                                    n_targets = n_targets,
+                                                    bias = bias,
+                                                    effective_rank = effective_rank,
+                                                    tail_strength = tail_strength,
+                                                    noise = noise,
+                                                    shuffle = shuffle,
+                                                    coef = coef,
+                                                    random_state = random_state)
+    
+
+    return convert(features, labels)
+
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,6 +4,8 @@ using Test
 
 @testset "SkLearn Generators" begin
     samples = 20000
+    features = 20
+
     data = SyntheticDatasets.generate_blobs(centers = [-1 1;-0.5 0.75], 
                                             cluster_std = 0.225, 
                                             n_samples = 20000,
@@ -25,4 +27,13 @@ using Test
     @test size(data)[1] == samples
     @test size(data)[2] == 4
 
+
+    data = SyntheticDatasets.generate_regression(n_samples = samples,
+                                                 n_features = features,
+                                                 noise = 2.2,
+                                                 random_state = 5)
+
+    @test size(data)[1] == samples
+    @test size(data)[2] == features + 1
+
 end