From a0cc985dbfa5900477a80f35934d0e3445514af8 Mon Sep 17 00:00:00 2001 From: Pedro Date: Wed, 2 Sep 2020 22:51:56 -0300 Subject: [PATCH 1/5] Add generate classification function --- src/sklearn.jl | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/sklearn.jl b/src/sklearn.jl index d748d88..62a82f5 100644 --- a/src/sklearn.jl +++ b/src/sklearn.jl @@ -140,4 +140,40 @@ function generate_regression(; n_samples::Int = 100, return convert(features, labels) +end + +function generate_classification(; n_samples::Int = 100, + n_features::Int = 20, + n_informative::Int = 2, + n_redundant::Int = 2, + n_repeated::Int = 0, + n_classes::Int = 2, + n_clusters_per_class::Int = 2, + weights::Union{Nothing, Array{Float64,1}} = nothing, + flip_y::Float64 = 0.01, + class_sep::Float64 = 1.0, + hypercube::Bool = true, + shift::Union{Nothing, Array{Float64,1}} = 0.0, + scale::Union{Nothing, Array{Float64,1}} = 1.0, + shuffle::Bool = true, + random_state::Union{Int, Nothing} = nothing) + + + (features, labels) = datasets.make_classification( n_samples = n_samples, + n_features = n_features, + n_informative = n_informative, + n_redundant = n_redundant, + n_repeated = n_repeated, + n_classes = n_classes, + n_clusters_per_class = n_clusters_per_class, + weights = weights, + flip_y = flip_y, + class_sep = class_sep, + hypercube = hypercube, + shift = shift, + scale = scale, + shuffle = shuffle, + random_state = random_state) + + return convert(features, labels) end \ No newline at end of file From 2b03311752a6bbc0af0c3aee50eefe5e1f645b2a Mon Sep 17 00:00:00 2001 From: Pedro Date: Wed, 2 Sep 2020 22:52:53 -0300 Subject: [PATCH 2/5] Add generate classification docstring --- src/sklearn.jl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/sklearn.jl b/src/sklearn.jl index 62a82f5..faa5ded 100644 --- a/src/sklearn.jl +++ b/src/sklearn.jl @@ -142,6 +142,41 @@ function generate_regression(; n_samples::Int = 100, end 
+""" +function generate_classification(; n_samples::Int = 100, + n_features::Int = 20, + n_informative::Int = 2, + n_redundant::Int = 2, + n_repeated::Int = 0, + n_classes::Int = 2, + n_clusters_per_class::Int = 2, + weights::Union{Nothing, Array{Float64,1}} = nothing, + flip_y::Float64 = 0.01, + class_sep::Float64 = 1.0, + hypercube::Bool = true, + shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0, + scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0, + shuffle::Bool = true, + random_state::Union{Int, Nothing} = nothing) +Generate a random n-class classification problem. Sklearn interface to make_classification. +# Arguments +- `n_samples::Int = 100`: The number of samples. +- `n_features::Int = 20`: The total number of features. These comprise `n_informative` informative features, `n_redundant` redundant features, `n_repeated` duplicated features and `n_features-n_informative-n_redundant-n_repeated` useless features drawn at random. +- `n_informative::Int = 2`: The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices of a hypercube in a subspace of dimension `n_informative`. For each cluster, informative features are drawn independently from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then placed on the vertices of the hypercube. +- `n_redundant::Int = 2`: The number of redundant features. These features are generated as random linear combinations of the informative features. +- `n_repeated::Int = 0`: The number of duplicated features, drawn randomly from the informative and the redundant features. +- `n_classes::Int = 2`: The number of classes (or labels) of the classification problem. +- `n_clusters_per_class::Int = 2`: The number of clusters per class. +- `weights::Union{Nothing, Array{Float64,1}} = nothing`: The proportions of samples assigned to each class. If `nothing`, then classes are balanced. Note that if `length(weights) == n_classes - 1`, then the last class weight is automatically inferred. +- `flip_y::Float64 = 0.01`: The fraction of samples whose class is assigned randomly. 
Larger values introduce noise in the labels and make the classification task harder. Note that the default setting `flip_y > 0` might lead to fewer than `n_classes` distinct classes in the generated labels in some cases. +- `class_sep::Float64 = 1.0`: The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task easier. +- `hypercube::Bool = true`: If `true`, the clusters are put on the vertices of a hypercube. If `false`, the clusters are put on the vertices of a random polytope. +- `shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0`: Shift features by the specified value. If `nothing`, then features are shifted by a random value drawn in [-class_sep, class_sep]. +- `scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0`: Multiply features by the specified value. If `nothing`, then features are scaled by a random value drawn in [1, 100]. Note that scaling happens after shifting. +- `shuffle::Bool = true`: Shuffle the samples and the features. +- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an `Int` for reproducible output across multiple function calls. See the scikit-learn glossary entry for `random_state`. +Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html) +""" function generate_classification(; n_samples::Int = 100, n_features::Int = 20, n_informative::Int = 2, From ca7c00e0b41093f49f5e060679f6009d3168b61b Mon Sep 17 00:00:00 2001 From: Pedro Date: Wed, 2 Sep 2020 22:57:46 -0300 Subject: [PATCH 3/5] Add generate classification to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c14cf4f..fdd784b 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ make_blobs | Generate isotropic Gaussian blobs for clustering. make_moons | Make two interleaving half circles | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html) make_s_curve | Generate an S curve dataset. 
| [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html) make_regression | Generate a random regression problem. | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html]) +make_classification | Generate a random n-class classification problem. | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html) **Disclaimer**: SyntheticDatasets.jl borrows code and documentation from [scikit-learn](https://scikit-learn.org/stable/modules/classes.html#samples-generator) in the dataset module, but *it is not an official part From 9936c065b8e7f9363510173e4e474b6f1c851beb Mon Sep 17 00:00:00 2001 From: Pedro Date: Wed, 2 Sep 2020 23:05:10 -0300 Subject: [PATCH 4/5] Fix generate classification function --- src/sklearn.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sklearn.jl b/src/sklearn.jl index faa5ded..835ca6b 100644 --- a/src/sklearn.jl +++ b/src/sklearn.jl @@ -188,8 +188,8 @@ function generate_classification(; n_samples::Int = 100, flip_y::Float64 = 0.01, class_sep::Float64 = 1.0, hypercube::Bool = true, - shift::Union{Nothing, Array{Float64,1}} = 0.0, - scale::Union{Nothing, Array{Float64,1}} = 1.0, + shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0, + scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0, shuffle::Bool = true, random_state::Union{Int, Nothing} = nothing) From 11006b414d3bb8121931989cc7f618f9a27df846 Mon Sep 17 00:00:00 2001 From: Pedro Date: Wed, 2 Sep 2020 23:07:57 -0300 Subject: [PATCH 5/5] Add tests for the generate classification --- test/runtests.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index b30dd67..03f4919 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -36,4 +36,12 @@ using Test @test size(data)[1] == samples @test size(data)[2] == features + 1 + data = SyntheticDatasets.generate_classification(n_samples = samples, + n_features 
= features, + n_classes = 1) + + + @test size(data)[1] == samples + @test size(data)[2] == features + 1 + end \ No newline at end of file