Feature/generate_low_rank_matrix #16

Closed
wants to merge 2 commits into from
127 changes: 79 additions & 48 deletions src/sklearn.jl
@@ -1,40 +1,40 @@
"""
    generate_moons(; n_samples::Union{Tuple{Int, Int}, Int} = 100,
                     shuffle = true,
                     noise = nothing,
                     random_state = nothing)::DataFrame
Make two interleaving half circles. Sklearn interface to make_moons.
# Arguments
- `n_samples::Union{Tuple{Int, Int}, Int} = 100`: If int, the total number of points generated. If two-element tuple, number of points in each of two moons.
- `shuffle::Bool = true`: Whether to shuffle the samples.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)
"""
function generate_moons(;n_samples::Union{Tuple{Int, Int}, Int} = 100,
                         shuffle::Bool = true,
                         noise::Union{Nothing, Float64} = nothing,
                         random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_moons(n_samples = n_samples,
                                             shuffle = shuffle,
                                             noise = noise,
                                             random_state = random_state)

return convert(features, labels)
end
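
A minimal usage sketch for the function above (illustrative only, not part of the diff; it assumes the package is loaded and, as the package's tests do for the other generators, that the returned DataFrame holds the two feature columns plus a label column):

    using SyntheticDatasets
    # 100 points in the first moon and 50 in the second, with mild Gaussian noise
    df = SyntheticDatasets.generate_moons(n_samples = (100, 50), noise = 0.05, random_state = 42)
    size(df)    # expected (150, 3): two features plus the label column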

"""
    generate_blobs(; n_samples::Union{Int, Array{Int, 1}} = 100,
                     n_features::Int = 2,
                     centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing,
                     cluster_std::Union{Float64, Array{Float64, 1}} = 1.0,
                     center_box = (-10.0, 10.0),
                     shuffle::Bool = true,
                     random_state::Union{Int, Nothing} = nothing)::DataFrame
Generate isotropic Gaussian blobs for clustering. Sklearn interface to make_blobs.
# Arguments
- `n_samples = 100`: If int, it is the total number of points equally divided among clusters. If array-like, each element of the sequence indicates the number of samples per cluster.
- `n_features = 2`: The number of features for each sample.
- `centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing`: The number of centers to generate, or the fixed center locations. If n_samples is an int and centers is None, 3 centers are generated. If n_samples is array-like, centers must be either None or an array of length equal to the length of n_samples.
- `cluster_std::Union{Float64, Array{Float64, 1}} = 1.0`: The standard deviation of the clusters.
@@ -43,72 +43,72 @@ Generate isotropic Gaussian blobs for clustering. Sklearn interface to make_blob
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)
"""
function generate_blobs(;n_samples::Union{Int, Array{Int, 1}} = 100,
                         n_features::Int = 2,
                         centers::Union{Int, Union{Nothing, Array{Float64, 2}}} = nothing,
                         cluster_std::Union{Float64, Array{Float64, 1}} = 1.0,
                         center_box::Tuple{Float64, Float64} = (-10.0, 10.0),
                         shuffle::Bool = true,
                         random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_blobs(n_samples = n_samples,
                                             n_features = n_features,
                                             centers = centers,
                                             cluster_std = cluster_std,
                                             center_box = center_box,
                                             shuffle = shuffle,
                                             random_state = random_state,
                                             return_centers = false)

return convert(features, labels)
end
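
A usage sketch for generate_blobs (illustrative; the centers matrix and the expected column count mirror the existing test in test/runtests.jl):

    using SyntheticDatasets
    # two fixed 2-D centers with tight clusters
    df = SyntheticDatasets.generate_blobs(centers = [-1.0 1.0; -0.5 0.75],
                                          cluster_std = 0.225,
                                          n_samples = 500,
                                          center_box = (-1.5, 1.5))
    size(df)    # expected (500, 3): two features plus the label column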

"""
    generate_s_curve(; n_samples::Int = 100,
                       noise = nothing,
                       random_state = nothing)::DataFrame
Generate an S curve dataset. Sklearn interface to make_s_curve.
# Arguments
- `n_samples::Int = 100`: The number of sample points on the S curve.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_s_curve.html)
"""
function generate_s_curve(; n_samples::Int = 100,
                            noise::Float64 = 0.0,
                            random_state::Union{Int, Nothing} = nothing)::DataFrame

    (features, labels) = datasets.make_s_curve(n_samples = n_samples,
                                               noise = noise,
                                               random_state = random_state)

return convert(features, labels)
end
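
A usage sketch for generate_s_curve (illustrative; make_s_curve yields three spatial coordinates plus the position along the curve, which the package's convert helper is assumed to place in the label column):

    using SyntheticDatasets
    df = SyntheticDatasets.generate_s_curve(n_samples = 200, noise = 0.1, random_state = 7)
    size(df)    # expected (200, 4): x, y, z plus the curve position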

"""
    generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
                       shuffle::Bool = true,
                       noise::Union{Nothing, Float64} = nothing,
                       random_state::Union{Int, Nothing} = nothing,
                       factor::Float64 = 0.8)::DataFrame
Make a large circle containing a smaller circle in 2d. Sklearn interface to make_circles.
# Arguments
- `n_samples::Union{Int, Tuple{Int, Int}} = 100`: If int, it is the total number of points generated. For odd numbers, the inner circle will have one point more than the outer circle. If two-element tuple, number of points in outer circle and inner circle.
- `shuffle::Bool = true`: Whether to shuffle the samples.
- `noise::Union{Nothing, Float64} = nothing`: Standard deviation of Gaussian noise added to the data.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset shuffling and noise. Pass an int for reproducible output across multiple function calls.
- `factor::Float64 = 0.8`: Scale factor between inner and outer circle.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html)

"""
function generate_circles(; n_samples::Union{Int, Tuple{Int, Int}} = 100,
                            shuffle::Bool = true,
                            noise::Union{Nothing, Float64} = nothing,
                            random_state::Union{Int, Nothing} = nothing,
                            factor::Float64 = 0.8)::DataFrame

    (features, labels) = datasets.make_circles(n_samples = n_samples,
                                               shuffle = shuffle,
                                               noise = noise,
                                               random_state = random_state,
                                               factor = factor)
@@ -122,11 +122,11 @@ end
n_informative::Int = 10,
n_targets::Int = 1,
bias::Float64 = 0.0,
effective_rank::Union{Int, Nothing} = nothing,
tail_strength::Float64 = 0.5,
noise::Float64 = 0.0,
shuffle::Bool = true,
coef::Bool = false,
random_state::Union{Int, Nothing} = nothing)
Generate a random regression problem. Sklearn interface to make_regression.
# Arguments
@@ -157,7 +157,7 @@ function generate_regression(; n_samples::Int = 100,

(features, labels) = datasets.make_regression( n_samples = n_samples,
n_features = n_features,
n_informative = n_informative,
n_targets = n_targets,
bias = bias,
effective_rank = effective_rank,
@@ -181,10 +181,10 @@ function generate_classification(; n_samples::Int = 100,
weights::Union{Nothing, Array{Float64,1}} = nothing,
flip_y::Float64 = 0.01,
class_sep::Float64 = 1.0,
hypercube::Bool = true,
shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0,
scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0,
shuffle::Bool = true,
random_state::Union{Int, Nothing} = nothing)
Generate a random n-class classification problem. Sklearn interface to make_classification.
# Arguments
@@ -195,7 +195,7 @@ Generate a random n-class classification problem. Sklearn interface to make_clas
- `n_repeated::Int = 0`: The number of duplicated features, drawn randomly from the informative and the redundant features.
- `n_classes::Int = 2`: The number of classes (or labels) of the classification problem.
- `n_clusters_per_class::Int = 2`: The number of clusters per class.
- `weights::Union{Nothing, Array{Float64,1}} = nothing`: The proportions of samples assigned to each class. If nothing, classes are balanced.
- `flip_y::Float64 = 0.01`: The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification task harder. Note that the default setting flip_y > 0 might lead to less than n_classes in y in some cases.
- `class_sep::Float64 = 1.0`: The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task easier.
- `hypercube::Bool = true`: If True, the clusters are put on the vertices of a hypercube. If False, the clusters are put on the vertices of a random polytope.
@@ -215,10 +215,10 @@ function generate_classification(; n_samples::Int = 100,
weights::Union{Nothing, Array{Float64,1}} = nothing,
flip_y::Float64 = 0.01,
class_sep::Float64 = 1.0,
hypercube::Bool = true,
shift::Union{Nothing, Float64, Array{Float64,1}} = 0.0,
scale::Union{Nothing, Float64, Array{Float64,1}} = 1.0,
shuffle::Bool = true,
random_state::Union{Int, Nothing} = nothing)

(features, labels) = datasets.make_classification( n_samples = n_samples,
@@ -238,4 +238,35 @@ function generate_classification(; n_samples::Int = 100,
random_state = random_state)

return convert(features, labels)
end
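
A usage sketch for generate_classification (illustrative; the column count follows the pattern asserted in this PR's tests, i.e. n_features feature columns plus one label column):

    using SyntheticDatasets
    # binary problem with 5 features (defaults: 2 informative, 2 redundant)
    df = SyntheticDatasets.generate_classification(n_samples = 300,
                                                   n_features = 5,
                                                   n_classes = 2,
                                                   random_state = 1)
    size(df)    # expected (300, 6)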

"""
    generate_low_rank_matrix(; n_samples::Int = 100,
                               n_features::Int = 100,
                               effective_rank::Int = 10,
                               tail_strength::Float64 = 0.5,
                               random_state::Union{Int, Nothing} = nothing)::DataFrame
Generate a mostly low rank matrix with bell-shaped singular values. Sklearn interface to make_low_rank_matrix.
# Arguments
- `n_samples::Int = 100`: The number of samples.
- `n_features::Int = 100`: The number of features.
- `effective_rank::Int = 10`: The approximate number of singular vectors required to explain most of the data by linear combinations.
- `tail_strength::Float64 = 0.5`: The relative importance of the fat noisy tail of the singular values profile.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
"""

function generate_low_rank_matrix(; n_samples::Int = 100,
                                    n_features::Int = 100,
                                    effective_rank::Int = 10,
                                    tail_strength::Float64 = 0.5,
                                    random_state::Union{Int, Nothing} = nothing)::DataFrame

    # make_low_rank_matrix returns only the feature matrix; there are no labels to unpack.
    features = datasets.make_low_rank_matrix(n_samples = n_samples,
                                              n_features = n_features,
                                              effective_rank = effective_rank,
                                              tail_strength = tail_strength,
                                              random_state = random_state)

    # Wrap the matrix in a DataFrame with auto-generated column names
    # (assumes a DataFrames.jl version that supports the `:auto` constructor).
    return DataFrame(features, :auto)
end
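
A usage sketch for the new generator (illustrative; unlike the other generators it produces no label column, matching the test added below, so the result has exactly n_features columns):

    using SyntheticDatasets
    df = SyntheticDatasets.generate_low_rank_matrix(n_samples = 50,
                                                    n_features = 20,
                                                    effective_rank = 5,
                                                    random_state = 3)
    size(df)    # expected (50, 20)
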
19 changes: 14 additions & 5 deletions test/runtests.jl
@@ -6,11 +6,11 @@ using Test
samples = 20000
features = 20

    data = SyntheticDatasets.generate_blobs(centers = [-1 1;-0.5 0.75],
                                            cluster_std = 0.225,
                                            n_samples = 20000,
                                            center_box = (-1.5, 1.5))

@test size(data)[1] == samples
@test size(data)[2] == 3

@@ -44,8 +44,17 @@ using Test
n_features = features,
n_classes = 1)


@test size(data)[1] == samples
@test size(data)[2] == features + 1

data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
n_features = features,
effective_rank = 10,
tail_strength = 0.5,
random_state = 5)

@test size(data)[1] == samples
@test size(data)[2] == features

end