Skip to content

Commit a51a6c0

Browse files
committed
Merge branch 'master' into feature/generate_hastie_10_2
2 parents cce693a + 2388f1b commit a51a6c0

File tree

5 files changed

+107
-11
lines changed

5 files changed

+107
-11
lines changed

Diff for: README.md

+7
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,18 @@ make_classification | Generate a random n-class classification problem.
4141
make_low_rank_matrix | Generate a mostly low rank matrix with bell-shaped singular values. | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html)
4242
make_swiss_roll | Generate a swiss roll dataset. | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_swiss_roll.html)
4343
make_hastie_10_2 | Generates data for binary classification used in Hastie et al. |[link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_hastie_10_2.html)
44+
make_gaussian_quantiles | Generate isotropic Gaussian and label samples by quantile. | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_gaussian_quantiles.html)
4445

4546
**Disclaimer**: SyntheticDatasets.jl borrows code and documentation from
4647
[scikit-learn](https://scikit-learn.org/stable/modules/classes.html#samples-generator) in the dataset module, but *it is not an official part
4748
of that project*. It is licensed under [MIT](LICENSE).
4849

50+
### Other Functions
51+
52+
Dataset | Title | Reference
53+
---------------------|-------------------------------------------------------------------------|--------------------------------------------------
54+
generate_twospirals | Generate two spirals dataset. | [link](https://la.mathworks.com/matlabcentral/fileexchange/41459-6-functions-for-generating-artificial-datasets)
55+
4956
[travis-img]: https://travis-ci.com/ATISLabs/SyntheticDatasets.jl.svg?branch=master
5057
[travis-url]: https://travis-ci.com/ATISLabs/SyntheticDatasets.jl
5158

Diff for: src/SyntheticDatasets.jl

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ function __init__()
1111
end
1212

1313
include("sklearn.jl")
14+
include("matlab.jl")
1415

1516
function convert(features::Array{T, 2}, labels::Array{D, 1})::DataFrame where {T <: Number, D <: Number}
1617
df = DataFrame()

Diff for: src/matlab.jl

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
"""
    generate_twospirals(; n_samples::Int = 2000,
                          start_degrees::Int = 90,
                          total_degrees::Int = 570,
                          noise::Float64 = 0.2)

Generate the two spirals dataset: two interleaved spiral arms, one per class.

Every sample has an X and a Y coordinate and a class label (`0` for the first arm,
`1` for the second). The feature matrix and the label vector are handed to `convert`,
and its result (one row per sample) is returned.

# Arguments
- `n_samples::Int = 2000`: The total number of points generated. The first arm receives
  `fld(n_samples, 2)` points and the second arm receives the remaining ones.
- `start_degrees::Int = 90`: Determines how far from the origin the spirals start.
- `total_degrees::Int = 570`: Controls the length of the spirals.
- `noise::Float64 = 0.2`: Scale of the uniform random offset added to each coordinate.

Reference: [link](https://la.mathworks.com/matlabcentral/fileexchange/41459-6-functions-for-generating-artificial-datasets)
"""
function generate_twospirals(; n_samples::Int = 2000,
                               start_degrees::Int = 90,
                               total_degrees::Int = 570,
                               noise::Float64 = 0.2)
    # Keep the radian values in their own locals so `start_degrees` never changes type.
    start_rad = deg2rad(start_degrees)
    sweep_rad = deg2rad(total_degrees)

    # Split the samples between the two arms; an odd count gives the extra point to arm 2.
    n_first = fld(n_samples, 2)
    n_second = n_samples - n_first

    # First arm. The angle is also used as the radius, so the curve moves away from the
    # origin as the angle grows. `sqrt` of a uniform draw favours larger angles.
    theta = start_rad .+ sqrt.(rand(n_first, 1)) .* sweep_rad
    x_first = -cos.(theta) .* theta .+ rand(n_first, 1) .* noise
    y_first = sin.(theta) .* theta .+ rand(n_first, 1) .* noise

    # Second arm: the first arm mirrored through the origin (before noise is added).
    theta = start_rad .+ sqrt.(rand(n_second, 1)) .* sweep_rad
    x_second = cos.(theta) .* theta .+ rand(n_second, 1) .* noise
    y_second = -sin.(theta) .* theta .+ rand(n_second, 1) .* noise

    features = [hcat(x_first, y_first); hcat(x_second, y_second)]

    # One label per feature row: the second arm has `n_second` points, not `n_first`.
    labels = [zeros(Int, n_first); ones(Int, n_second)]

    return convert(features, labels)
end

Diff for: src/sklearn.jl

+39
Original file line numberDiff line numberDiff line change
@@ -379,3 +379,42 @@ function generate_hastie_10_2(; n_samples::Int = 12000,
379379

380380
return convert(features, labels)
381381
end
382+
"""
    generate_gaussian_quantiles(; mean::Union{Array{<:Number, 1}, Nothing} = nothing,
                                  cov::Real = 1.0,
                                  n_samples::Int = 100,
                                  n_features::Int = 2,
                                  n_classes::Int = 3,
                                  shuffle::Bool = true,
                                  random_state::Union{Int, Nothing} = nothing)

Generate isotropic Gaussian and label samples by quantile.

This forwards every keyword to `datasets.make_gaussian_quantiles` and passes the
returned features and labels to `convert`, whose result is returned.

# Arguments
- `mean::Union{Array{<:Number, 1}, Nothing} = nothing`: The mean of the multi-dimensional normal distribution. If `nothing` then use the origin (0, 0, …).
- `cov::Real = 1.0`: The covariance matrix will be this value times the unit matrix. Any real number is accepted and converted to `Float64` before being forwarded.
- `n_samples::Int = 100`: The total number of points equally divided among classes.
- `n_features::Int = 2`: The number of features for each sample.
- `n_classes::Int = 3`: The number of classes.
- `shuffle::Bool = true`: Shuffle the samples.
- `random_state::Union{Int, Nothing} = nothing`: Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.

Reference: [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_gaussian_quantiles.html)
"""
function generate_gaussian_quantiles(; mean::Union{Array{<:Number, 1}, Nothing} = nothing,
                                       cov::Real = 1.0,
                                       n_samples::Int = 100,
                                       n_features::Int = 2,
                                       n_classes::Int = 3,
                                       shuffle::Bool = true,
                                       random_state::Union{Int, Nothing} = nothing)
    # Keyword type annotations assert rather than convert, so `cov` is declared as `Real`
    # (making `cov = 1` valid) and normalised here; the backend still receives a Float64.
    (features, labels) = datasets.make_gaussian_quantiles(mean = mean,
                                                          cov = Float64(cov),
                                                          n_samples = n_samples,
                                                          n_features = n_features,
                                                          n_classes = n_classes,
                                                          shuffle = shuffle,
                                                          random_state = random_state)

    return convert(features, labels)
end

Diff for: test/runtests.jl

+27-11
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,15 @@ using Test
4040
@test size(data)[1] == samples
4141
@test size(data)[2] == features + 1
4242

43-
data = SyntheticDatasets.generate_classification(n_samples = samples,
44-
n_features = features,
45-
n_classes = 1)
43+
data = SyntheticDatasets.generate_classification( n_samples = samples,
44+
n_features = features,
45+
n_classes = 1)
4646

4747
@test size(data)[1] == samples
4848
@test size(data)[2] == features + 1
4949

5050
data = SyntheticDatasets.generate_friedman1(n_samples = samples,
51-
n_features = features)
51+
n_features = features)
5252

5353
@test size(data)[1] == samples
5454
@test size(data)[2] == features + 1
@@ -63,11 +63,11 @@ using Test
6363
@test size(data)[1] == samples
6464
@test size(data)[2] == 5
6565

66-
data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
67-
n_features = features,
68-
effective_rank = 10,
69-
tail_strength = 0.5,
70-
random_state = 5)
66+
data = SyntheticDatasets.generate_low_rank_matrix( n_samples = samples,
67+
n_features = features,
68+
effective_rank = 10,
69+
tail_strength = 0.5,
70+
random_state = 5)
7171

7272
@test size(data)[1] == samples
7373
@test size(data)[2] == features
@@ -79,9 +79,25 @@ using Test
7979
@test size(data)[1] == samples
8080
@test size(data)[2] == 4
8181

82-
data = SyntheticDatasets.generate_hastie_10_2(n_samples = samples,
83-
random_state = 5)
82+
data = SyntheticDatasets.generate_hastie_10_2( n_samples = samples,
83+
random_state = 5)
8484

8585
@test size(data)[1] == samples
8686
@test size(data)[2] == 11
87+
88+
data = SyntheticDatasets.generate_gaussian_quantiles( n_samples = samples,
89+
n_features = features,
90+
random_state = 5)
91+
92+
@test size(data)[1] == samples
93+
@test size(data)[2] == features + 1
94+
end
95+
# Smoke test for the generators ported from MATLAB: the two-spirals generator must
# produce exactly as many rows as requested.
@testset "Matlab Generators" begin
    n_rows = 20000
    spirals = SyntheticDatasets.generate_twospirals(; n_samples = n_rows, noise = 2.2)

    @test size(spirals, 1) == n_rows
end

0 commit comments

Comments
 (0)