
Commit 6010166

Reduced minibatch algo allocations & whitespace removal in various modules

1 parent baa3150

File tree: 5 files changed, +50 -39 lines

docs/src/index.md (+6 -6)
@@ -78,11 +78,11 @@ pkg> free ParallelKMeans
 - [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
 - [X] Support for weighted K-means.
 - [X] Support of MLJ Random generation hyperparameter.
+- [X] Implementation of [Mini-batch KMeans variant](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
-- [ ] Refactoring and finalization of API design.
-- [ ] GPU support.
+- [ ] GPU support?
 - [ ] Distributed calculations support.
 - [ ] Optimization of code base.
 - [ ] Improved Documentation
@@ -127,7 +127,7 @@ r.converged # whether the procedure converged
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
 - [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
-- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets.
+- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets, when extreme accuracy is not important.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)

 ### Practical Usage Examples
@@ -175,9 +175,9 @@ Currently, this package is benchmarked against similar implementations in both P

 *Note*: All benchmark tests are made on the same computer to help eliminate any bias.

-|PC Name                      |CPU                       |Ram               |
-|:---------------------------:|:------------------------:|:----------------:|
-|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|8 GB 2667 MHz DDR4|
+|PC Name                      |CPU                       |Ram                |
+|:---------------------------:|:------------------------:|:-----------------:|
+|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|24 GB 2667 MHz DDR4|

 Currently, the benchmark speed tests are based on the search for optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practitioners employing the K-Means algorithm.
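
As a quick orientation for the recommendation list above, a minimal sketch of how an algorithm choice maps onto the `kmeans` entry point; the data, `k = 3`, and the `MiniBatch(100)` batch size are illustrative assumptions, not values from this commit:

```julia
using ParallelKMeans

# Illustrative data: 5 features × 10_000 observations (columns are points).
X = rand(5, 10_000)

# Per the list above, MiniBatch targets extremely large datasets when
# extreme accuracy is not important; the batch size of 100 is an assumption.
r = kmeans(MiniBatch(100), X, 3)

r.converged  # whether the procedure converged (field shown in the hunk context above)
```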

src/kmeans.jl (+9 -9)

(whitespace-only cleanup: each -/+ pair below differs only in trailing whitespace)
@@ -170,15 +170,15 @@ alternatively one can use `rand` to choose random points for init.

 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
-                weights = nothing,
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
-                k_init = "k-means++",
+                k_init = "k-means++",
                 max_iters = 300,
                 tol = eltype(design_matrix)(1e-6),
                 verbose = false,
                 init = nothing,
-                rng = Random.GLOBAL_RNG,
+                rng = Random.GLOBAL_RNG,
                 metric = Euclidean())

     nrow, ncol = size(design_matrix)
@@ -187,12 +187,12 @@ function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)

     return kmeans!(alg, containers, design_matrix, k, weights, metric;
-                   n_threads = n_threads,
-                   k_init = k_init,
+                   n_threads = n_threads,
+                   k_init = k_init,
                    max_iters = max_iters,
-                   tol = tol,
-                   verbose = verbose,
-                   init = init,
+                   tol = tol,
+                   verbose = verbose,
+                   init = init,
                    rng = rng)

 end
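
This hunk is whitespace-only, but it conveniently displays the full keyword surface of `kmeans`. A sketch exercising those keywords with the defaults visible in the signature above (the `Lloyd()` choice, the seeded RNG, and the `Distances` import are assumptions):

```julia
using ParallelKMeans, Random
using Distances: Euclidean

X = rand(Float64, 3, 1_000)  # illustrative design matrix

# Each keyword mirrors a default from the signature above.
r = kmeans(Lloyd(), X, 4;
           weights   = nothing,
           n_threads = Threads.nthreads(),
           k_init    = "k-means++",
           max_iters = 300,
           tol       = 1e-6,
           verbose   = false,
           rng       = MersenneTwister(2020),
           metric    = Euclidean())
```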

src/mini_batch.jl (+31 -20)
@@ -22,30 +22,34 @@ function kmeans!(alg::MiniBatch, containers, X, k,
                  k_init = "k-means++", init = nothing, max_iters = 300,
                  tol = eltype(X)(1e-6), max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)

+    # Retrieve initialized artifacts from the container
+    centroids = containers.centroids_new
+    batch_rand_idx = containers.batch_rand_idx
+    labels = containers.labels
+
     # Get the type and dimensions of design matrix, X - (Step 1)
     T = eltype(X)
     nrow, ncol = size(X)

     # Initiate cluster centers - (Step 2) in paper
-    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : deepcopy(init)
+    centroids .= isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : deepcopy(init)

     # Initialize counter for the no. of data in each cluster - (Step 3) in paper
     N = zeros(T, k)

-    # Initialize nearest centers for both batch and whole dataset labels
+    # Initialize various artifacts
     converged = false
     niters = 1
     counter = 0
     J_previous = zero(T)
     J = zero(T)
     totalcost = zero(T)
-    batch_rand_idx = containers.batch_rand_idx
-
+
     # Main Steps. Batch update centroids until convergence
     while niters <= max_iters # Step 4 in paper

         # b examples picked randomly from X (Step 5 in paper)
-        batch_rand_idx = isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)
+        isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)

         # Cache/label the batch samples nearest to the centers (Step 6 & 7)
         @inbounds for i in batch_rand_idx
@@ -58,12 +62,12 @@ function kmeans!(alg::MiniBatch, containers, X, k,
                 min_dist = dist < min_dist ? dist : min_dist
             end

-            containers.labels[i] = label
+            labels[i] = label

             ##### Batch gradient step #####
             # iterate over examples (each column) ==> (Step 9)
-            # Get cached center/label for each example label = labels[i] => (Step 10)
-
+            # Get cached center/label for each example label = labels[i] => (Step 10)
+
             # Update per-center counts
             N[label] += isnothing(weights) ? 1 : weights[i] # (Step 11)

@@ -75,10 +79,10 @@ function kmeans!(alg::MiniBatch, containers, X, k,
         end

         # Reassign all labels based on new centres generated from the latest sample
-        containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+        labels .= reassign_labels(X, metric, labels, centroids)

         # Calculate cost on whole dataset after reassignment and check for convergence
-        @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+        @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
         J = sum(containers.sum_of_squares)

         if verbose
@@ -94,12 +98,12 @@ function kmeans!(alg::MiniBatch, containers, X, k,
             if counter >= max_no_improvement
                 converged = true
                 # Compute label assignment for the complete dataset
-                containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+                labels .= reassign_labels(X, metric, labels, centroids)

                 # Compute totalcost for the complete dataset
-                @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+                @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
                 totalcost = sum(containers.sum_of_squares)
-
+
                 # Print convergence message to user
                 if verbose
                     println("Successfully terminated with convergence.")
@@ -117,11 +121,13 @@ function kmeans!(alg::MiniBatch, containers, X, k,
             if verbose
                 println("Clustering model failed to converge. Labelling data with latest centroids.")
             end
-            containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+
+            labels .= reassign_labels(X, metric, labels, centroids)

             # Compute totalcost for unconverged model
-            @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+            @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
             totalcost = sum(containers.sum_of_squares)
+
             break
         end

@@ -130,10 +136,14 @@ function kmeans!(alg::MiniBatch, containers, X, k,
     end

     # Push learned artifacts to KmeansResult
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, labels, T[], Int[], T[], totalcost, niters, converged)
 end

+"""
+    reassign_labels(DMatrix, metric, labels, centres)

+An internal function to relabel DMatrix based on centres and metric.
+"""
 function reassign_labels(DMatrix, metric, labels, centres)
     @inbounds for i in axes(DMatrix, 2)
         min_dist = distance(metric, DMatrix, centres, i, 1)
@@ -156,17 +166,18 @@ end
 Internal function for the creation of all necessary intermediate structures.

 - `centroids_new` - container which holds new positions of centroids
-- `centroids_cnt` - container which holds number of points for each centroid
 - `labels` - vector which holds labels of corresponding points
 - `sum_of_squares` - vector which holds the sum of squares values for each thread
+- `batch_rand_idx` - vector which holds the selected batch indices
 """
 function create_containers(alg::MiniBatch, X, k, nrow, ncol, n_threads)
     # Initiate placeholders to avoid allocations
-    T = eltype(X)
+    T = eltype(X)
     labels = Vector{Int}(undef, ncol) # labels vector
     sum_of_squares = Vector{T}(undef, 1) # total_sum_calculation
-    batch_rand_idx = Vector{Int}(undef, alg.b)
+    batch_rand_idx = Vector{Int}(undef, alg.b) # selected batch indices
+    centroids_new = Matrix{T}(undef, nrow, k) # centroids

-    return (batch_rand_idx = batch_rand_idx,
+    return (batch_rand_idx = batch_rand_idx, centroids_new = centroids_new,
             labels = labels, sum_of_squares = sum_of_squares)
 end
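
The allocation savings in this file come down to one pattern: `centroids_new`, `batch_rand_idx`, and `labels` are allocated once in `create_containers` and refilled in place (`centroids .= …`, `rand!(rng, batch_rand_idx, 1:ncol)`) instead of being rebound to fresh arrays on every iteration. A self-contained sketch of that pattern, assuming nothing beyond the `Random` stdlib (this is not the package's internal code):

```julia
using Random

# Reuse one index buffer across iterations instead of allocating per pass,
# mirroring how batch_rand_idx is recycled in the loop above.
function sample_batches!(rng::AbstractRNG, ncol::Int, b::Int, iters::Int)
    batch_idx = Vector{Int}(undef, b)  # allocated once (cf. create_containers)
    touched = falses(ncol)
    for _ in 1:iters
        rand!(rng, batch_idx, 1:ncol)  # in-place refill: no new vector per iteration
        touched[batch_idx] .= true
    end
    return count(touched)              # how many columns were ever sampled
end

sample_batches!(MersenneTwister(1), 10_000, 100, 300)
```

The same reasoning favours `centroids .= …` over `centroids = …`: broadcasting writes into the preallocated `containers.centroids_new` buffer, so the gradient step keeps updating one stable matrix rather than a freshly allocated one each call.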

src/mlj_interface.jl (+1 -1)

(the -/+ pair below differs only in trailing whitespace)
@@ -10,7 +10,7 @@ const MLJDICT = Dict(:Lloyd => Lloyd(),
                      :Elkan => Elkan(),
                      :Yinyang => Yinyang(),
                      :Coreset => Coreset(),
-                     :阴阳 => Coreset(),
+                     :阴阳 => Coreset(),
                      :MiniBatch => MiniBatch())

 ####
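
`MLJDICT` maps user-facing `Symbol`s (including the `:阴阳` alias) to algorithm instances for the MLJ wrapper. A hedged sketch of that lookup; the qualified `ParallelKMeans.MLJDICT` access and the error handling are assumptions, not the wrapper's actual code:

```julia
using ParallelKMeans

# Illustrative: resolve a Symbol to a concrete algorithm via the table above.
algo = :MiniBatch
alg = get(ParallelKMeans.MLJDICT, algo, nothing)
alg === nothing && throw(ArgumentError("unsupported algorithm: $algo"))

r = kmeans(alg, rand(3, 500), 2)  # illustrative data
```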

test/test80_mlj_interface.jl (+3 -3)

(whitespace-only cleanup: each -/+ pair below differs only in trailing whitespace)
@@ -200,15 +200,15 @@ end
 @test report.totalcost ≈ 18.03007733451847

 params = fitted_params(model, results)
-@test all(params.cluster_centers .≈ [0.39739206832613827 0.4818900563319951;
-                                     0.7695625526281311 0.30986081763964723;
+@test all(params.cluster_centers .≈ [0.39739206832613827 0.4818900563319951;
+                                     0.7695625526281311 0.30986081763964723;
                                      0.6175496080776439 0.3911138270823586])

 # Use trained model to cluster new data X_test
 preds = transform(model, results, X_test)
 @test preds[:x1][1] ≈ 0.48848842207123555
 @test preds[:x2][1] ≈ 0.08355805256372761
-
+
 # Make predictions on new data X_test with fitted params
 yhat = predict(model, results, X_test)
 @test yhat == report.assignments[1:2]
