Skip to content

Commit e0cd5f5

Browse files
authored
Merge pull request #72 from Arkoniak/coresets
Added Coresets algorithm
2 parents e95461b + 9e6a174 commit e0cd5f5

19 files changed

+530
-132
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "ParallelKMeans"
22
uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
33
authors = ["Bernard Brenyah", "Andrey Oskin"]
4-
version = "0.1.5"
4+
version = "0.1.6"
55

66
[deps]
77
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

docs/src/index.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,16 @@ git checkout experimental
7272
- [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
7373
- [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
7474
- [X] Full Implementation of Triangle inequality based on [Elkan - 2003 Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
75-
- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
75+
- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
76+
- [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
7677
- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
78+
- [X] Support for weighted K-means.
7779
- [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
7880
- [ ] Support of MLJ Random generation hyperparameter.
7981
- [ ] Native support for tabular data inputs outside of MLJModels' interface.
8082
- [ ] Refactoring and finalizaiton of API desgin.
8183
- [ ] GPU support.
8284
- [ ] Distributed calculations support.
83-
- [ ] Implementation of other K-Means algorithm variants based on recent literature.
8485
- [ ] Optimization of code base.
8586
- [ ] Improved Documentation
8687
- [ ] More benchmark tests.
@@ -123,6 +124,7 @@ r.converged # whether the procedure converged
123124
- [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster) - Hamerly is good for moderate number of clusters (< 50?) and moderate dimensions (<100?).
124125
- [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
125126
- [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
127+
- [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
126128
- [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
127129
- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
128130

@@ -204,6 +206,7 @@ ________________________________________________________________________________
204206
- 0.1.3 Faster & optimized execution.
205207
- 0.1.4 Bug fixes.
206208
- 0.1.5 Added `Yinyang` algorithm.
209+
- 0.1.6 Added support for weighted k-means; Added `Coreset` algorithm; improved support for different types of the design matrix.
207210

208211
## Contributing
209212

src/ParallelKMeans.jl

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,16 @@ import Distances
77

88
const MMI = MLJModelInterface
99

10-
include("seeding.jl")
1110
include("kmeans.jl")
11+
include("seeding.jl")
1212
include("lloyd.jl")
1313
include("hamerly.jl")
1414
include("elkan.jl")
1515
include("yinyang.jl")
1616
include("mlj_interface.jl")
17+
include("coreset.jl")
1718

1819
export kmeans
19-
export Lloyd, Hamerly, Elkan, Yinyang
20+
export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
2021

2122
end # module

src/coreset.jl

+206
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
"""
2+
Coreset()
3+
4+
Coreset algorithm implementation, based on "Lucic, Mario & Bachem,
5+
Olivier & Krause, Andreas. (2015). Strong Coresets for Hard and Soft Bregman
6+
Clustering with Applications to Exponential Family Mixtures."
7+
8+
`Coreset` supports following arguments:
9+
- `m`: default 100, subsample size
10+
- `alg`: default `Lloyd()`, algorithm used to clusterize sample
11+
12+
It can be used directly in `kmeans` function
13+
14+
```julia
15+
X = rand(30, 100_000) # 100_000 random points in 30 dimensions
16+
17+
# 3 clusters, Coreset algorithm with default Lloyd algorithm and 100 subsamples
18+
kmeans(Coreset(), X, 3)
19+
20+
# 3 clusters, Coreset algorithm with Hamerly algorithm and 500 subsamples
21+
kmeans(Coreset(m = 500, alg = Hamerly()), X, 3)
22+
kmeans(Coreset(500, Hamerly()), X, 3)
23+
24+
# alternatively short form can be used for defining subsample size or algorithm only
25+
kmeans(Coreset(500), X, 3) # sample of the size 500, Lloyd clustering algorithm
26+
kmeans(Coreset(Hamerly()), X, 3) # sample of the size 100, Hamerly clustering algorithm
27+
```
28+
"""
29+
struct Coreset{T <: AbstractKMeansAlg} <: AbstractKMeansAlg
30+
m::Int
31+
alg::T
32+
end
33+
34+
Coreset(; m = 100, alg = Lloyd()) = Coreset(m, alg)
35+
Coreset(m::Int) = Coreset(m, Lloyd())
36+
Coreset(alg::AbstractKMeansAlg) = Coreset(100, alg)
37+
38+
function kmeans!(alg::Coreset, containers, X, k, weights;
39+
n_threads = Threads.nthreads(),
40+
k_init = "k-means++", max_iters = 300,
41+
tol = eltype(design_matrix)(1e-6), verbose = false, init = nothing)
42+
nrow, ncol = size(X)
43+
centroids = isnothing(init) ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
44+
45+
T = eltype(X)
46+
# Steps 2-4 of the paper's algorithm 3
47+
# We distribute points over the centers and calculate weights of each cluster
48+
@parallelize n_threads ncol chunk_fit(alg, containers, centroids, X, weights)
49+
50+
# after this step, containers.centroids_new
51+
collect_containers(alg, containers, n_threads)
52+
53+
# step 7 of the algorithm 3
54+
@parallelize n_threads ncol chunk_update_sensitivity(alg, containers)
55+
56+
# sample from containers.s
57+
coreset_ids = wsample(1:ncol, containers.s, alg.m)
58+
coreset = X[:, coreset_ids]
59+
# create new weights as 1/s[i]
60+
coreset_weights = one(T) ./ @view containers.s[coreset_ids]
61+
62+
# run usual kmeans for new set with new weights.
63+
res = kmeans(alg.alg, coreset, k, coreset_weights, tol = tol, max_iters = max_iters,
64+
verbose = verbose, init = centroids, n_threads = n_threads)
65+
66+
@parallelize n_threads ncol chunk_apply(alg, containers, res.centers, X, weights)
67+
68+
totalcost = sum(containers.totalcost)
69+
70+
return KmeansResult(res.centers, containers.labels, T[], Int[], T[], totalcost, res.iterations, res.converged)
71+
end
72+
73+
function create_containers(alg::Coreset, X, k, nrow, ncol, n_threads)
74+
T = eltype(X)
75+
76+
centroids_cnt = Vector{Vector{T}}(undef, n_threads)
77+
centroids_dist = Vector{Vector{T}}(undef, n_threads)
78+
79+
# sensitivity
80+
81+
for i in 1:n_threads
82+
centroids_cnt[i] = zeros(T, k)
83+
centroids_dist[i] = zeros(T, k)
84+
end
85+
86+
labels = Vector{Int}(undef, ncol)
87+
s = Vector{T}(undef, ncol)
88+
89+
# J is the same as $c_\phi$ in the paper.
90+
J = Vector{T}(undef, n_threads)
91+
92+
alpha = 16 * (log(k) + 2)
93+
94+
centroids_const = Vector{T}(undef, k)
95+
96+
# total_sum_calculation
97+
totalcost = Vector{T}(undef, n_threads)
98+
99+
return (
100+
centroids_cnt = centroids_cnt,
101+
centroids_dist = centroids_dist,
102+
s = s,
103+
labels = labels,
104+
totalcost = totalcost,
105+
J = J,
106+
centroids_const = centroids_const,
107+
alpha = alpha
108+
)
109+
end
110+
111+
function chunk_fit(alg::Coreset, containers, centroids, X, weights, r, idx)
112+
centroids_cnt = containers.centroids_cnt[idx]
113+
centroids_dist = containers.centroids_dist[idx]
114+
labels = containers.labels
115+
s = containers.s
116+
T = eltype(X)
117+
118+
J = zero(T)
119+
for i in r
120+
dist = distance(X, centroids, i, 1)
121+
label = 1
122+
for j in 2:size(centroids, 2)
123+
new_dist = distance(X, centroids, i, j)
124+
125+
# calculation of the closest center (steps 2-3 of the paper's algorithm 3)
126+
label = new_dist < dist ? j : label
127+
dist = new_dist < dist ? new_dist : dist
128+
end
129+
labels[i] = label
130+
131+
# calculation of the $c_\phi$ (step 4)
132+
# Note: $d_A(x', B) = min_{b \in B} d_A(x', b)$
133+
# Not exactly sure about whole `weights` thing, needs further investigation
134+
# for Nothing `weights` (default) it'll work as intendent
135+
centroids_cnt[label] += isnothing(weights) ? one(T) : weights[i]
136+
centroids_dist[label] += isnothing(weights) ? dist : weights[i] * dist
137+
J += dist
138+
139+
# for now we write dist to sensitivity, update it later
140+
s[i] = dist
141+
end
142+
143+
containers.J[idx] = J
144+
end
145+
146+
function collect_containers(::Coreset, containers, n_threads)
147+
# Here we transform formula of the step 6
148+
# By multiplying both sides of equation on $c_\phi / \alpha$ we obtain
149+
# $s(x) <- d_A(x, B) + 2 \sum d_A(x, B) / |B_i| + 4 c_\phi |\Chi| / (|B_i| * \alpha)$
150+
# Taking into account that $c_\phi = 1/|\Chi| \sum d_A(x', B) = J / |\Chi|$ we get
151+
# $s(x) <- d_A(x, B) + 2 \sum d_A(x, B) / |B_i| + 4 J / \alpha * 1/ |B_i|$
152+
153+
alpha = containers.alpha
154+
centroids_const = containers.centroids_const
155+
156+
centroids_cnt = containers.centroids_cnt[1]
157+
centroids_dist = containers.centroids_dist[1]
158+
J = containers.J[1]
159+
160+
@inbounds for i in 2:n_threads
161+
centroids_cnt .+= containers.centroids_cnt[i]
162+
centroids_dist .+= containers.centroids_dist[i]
163+
J += containers.J[i]
164+
end
165+
166+
J = 4 * J / alpha
167+
168+
for i in 1:length(centroids_dist)
169+
centroids_const[i] = 2 * centroids_dist[i] / centroids_cnt[i] +
170+
J / centroids_cnt[i]
171+
end
172+
end
173+
174+
function chunk_update_sensitivity(alg::Coreset, containers, r, idx)
175+
labels = containers.labels
176+
centroids_const = containers.centroids_const
177+
s = containers.s
178+
179+
@inbounds for i in r
180+
s[i] += centroids_const[labels[i]]
181+
end
182+
end
183+
184+
function chunk_apply(alg::Coreset, containers, centroids, X, weights, r, idx)
185+
centroids_cnt = containers.centroids_cnt[idx]
186+
centroids_dist = containers.centroids_dist[idx]
187+
labels = containers.labels
188+
T = eltype(X)
189+
190+
J = zero(T)
191+
for i in r
192+
dist = distance(X, centroids, i, 1)
193+
label = 1
194+
for j in 2:size(centroids, 2)
195+
new_dist = distance(X, centroids, i, j)
196+
197+
# calculation of the closest center (steps 2-3 of the paper's algorithm 3)
198+
label = new_dist < dist ? j : label
199+
dist = new_dist < dist ? new_dist : dist
200+
end
201+
labels[i] = label
202+
J += isnothing(weights) ? dist : weights[i] * dist
203+
end
204+
205+
containers.totalcost[idx] = J
206+
end

src/elkan.jl

+13-13
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ kmeans(Elkan(), X, 3) # 3 clusters, Elkan algorithm
1818
"""
1919
struct Elkan <: AbstractKMeansAlg end
2020

21-
function kmeans!(alg::Elkan, containers, X, k;
21+
function kmeans!(alg::Elkan, containers, X, k, weights;
2222
n_threads = Threads.nthreads(),
2323
k_init = "k-means++", max_iters = 300,
2424
tol = eltype(X)(1e-6), verbose = false, init = nothing)
2525
nrow, ncol = size(X)
26-
centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
26+
centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
2727

2828
update_containers(alg, containers, centroids, n_threads)
29-
@parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X)
29+
@parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
3030

3131
T = eltype(X)
3232
converged = false
@@ -37,7 +37,7 @@ function kmeans!(alg::Elkan, containers, X, k;
3737
while niters < max_iters
3838
niters += 1
3939
# Core iteration
40-
@parallelize n_threads ncol chunk_update_centroids(alg, containers, centroids, X)
40+
@parallelize n_threads ncol chunk_update_centroids(alg, containers, centroids, X, weights)
4141

4242
# Collect distributed containers (such as centroids_new, centroids_cnt)
4343
# in paper it is step 4
@@ -70,7 +70,7 @@ function kmeans!(alg::Elkan, containers, X, k;
7070
J_previous = J
7171
end
7272

73-
@parallelize n_threads ncol sum_of_squares(containers, X, containers.labels, centroids)
73+
@parallelize n_threads ncol sum_of_squares(containers, X, containers.labels, centroids, weights)
7474
totalcost = sum(containers.sum_of_squares)
7575

7676
# Terminate algorithm with the assumption that K-means has converged
@@ -127,7 +127,7 @@ function create_containers(alg::Elkan, X, k, nrow, ncol, n_threads)
127127
)
128128
end
129129

130-
function chunk_initialize(::Elkan, containers, centroids, X, r, idx)
130+
function chunk_initialize(::Elkan, containers, centroids, X, weights, r, idx)
131131
ub = containers.ub
132132
lb = containers.lb
133133
centroids_dist = containers.centroids_dist
@@ -153,9 +153,9 @@ function chunk_initialize(::Elkan, containers, centroids, X, r, idx)
153153
end
154154
ub[i] = min_dist
155155
labels[i] = label
156-
centroids_cnt[label] += one(T)
156+
centroids_cnt[label] += isnothing(weights) ? one(T) : weights[i]
157157
for j in axes(X, 1)
158-
centroids_new[j, label] += X[j, i]
158+
centroids_new[j, label] += isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
159159
end
160160
end
161161
end
@@ -188,7 +188,7 @@ function update_containers(::Elkan, containers, centroids, n_threads)
188188
return centroids_dist
189189
end
190190

191-
function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
191+
function chunk_update_centroids(::Elkan, containers, centroids, X, weights, r, idx)
192192
# unpack
193193
ub = containers.ub
194194
lb = containers.lb
@@ -231,11 +231,11 @@ function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
231231

232232
if label != label_old
233233
labels[i] = label
234-
centroids_cnt[label_old] -= one(T)
235-
centroids_cnt[label] += one(T)
234+
centroids_cnt[label_old] -= isnothing(weights) ? one(T) : weights[i]
235+
centroids_cnt[label] += isnothing(weights) ? one(T) : weights[i]
236236
for j in axes(X, 1)
237-
centroids_new[j, label_old] -= X[j, i]
238-
centroids_new[j, label] += X[j, i]
237+
centroids_new[j, label_old] -= isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
238+
centroids_new[j, label] += isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
239239
end
240240
end
241241
end

0 commit comments

Comments
 (0)