Skip to content

Commit 0055088

Browse files
authored
Merge pull request #29 from Arkoniak/todo_doc_cleanup
refactored code and added doc strings
2 parents 64e20f9 + 81f6898 commit 0055088

File tree

9 files changed

+587
-677
lines changed

9 files changed

+587
-677
lines changed

src/ParallelKMeans.jl

Lines changed: 6 additions & 633 deletions
Large diffs are not rendered by default.

src/kmeans.jl

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
2+
# All Abstract types defined
"""
    AbstractKMeansAlg

Abstract base type from which every concrete k-means algorithm type derives.
Concrete subtypes are used as the `alg` argument to select algorithm-specific
methods by dispatch.
"""
abstract type AbstractKMeansAlg end
9+
10+
"""
    ClusteringResult

Abstract base type for the output of a clustering algorithm.
"""
abstract type ClusteringResult end
17+
18+
19+
# Here we mimic `Clustering` output structure
"""
    KmeansResult{C,D<:Real,WC<:Real} <: ClusteringResult

The output of [`kmeans`](@ref) and [`kmeans!`](@ref).

# Type parameters
 * `C<:AbstractMatrix{<:AbstractFloat}`: type of the `centers` matrix, an
   (abstract) matrix of size `d x k`
 * `D<:Real`: type of the assignment cost, i.e. of the distance computed from
   points to cluster centers
 * `WC<:Real`: type of the cluster weights; intended to be `Int` when points are
   unweighted or `eltype(weights)` when points are weighted

# Fields
 * `centers`: cluster centers (`d x k`)
 * `assignments`: cluster index assigned to each point (`n`)
 * `costs`: cost of each assignment (`n`)
 * `counts`: number of points assigned to each cluster (`k`)
 * `wcounts`: cluster weights (`k`)
 * `totalcost`: total cost (i.e. the objective)
 * `iterations`: number of elapsed iterations
 * `converged`: whether the procedure converged
"""
struct KmeansResult{C<:AbstractMatrix{<:AbstractFloat},D<:Real,WC<:Real} <: ClusteringResult
    centers::C                 # cluster centers (d x k)
    assignments::Vector{Int}   # assignments (n)
    costs::Vector{D}           # cost of the assignments (n)
    counts::Vector{Int}        # number of points assigned to each cluster (k)
    wcounts::Vector{WC}        # cluster weights (k)
    totalcost::D               # total cost (i.e. objective)
    iterations::Int            # number of elapsed iterations
    converged::Bool            # whether the procedure converged
end
43+
"""
    sum_of_squares(x, labels, centre)

Compute the total sum of squared differences between every column of the design
matrix `x` and the column of `centre` selected by that point's entry in `labels`.

# Arguments
- `x`: design matrix; each column is one point.
- `labels`: for column `j` of `x`, `labels[j]` is the column of `centre` the
  point is assigned to.
- `centre`: matrix of centroids; each column is one centroid.

# Returns
The accumulated sum. The accumulator starts as the `Float64` value `0.0`.

# Throws
- `BoundsError` if `labels` has no entry for some column of `x`, or if a label
  (or the row range of `x`) does not address a valid part of `centre`.
"""
function sum_of_squares(x, labels, centre)
    s = 0.0

    for j in axes(x, 2)
        c = labels[j]
        # Validate the centroid column once per point so that the inner loop can
        # safely skip bounds checks; labels are data, not loop-derived indices.
        checkbounds(centre, axes(x, 1), c)
        @inbounds for i in axes(x, 1)
            s += (x[i, j] - centre[i, c])^2
        end
    end

    return s
end
63+
64+
"""
    kmeans(alg, design_matrix, k; n_threads = Threads.nthreads(), k_init = "k-means++", max_iters = 300, tol = 1e-6, verbose = true, init = nothing)

Cluster the columns of `design_matrix` into `k` groups with the k-means
algorithm selected by `alg`.

The function reads the first two dimensions of `design_matrix`, allocates the
algorithm-specific work buffers with `create_containers`, and forwards
everything to `kmeans!`, whose result is returned unchanged.

# Arguments
- `alg`: algorithm selector passed to `create_containers` and `kmeans!`.
  NOTE(review): earlier documentation states that `alg` may be omitted (Lloyd
  being the default); such a convenience method is not part of this definition.
- `design_matrix`: the data to cluster.
- `k`: number of clusters.

# Keywords
- `n_threads`: number of threads used for the calculation; defaults to
  `Threads.nthreads()`. For small design matrices it can make sense to set it
  to 1 to avoid the overhead of spawning tasks.
- `k_init`: name of the initialisation method, `"k-means++"` by default
  (interpreted by the initialisation routine called from `kmeans!`).
- `max_iters`: maximum number of iterations.
- `tol`: tolerance used for early stopping.
- `verbose`: whether progress information is printed.
- `init`: optional initial centroids; when `nothing` they are generated
  according to `k_init`.

# Returns
Whatever `kmeans!` returns (a `KmeansResult`).
"""
function kmeans(alg, design_matrix, k;
                n_threads = Threads.nthreads(),
                k_init = "k-means++", max_iters = 300,
                tol = 1e-6, verbose = true, init = nothing)
    dims = size(design_matrix)
    workspace = create_containers(alg, k, dims[1], dims[2], n_threads)

    return kmeans!(alg, workspace, design_matrix, k;
                   n_threads = n_threads, k_init = k_init,
                   max_iters = max_iters, tol = tol,
                   verbose = verbose, init = init)
end
103+
"""
    kmeans!(alg, containers, design_matrix, k; n_threads = Threads.nthreads(), k_init = "k-means++", max_iters = 300, tol = 1e-6, verbose = true, init = nothing)

Mutating version of `kmeans`: runs the clustering loop using the caller-supplied
work buffers in `containers`. The meaning of the remaining arguments is the
same as for `kmeans`.

`containers` holds the algorithm-specific intermediate structures (labels,
intermediate centroids and so on) that are overwritten during the calculation;
`containers.labels` becomes the `assignments` of the result.

Initial centroids are a deep copy of `init` when it is given, otherwise they are
taken from `smart_init(design_matrix, k, n_threads, init = k_init)`.

Each iteration calls `update_containers!` and then `update_centroids!`, which
returns the objective `J`. From the second iteration on, the loop stops as soon
as `abs(J - J_previous) < tol * J`; in that case the result is flagged as
converged. Running out of `max_iters` is not an error: the result is returned
with `converged == false`.

# Returns
A `KmeansResult` with the centroids, the labels, the total cost computed by
`sum_of_squares`, the number of iterations actually performed and the
convergence flag. The `costs`, `counts` and `wcounts` fields are currently empty
placeholders.
"""
function kmeans!(alg, containers, design_matrix, k;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
                 tol = 1e-6, verbose = true, init = nothing)
    centroids = if init === nothing
        smart_init(design_matrix, k, n_threads, init = k_init).centroids
    else
        deepcopy(init)
    end

    converged = false
    niters = 0          # number of iterations performed so far
    J_previous = 0.0

    # Update centroids & labels with closest members until convergence
    while niters < max_iters
        niters += 1
        update_containers!(containers, alg, centroids, n_threads)
        J = update_centroids!(centroids, containers, alg, design_matrix, n_threads)

        if verbose
            # Show progress of the objective.
            println("Iteration $niters: Jclust = $J")
        end

        # Check for convergence; the first iteration has no previous objective
        # to compare against.
        if niters > 1 && abs(J - J_previous) < tol * J
            converged = true
            break
        end

        J_previous = J
    end

    totalcost = sum_of_squares(design_matrix, containers.labels, centroids)

    if verbose && converged
        println("Successfully terminated with convergence.")
    end

    # TODO empty placeholder vectors should be calculated
    # TODO Float64 type definitions is too restrictive, should be relaxed
    # especially during GPU related development
    return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[],
                        totalcost, niters, converged)
end
157+
"""
    update_centroids!(centroids, containers, alg, design_matrix, n_threads)

Internal function which performs one centroid update using the algorithm
selected by `alg`. It wraps `chunk_update_centroids!`, splitting the columns of
`design_matrix` into chunks and combining the partial results.

With `n_threads == 1` the whole column range is processed in a single call
(chunk index `0`), and `containers.new_centroids` / `containers.centroids_cnt`
are used directly. Otherwise the column ranges come from
`splitter(ncol, n_threads)`; all but the last range are processed in spawned
tasks writing to buffers `2, 3, ...`, the last range is processed by the calling
task into buffer `1`, and the per-task buffers are then accumulated into
buffer `1`.

`centroids` is overwritten with the accumulated sums divided element-wise by
the per-cluster counts.
NOTE(review): a cluster whose count is zero leads to a division by zero here.

# Returns
The sum of the values returned by the chunk updates, divided by the number of
columns of `design_matrix`.
"""
function update_centroids!(centroids, containers, alg, design_matrix, n_threads)
    ncol = size(design_matrix, 2)

    if n_threads == 1
        r = axes(design_matrix, 2)
        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, r, 0)

        centroids .= containers.new_centroids ./ containers.centroids_cnt'
    else
        ranges = splitter(ncol, n_threads)

        # Size the task list from the number of chunks actually produced, so
        # that it always agrees with the loops below and never contains
        # unassigned slots.
        nspawned = length(ranges) - 1
        waiting_list = Vector{Task}(undef, nspawned)

        for i in 1:nspawned
            waiting_list[i] = @spawn chunk_update_centroids!(centroids, containers,
                alg, design_matrix, ranges[i], i + 1)
        end

        # The calling task handles the last chunk itself, using buffer 1.
        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, ranges[end], 1)

        if !isempty(waiting_list)
            J += sum(fetch.(waiting_list))
        end

        # Fold the per-task partial sums and counts into buffer 1.
        for i in 1:nspawned
            containers.new_centroids[1] .+= containers.new_centroids[i + 1]
            containers.centroids_cnt[1] .+= containers.centroids_cnt[i + 1]
        end

        centroids .= containers.new_centroids[1] ./ containers.centroids_cnt[1]'
    end

    return J/ncol
end

src/light_elkan.jl

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
"""
    LightElkan <: AbstractKMeansAlg

Simplified version of Elkan's algorithm for k-means calculation. This algorithm
gives the same results as the basic Lloyd algorithm, but improves speed by
omitting unnecessary calculations. In this implementation two conditions are
applied to accelerate the calculations:

- if a point is sufficiently close to its centroid, i.e. the distance to the
  centroid is smaller than half of the minimum distance from that centroid to
  all other centroids, the point immediately gets the label of that centroid;
- if, during the calculation of a new label, the distance from the point to the
  current centroid is less than half of the distance from that centroid to
  another centroid, then the distance from the point to that other centroid is
  not calculated.

One has to take into account that the `LightElkan` algorithm has the overhead of
calculating a `k x k` matrix of distances between centroids, so for tasks with
no apparent cluster structure it may perform worse than the basic Lloyd
algorithm.
"""
struct LightElkan <: AbstractKMeansAlg end
21+
"""
    create_containers(::LightElkan, k, nrow, ncol, n_threads)

Internal function which allocates all intermediate structures needed by the
`LightElkan` algorithm and returns them as a named tuple:

- `new_centroids` - accumulator for the new centroid positions (`nrow x k`)
- `centroids_cnt` - number of points per centroid (length `k`)
- `labels` - label of every point (length `ncol`), initialised with zeros
- `centroids_dist` - `k x k` matrix for the weighted distances between centroids

When `n_threads == 1`, `new_centroids` and `centroids_cnt` are a single matrix
and a single vector. Otherwise they are vectors holding `n_threads` such
buffers, one per chunk. All buffers except `labels` are left uninitialised.
"""
function create_containers(alg::LightElkan, k, nrow, ncol, n_threads)
    centroids_dist = Matrix{Float64}(undef, k, k)
    labels = fill(0, ncol)

    if n_threads == 1
        new_centroids = Matrix{Float64}(undef, nrow, k)
        centroids_cnt = Vector{Int}(undef, k)
    else
        # one private accumulator per chunk
        new_centroids = Vector{Matrix{Float64}}(undef, n_threads)
        centroids_cnt = Vector{Vector{Int}}(undef, n_threads)

        for t in eachindex(new_centroids)
            new_centroids[t] = Matrix{Float64}(undef, nrow, k)
            centroids_cnt[t] = Vector{Int}(undef, k)
        end
    end

    return (new_centroids = new_centroids, centroids_cnt = centroids_cnt,
            labels = labels, centroids_dist = centroids_dist)
end
53+
54+
"""
    update_containers!(containers, ::LightElkan, centroids, n_threads)

Internal function of the `LightElkan` algorithm which refreshes
`containers.centroids_dist`, the matrix of distances between centroids.

Off-diagonal entry `[i, j]` receives the squared Euclidean distance between
centroids `i` and `j` (the matrix is symmetric); diagonal entry `[j, j]`
receives the smallest distance from centroid `j` to any other centroid (`Inf`
when there is no other centroid). Finally every entry is multiplied by `0.25`,
which simplifies the comparisons done later in the centroid update.

`n_threads` is not used by this method. The updated matrix is returned.
"""
function update_containers!(containers, ::LightElkan, centroids, n_threads)
    dists = containers.centroids_dist
    nclusters = size(dists, 1)

    @inbounds for a in axes(dists, 2)
        nearest = Inf

        # pairs below the diagonal: compute and mirror
        for b in (a + 1):nclusters
            acc = 0.0
            for row in axes(centroids, 1)
                acc += (centroids[row, b] - centroids[row, a])^2
            end
            dists[b, a] = acc
            dists[a, b] = acc
            if !(nearest < acc)
                nearest = acc
            end
        end

        # pairs above the diagonal were already filled while visiting earlier columns
        for b in 1:(a - 1)
            if !(nearest < dists[a, b])
                nearest = dists[a, b]
            end
        end

        dists[a, a] = nearest
    end

    # TODO: the half-distance inequality holds for the Euclidean metric, not for
    # its square, hence the factor 0.5^2 here; for an Lp norm it should be
    # something like 0.5^p. Should be checked once more against the original paper.
    dists .*= 0.25

    return dists
end
93+
"""
    chunk_update_centroids!(centroids, containers, ::LightElkan, design_matrix, r, idx)

Internal function which performs a single centroid update for the chunk of
columns `r` of `design_matrix`.

`idx` selects the accumulation buffers: `0` means single-thread mode, where
`containers.new_centroids` and `containers.centroids_cnt` are used directly;
any other value selects element `idx` of those containers. The selected buffers
are reset to zero first. Then, for every point in `r`, a label is chosen, stored
in `containers.labels`, counted in the buffer and the point is added to the
corresponding column of the accumulator.

# Returns
The sum, over the points in `r`, of the squared distance from each point to the
centroid it was assigned to.
"""
function chunk_update_centroids!(centroids, containers, ::LightElkan,
                                 design_matrix, r, idx)
    # pick the accumulation buffers for this chunk
    new_centroids = idx == 0 ? containers.new_centroids : containers.new_centroids[idx]
    centroids_cnt = idx == 0 ? containers.centroids_cnt : containers.centroids_cnt[idx]
    pairwise = containers.centroids_dist
    labels = containers.labels

    fill!(new_centroids, 0.0)
    fill!(centroids_cnt, 0)

    J = 0.0
    @inbounds for col in r
        # start from the previous label; an unset (non-positive) label falls back to 1
        start = labels[col] > 0 ? labels[col] : 1
        best = start

        best_dist = 0.0
        for row in axes(design_matrix, 1)
            best_dist += (design_matrix[row, col] - centroids[row, start])^2
        end

        # Two shortcuts are possible:
        # - if the point is closer to its centroid than pairwise[start, start],
        #   nothing has to be recalculated;
        # - otherwise candidates which are too far away from the currently best
        #   centroid can be skipped (Elkan triangle inequality).
        if best_dist > pairwise[start, start]
            for cand in axes(centroids, 2)
                cand == start && continue
                # triangle inequality
                pairwise[cand, best] > best_dist && continue

                d = 0.0
                for row in axes(design_matrix, 1)
                    # TODO: this accumulation could stop as soon as `d` exceeds `best_dist`
                    d += (design_matrix[row, col] - centroids[row, cand])^2
                end

                if best_dist > d
                    best = cand
                    best_dist = d
                end
            end
        end

        labels[col] = best
        centroids_cnt[best] += 1
        for row in axes(design_matrix, 1)
            new_centroids[row, best] += design_matrix[row, col]
        end
        J += best_dist
    end

    return J
end

0 commit comments

Comments
 (0)