PyDataBlog
diff --git a/‎src/ParallelKMeans.jl
Lines changed: 6 additions & 633 deletions b/‎src/ParallelKMeans.jl
Lines changed: 6 additions & 633 deletions
diff --git a/‎src/kmeans.jl
Lines changed: 196 additions & 0 deletions b/‎src/kmeans.jl
Lines changed: 196 additions & 0 deletions
diff --git a/‎src/light_elkan.jl
Lines changed: 159 additions & 0 deletions b/‎src/light_elkan.jl
Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,196 @@
+
+# All Abstract types defined
+"""
+    AbstractKMeansAlg
+
+Abstract supertype of all k-means algorithm variants.
+
+Concrete algorithm types (for example `LightElkan`) subtype it, so that the internal
+functions taking an `alg` argument can pick an implementation through dispatch.
+"""
+abstract type AbstractKMeansAlg end
+
+
+"""
+    ClusteringResult
+
+Abstract supertype for the result objects returned by clustering algorithms.
+"""
+abstract type ClusteringResult end
+
+
+# Here we mimic `Clustering` output structure
+"""
+    KmeansResult{C,D<:Real,WC<:Real} <: ClusteringResult
+
+The output of [`kmeans`](@ref) and [`kmeans!`](@ref).
+
+# Type parameters
+ * `C<:AbstractMatrix{<:AbstractFloat}`: type of the `centers` matrix, an (abstract)
+   matrix of size (d x k)
+ * `D<:Real`: type of the assignment cost, i.e. of the pairwise distance computation
+   from points to cluster centers
+ * `WC<:Real`: type of the cluster weights, either `Int` (in the case where points are
+   unweighted) or `eltype(weights)` (in the case where points are weighted)
+"""
+struct KmeansResult{C<:AbstractMatrix{<:AbstractFloat},D<:Real,WC<:Real} <: ClusteringResult
+    centers::C                 # cluster centers (d x k)
+    assignments::Vector{Int}   # assignments (n)
+    costs::Vector{D}           # cost of the assignments (n)
+    counts::Vector{Int}        # number of points assigned to each cluster (k)
+    wcounts::Vector{WC}        # cluster weights (k)
+    totalcost::D               # total cost (i.e. objective)
+    iterations::Int            # number of elapsed iterations
+    converged::Bool            # whether the procedure converged
+end
+
+"""
+    sum_of_squares(x, labels, centre)
+
+Return the total squared Euclidean distance between every column of `x` and the
+centroid it is assigned to.
+
+# Arguments
+- `x`: design matrix; each column is one point.
+- `labels`: for column `j` of `x`, `labels[j]` is the column of `centre` it belongs to.
+- `centre`: matrix whose columns are the centroids.
+
+The accumulator starts from `0.0`, so for `Float64` inputs a `Float64` is returned.
+"""
+function sum_of_squares(x, labels, centre)
+    total = 0.0
+
+    @inbounds for col in axes(x, 2)
+        assigned = labels[col]
+        for row in axes(x, 1)
+            delta = x[row, col] - centre[row, assigned]
+            total += delta^2
+        end
+    end
+
+    return total
+end
+
+
+"""
+    kmeans(alg, design_matrix, k; n_threads = Threads.nthreads(), k_init = "k-means++",
+           max_iters = 300, tol = 1e-6, verbose = true, init = nothing)
+
+Cluster the columns of `design_matrix` into `k` groups with the k-means algorithm.
+
+This is the allocating entry point: it builds the algorithm-specific work containers
+with `create_containers` and forwards everything to [`kmeans!`](@ref).
+
+# Arguments
+- `alg`: the algorithm variant used to compute k-means (e.g. `LightElkan()`).
+- `design_matrix`: data matrix; its two dimensions are passed to `create_containers`
+  as `nrow` and `ncol`.
+- `k`: number of clusters.
+
+# Keywords
+- `n_threads`: number of threads used for the calculations, by default
+  `Threads.nthreads()`. For small design matrices it can make sense to set this to 1
+  to avoid the overhead of spawning tasks.
+- `k_init`: name of the centroid initialisation scheme, `"k-means++"` by default. It is
+  only used when `init` is `nothing`.
+- `max_iters`: maximum number of iterations.
+- `tol`: relative tolerance on the change of the objective used for early stopping.
+- `verbose`: when `true`, progress information is printed.
+- `init`: optional initial centroids; when given they are used instead of `k_init`.
+
+# Returns
+A `KmeansResult`. Failure to converge within `max_iters` is reported through its
+`converged` field.
+"""
+function kmeans(alg, design_matrix, k;
+                n_threads = Threads.nthreads(),
+                k_init = "k-means++", max_iters = 300,
+                tol = 1e-6, verbose = true, init = nothing)
+    n_features, n_points = size(design_matrix)
+    workspace = create_containers(alg, k, n_features, n_points, n_threads)
+
+    return kmeans!(alg, workspace, design_matrix, k;
+                   init = init, k_init = k_init, n_threads = n_threads,
+                   max_iters = max_iters, tol = tol, verbose = verbose)
+end
+
+"""
+    kmeans!(alg, containers, design_matrix, k; n_threads = Threads.nthreads(),
+            k_init = "k-means++", max_iters = 300, tol = 1e-6, verbose = true,
+            init = nothing)
+
+Mutating version of the `kmeans` function: the caller supplies the work `containers`,
+which are overwritten during the calculation.
+
+# Arguments
+- `alg`: the algorithm variant, forwarded to `update_containers!` and
+  `update_centroids!`.
+- `containers`: algorithm-specific containers, such as labels and intermediate
+  centroids, which are used during calculations. `containers.labels` ends up in the
+  result as the assignments.
+- `design_matrix`: data matrix to cluster.
+- `k`: number of clusters.
+
+# Keywords
+- `n_threads`: number of threads used for the calculations.
+- `k_init`: initialisation scheme passed to `smart_init` when `init` is `nothing`.
+- `max_iters`: maximum number of iterations.
+- `tol`: the loop stops once, after the first iteration, the objective `J` satisfies
+  `abs(J - J_previous) < tol * J`.
+- `verbose`: when `true`, the objective is printed every iteration together with a
+  final message on convergence.
+- `init`: optional initial centroids; a deep copy is taken so the argument itself is
+  not modified.
+
+# Returns
+A `KmeansResult` holding the centroids, the labels, the total cost, the number of
+iterations actually performed and the convergence flag.
+"""
+function kmeans!(alg, containers, design_matrix, k;
+                n_threads = Threads.nthreads(),
+                k_init = "k-means++", max_iters = 300,
+                tol = 1e-6, verbose = true, init = nothing)
+    # `===` is the identity test; `==` could be overloaded by the type of `init`
+    if init === nothing
+        centroids = smart_init(design_matrix, k, n_threads, init=k_init).centroids
+    else
+        centroids = deepcopy(init)
+    end
+
+    converged = false
+    niters = 1
+    J_previous = 0.0
+
+    # Update centroids & labels with closest members until convergence
+    while niters <= max_iters
+        update_containers!(containers, alg, centroids, n_threads)
+        J = update_centroids!(centroids, containers, alg, design_matrix, n_threads)
+
+        if verbose
+            # Show progress of the objective
+            println("Iteration $niters: Jclust = $J")
+        end
+
+        # Check for convergence (relative change of the objective)
+        if niters > 1 && abs(J - J_previous) < tol * J
+            converged = true
+            break
+        end
+
+        J_previous = J
+        niters += 1
+    end
+
+    # Without convergence the loop exits with niters == max_iters + 1; report the
+    # number of iterations that were actually executed.
+    iterations = min(niters, max_iters)
+
+    totalcost = sum_of_squares(design_matrix, containers.labels, centroids)
+
+    # Terminate algorithm with the assumption that K-means has converged
+    if verbose && converged
+        println("Successfully terminated with convergence.")
+    end
+
+    # TODO empty placeholder vectors should be calculated
+    # TODO Float64 type definitions is too restrictive, should be relaxed
+    # especially during GPU related development
+    return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[],
+                        totalcost, iterations, converged)
+end
+
+"""
+    update_centroids!(centroids, containers, alg, design_matrix, n_threads)
+
+Internal function which recomputes `centroids` in place using the algorithm `alg`.
+
+It wraps `chunk_update_centroids!`: with `n_threads == 1` the whole column range is
+processed in one call; otherwise the columns are split with `splitter`, every chunk but
+the last is processed in a spawned task, the last one in the current task, and the
+per-chunk sums and counts are merged into the first container before dividing.
+
+Returns the accumulated chunk costs divided by the number of columns of
+`design_matrix`.
+"""
+function update_centroids!(centroids, containers, alg, design_matrix, n_threads)
+    ncol = size(design_matrix, 2)
+
+    if n_threads != 1
+        ranges = splitter(ncol, n_threads)
+        n_spawned = length(ranges) - 1
+
+        tasks = Vector{Task}(undef, n_threads - 1)
+        for t in 1:n_spawned
+            # spawned chunks write into containers 2, 3, ...
+            tasks[t] = @spawn chunk_update_centroids!(centroids, containers,
+                alg, design_matrix, ranges[t], t + 1)
+        end
+
+        # the last chunk is handled by the current task, using container 1
+        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, ranges[end], 1)
+        J += sum(fetch.(tasks))
+
+        # merge partial sums and counts into the first container
+        sums = containers.new_centroids[1]
+        counts = containers.centroids_cnt[1]
+        for t in 1:n_spawned
+            sums .+= containers.new_centroids[t + 1]
+            counts .+= containers.centroids_cnt[t + 1]
+        end
+
+        centroids .= sums ./ counts'
+    else
+        whole = axes(design_matrix, 2)
+        J = chunk_update_centroids!(centroids, containers, alg, design_matrix, whole, 0)
+
+        centroids .= containers.new_centroids ./ containers.centroids_cnt'
+    end
+
+    return J/ncol
+end
@@ -0,0 +1,159 @@
+"""
+    LightElkan <: AbstractKMeansAlg
+
+Simplified version of the Elkan algorithm for k-means calculation. This algorithm
+gives the same results as the basic Lloyd algorithm, but improves speed by omitting
+unnecessary calculations. In this implementation two conditions are applied
+to accelerate calculations:
+
+- if a point is sufficiently close to its centroid, i.e. the distance to the centroid is
+  smaller than half the minimum distance from that centroid to all other centroids, the
+  point immediately keeps the label of that centroid.
+- if, during the calculation of a new label, the distance from the point to the current
+  centroid is less than half of the distance from that centroid to another centroid, then
+  the distance from the point to that other centroid is not calculated.
+
+One has to take into account that the LightElkan algorithm has the overhead of calculating
+the k x k matrix of centroid distances, so for tasks with no apparent cluster structure it
+may perform worse than the basic Lloyd algorithm.
+"""
+struct LightElkan <: AbstractKMeansAlg end
+
+"""
+    create_containers(::LightElkan, k, nrow, ncol, n_threads)
+
+Internal function which allocates all intermediate structures needed by `LightElkan`.
+
+The returned named tuple has the fields
+
+- `new_centroids` - container which holds new positions of centroids (`nrow x k`)
+- `centroids_cnt` - container which holds number of points for each centroid (length `k`)
+- `labels` - vector of length `ncol` which holds labels of corresponding points,
+  initialised with zeros
+- `centroids_dist` - matrix `k x k` which holds weighted distances between centroids
+
+When `n_threads == 1`, `new_centroids` and `centroids_cnt` are a single matrix and a
+single vector; otherwise they are vectors holding one such buffer per thread. Except
+for `labels`, the buffers are allocated uninitialised.
+"""
+function create_containers(alg::LightElkan, k, nrow, ncol, n_threads)
+    if n_threads == 1
+        new_centroids = Matrix{Float64}(undef, nrow, k)
+        centroids_cnt = Vector{Int}(undef, k)
+    else
+        # one private buffer per thread
+        new_centroids = Vector{Matrix{Float64}}(undef, n_threads)
+        centroids_cnt = Vector{Vector{Int}}(undef, n_threads)
+
+        for t in eachindex(new_centroids)
+            new_centroids[t] = Matrix{Float64}(undef, nrow, k)
+            centroids_cnt[t] = Vector{Int}(undef, k)
+        end
+    end
+
+    return (
+        new_centroids = new_centroids,
+        centroids_cnt = centroids_cnt,
+        labels = zeros(Int, ncol),
+        centroids_dist = Matrix{Float64}(undef, k, k),
+    )
+end
+
+
+"""
+    update_containers!(containers, ::LightElkan, centroids, n_threads)
+
+Internal function for the `LightElkan` algorithm which refreshes
+`containers.centroids_dist` from the current `centroids`.
+
+Off-diagonal entries hold the squared Euclidean distance between the two centroids
+(written symmetrically); each diagonal entry holds the smallest such distance from
+that centroid to any other one. Finally, every entry is multiplied by 0.25 in order to
+simplify the following `update_centroids!` calculations.
+
+The updated distance matrix is returned. `n_threads` is not used by this method.
+"""
+function update_containers!(containers, ::LightElkan, centroids, n_threads)
+    dists = containers.centroids_dist
+    n_clusters = size(dists, 1)
+
+    @inbounds for a in axes(dists, 2)
+        nearest = Inf
+
+        # pairs not seen yet: compute and store on both sides of the diagonal
+        for b in (a + 1):n_clusters
+            sq = 0.0
+            for m in axes(centroids, 1)
+                sq += (centroids[m, b] - centroids[m, a])^2
+            end
+            dists[b, a] = sq
+            dists[a, b] = sq
+            if !(nearest < sq)
+                nearest = sq
+            end
+        end
+
+        # pairs already stored while processing earlier columns
+        for b in 1:(a - 1)
+            if !(nearest < dists[a, b])
+                nearest = dists[a, b]
+            end
+        end
+
+        dists[a, a] = nearest
+    end
+
+    # TODO: one should be careful here. The inequality holds for the Euclidean metric,
+    # not the squared Euclidean one. So, for an Lp norm it should be something like
+    # centroids_dist = 0.5^p. Should check the original paper one more time.
+    dists .*= 0.25
+
+    return dists
+end
+
+"""
+    chunk_update_centroids!(centroids, containers, ::LightElkan, design_matrix, r, idx)
+
+Internal function which performs a single centroid update step for the chunk of
+columns `r` of `design_matrix`.
+
+For every column in `r` the label is re-evaluated against `centroids`, using
+`containers.centroids_dist` to skip distance computations, and written to
+`containers.labels`. The per-cluster coordinate sums and point counts of the chunk are
+stored in the `new_centroids` / `centroids_cnt` buffers, which are reset to zero first.
+
+Argument `idx` selects the buffer to write into; if it equals 0 it means that we are
+in single thread mode and the containers are used directly.
+
+Returns the sum over the chunk of the squared distance from each point to its
+selected centroid.
+"""
+function chunk_update_centroids!(centroids, containers, ::LightElkan,
+    design_matrix, r, idx)
+
+    # pick the accumulation buffers belonging to this chunk
+    single_threaded = idx == 0
+    new_centroids = single_threaded ? containers.new_centroids : containers.new_centroids[idx]
+    centroids_cnt = single_threaded ? containers.centroids_cnt : containers.centroids_cnt[idx]
+    centroids_dist = containers.centroids_dist
+    labels = containers.labels
+
+    fill!(new_centroids, 0.0)
+    fill!(centroids_cnt, 0)
+
+    J = 0.0
+    @inbounds for p in r
+        # start from the previous label (first centroid when no label is set yet)
+        start = labels[p] > 0 ? labels[p] : 1
+        best = start
+        best_dist = 0.0
+        for f in axes(design_matrix, 1)
+            best_dist += (design_matrix[f, p] - centroids[f, start])^2
+        end
+
+        # Two shortcuts are available:
+        # - a point closer to its centroid than centroids_dist[start, start] keeps
+        #   its label without any further computation;
+        # - otherwise, centroids that are too far from the current best centroid are
+        #   skipped (Elkan triangular inequality).
+        if best_dist > centroids_dist[start, start]
+            for c in axes(centroids, 2)
+                c == start && continue
+                # triangular inequality
+                centroids_dist[c, best] > best_dist && continue
+
+                candidate = 0.0
+                for f in axes(design_matrix, 1)
+                    # TODO: we can break this calculation if the distance is already
+                    # larger than best_dist
+                    candidate += (design_matrix[f, p] - centroids[f, c])^2
+                end
+
+                if candidate < best_dist
+                    best = c
+                    best_dist = candidate
+                end
+            end
+        end
+
+        labels[p] = best
+        centroids_cnt[best] += 1
+        for f in axes(design_matrix, 1)
+            new_centroids[f, best] += design_matrix[f, p]
+        end
+        J += best_dist
+    end
+
+    return J
+end