PyDataBlog
diff --git a/‎Project.toml
+1-1 b/‎Project.toml
+1-1
diff --git a/‎docs/src/index.md
+5-2 b/‎docs/src/index.md
+5-2
diff --git a/‎src/ParallelKMeans.jl
+3-2 b/‎src/ParallelKMeans.jl
+3-2
diff --git a/‎src/coreset.jl
+206 b/‎src/coreset.jl
+206
diff --git a/‎src/elkan.jl
+13-13 b/‎src/elkan.jl
+13-13
@@ -1,7 +1,7 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "0.1.5"
+version = "0.1.6"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 
@@ -72,15 +72,16 @@ git checkout experimental
 - [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
 - [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
 - [X] Full Implementation of Triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
-- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
+- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
+- [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
+- [X] Support for weighted K-means.
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
 - [ ] Support of MLJ Random generation hyperparameter.
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
 - [ ] Refactoring and finalization of API design.
 - [ ] GPU support.
 - [ ] Distributed calculations support.
-- [ ] Implementation of other K-Means algorithm variants based on recent literature.
 - [ ] Optimization of code base.
 - [ ] Improved Documentation
 - [ ] More benchmark tests.
@@ -123,6 +124,7 @@ r.converged             # whether the procedure converged
 - [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster) - Hamerly is good for moderate number of clusters (< 50?) and moderate dimensions (<100?).
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
+- [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
 - [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
 
@@ -204,6 +206,7 @@ ________________________________________________________________________________
 - 0.1.3 Faster & optimized execution.
 - 0.1.4 Bug fixes.
 - 0.1.5 Added `Yinyang` algorithm.
+- 0.1.6 Added support for weighted k-means; Added `Coreset` algorithm; improved support for different types of the design matrix.
 
 ## Contributing
 
 
@@ -7,15 +7,16 @@ import Distances
 
 const MMI = MLJModelInterface
 
-include("seeding.jl")
 include("kmeans.jl")
+include("seeding.jl")
 include("lloyd.jl")
 include("hamerly.jl")
 include("elkan.jl")
 include("yinyang.jl")
 include("mlj_interface.jl")
+include("coreset.jl")
 
 export kmeans
-export Lloyd, Hamerly, Elkan, Yinyang
+export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
 
 end # module
@@ -0,0 +1,206 @@
+"""
+    Coreset(; m = 100, alg = Lloyd())
+
+Approximate k-means that clusters a weighted subsample (a coreset) of the data
+instead of the full data set. Based on Lucic, Mario & Bachem, Olivier & Krause,
+Andreas (2015), "Strong Coresets for Hard and Soft Bregman Clustering with
+Applications to Exponential Family Mixtures."
+
+# Fields
+- `m`: size of the subsample, 100 by default.
+- `alg`: algorithm used to cluster the subsample, `Lloyd()` by default.
+
+# Examples
+
+`Coreset` can be passed directly to the `kmeans` function:
+
+```julia
+X = rand(30, 100_000)   # 100_000 random points in 30 dimensions
+
+# 3 clusters, default settings: Lloyd algorithm on a sample of 100 points
+kmeans(Coreset(), X, 3)
+
+# 3 clusters, Hamerly algorithm on a sample of 500 points
+kmeans(Coreset(m = 500, alg = Hamerly()), X, 3)
+kmeans(Coreset(500, Hamerly()), X, 3)
+
+# short forms that set only the sample size or only the algorithm
+kmeans(Coreset(500), X, 3)        # sample of 500 points, Lloyd algorithm
+kmeans(Coreset(Hamerly()), X, 3)  # sample of 100 points, Hamerly algorithm
+```
+"""
+struct Coreset{A <: AbstractKMeansAlg} <: AbstractKMeansAlg
+    m::Int
+    alg::A
+end
+
+# Keyword form: both the sample size and the inner algorithm are optional.
+function Coreset(; m = 100, alg = Lloyd())
+    return Coreset(m, alg)
+end
+
+# Positional shortcuts that set only the inner algorithm or only the sample size.
+Coreset(alg::AbstractKMeansAlg) = Coreset(100, alg)
+Coreset(m::Int) = Coreset(m, Lloyd())
+
+# Coreset k-means driver.
+#
+# Assigns every column of `X` to its closest initial centroid, derives a sampling
+# sensitivity for each column, draws `alg.m` columns with `wsample`, clusters that
+# sample with `alg.alg`, and finally labels all of `X` against the fitted centers.
+#
+# `containers` holds the working buffers (see `create_containers` for `Coreset`).
+# `weights` is either `nothing` or an indexable collection with one entry per column
+# of `X`. The remaining keywords are forwarded to the seeding and to the inner `kmeans`.
+#
+# Returns a `KmeansResult` built from the centers fitted on the sample, the labels of
+# all columns of `X`, and the total cost summed over the per-thread partial costs.
+function kmeans!(alg::Coreset, containers, X, k, weights;
+                n_threads = Threads.nthreads(),
+                k_init = "k-means++", max_iters = 300,
+                tol = eltype(X)(1e-6), verbose = false, init = nothing)
+    nrow, ncol = size(X)
+    # `weights` is forwarded to the seeding, matching the other algorithms
+    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
+
+    T = eltype(X)
+    # Steps 2-4 of the paper's algorithm 3
+    # We distribute points over the centers and calculate weights of each cluster
+    @parallelize n_threads ncol chunk_fit(alg, containers, centroids, X, weights)
+
+    # merge the per-thread accumulators; after this step containers.centroids_const
+    # holds the per-cluster part of the sensitivity
+    collect_containers(alg, containers, n_threads)
+
+    # step 7 of the algorithm 3: finish the per-point sensitivities in containers.s
+    @parallelize n_threads ncol chunk_update_sensitivity(alg, containers)
+
+    # sample alg.m column indices, using containers.s as sampling weights
+    coreset_ids = wsample(1:ncol, containers.s, alg.m)
+    coreset = X[:, coreset_ids]
+    # create new weights as 1/s[i]
+    coreset_weights = one(T) ./ @view containers.s[coreset_ids]
+
+    # run usual kmeans for new set with new weights.
+    res = kmeans(alg.alg, coreset, k, coreset_weights, tol = tol, max_iters = max_iters,
+        verbose = verbose, init = centroids, n_threads = n_threads)
+
+    # label every column of X against the fitted centers and accumulate the cost
+    @parallelize n_threads ncol chunk_apply(alg, containers, res.centers, X, weights)
+
+    totalcost = sum(containers.totalcost)
+
+    return KmeansResult(res.centers, containers.labels, T[], Int[], T[], totalcost, res.iterations, res.converged)
+end
+
+# Allocate the working buffers for the `Coreset` algorithm.
+#
+# Returns a named tuple with:
+# - `centroids_cnt`, `centroids_dist`: one zeroed length-`k` accumulator per thread
+# - `s`: per-point sensitivity, length `ncol` (uninitialized)
+# - `labels`: per-point cluster index, length `ncol` (uninitialized)
+# - `totalcost`: per-thread partial cost, length `n_threads` (uninitialized)
+# - `J`: per-thread partial sum, same as $c_\phi$ in the paper (uninitialized)
+# - `centroids_const`: per-cluster sensitivity term, length `k` (uninitialized)
+# - `alpha`: the constant `16 * (log(k) + 2)`
+function create_containers(alg::Coreset, X, k, nrow, ncol, n_threads)
+    T = eltype(X)
+
+    per_thread_cnt = Vector{T}[zeros(T, k) for _ in 1:n_threads]
+    per_thread_dist = Vector{T}[zeros(T, k) for _ in 1:n_threads]
+
+    return (
+        centroids_cnt = per_thread_cnt,
+        centroids_dist = per_thread_dist,
+        s = Vector{T}(undef, ncol),
+        labels = Vector{Int}(undef, ncol),
+        totalcost = Vector{T}(undef, n_threads),
+        J = Vector{T}(undef, n_threads),
+        centroids_const = Vector{T}(undef, k),
+        alpha = 16 * (log(k) + 2)
+    )
+end
+
+# First pass over the columns in `r`: find the closest centroid of each point, record
+# its label and distance, and accumulate this chunk's per-cluster totals in slot `idx`.
+function chunk_fit(alg::Coreset, containers, centroids, X, weights, r, idx)
+    T = eltype(X)
+    cnt_acc = containers.centroids_cnt[idx]
+    dist_acc = containers.centroids_dist[idx]
+    assigned = containers.labels
+    sens = containers.s
+
+    chunk_sum = zero(T)
+    for i in r
+        # closest center (steps 2-3 of the paper's algorithm 3)
+        best_dist = distance(X, centroids, i, 1)
+        best = 1
+        for j in 2:size(centroids, 2)
+            candidate = distance(X, centroids, i, j)
+            if candidate < best_dist
+                best = j
+                best_dist = candidate
+            end
+        end
+        assigned[i] = best
+
+        # accumulation for $c_\phi$ (step 4)
+        # Note: $d_A(x', B) = min_{b \in B} d_A(x', b)$
+        # The handling of `weights` here still needs further investigation;
+        # with `weights === nothing` (the default) it works as intended.
+        if isnothing(weights)
+            cnt_acc[best] += one(T)
+            dist_acc[best] += best_dist
+        else
+            cnt_acc[best] += weights[i]
+            dist_acc[best] += weights[i] * best_dist
+        end
+        chunk_sum += best_dist
+
+        # store the plain distance for now; the sensitivity is completed later
+        sens[i] = best_dist
+    end
+
+    containers.J[idx] = chunk_sum
+end
+
+# Merge the per-thread accumulators into slot 1 and fill `containers.centroids_const`
+# with the per-cluster part of the sensitivity.
+function collect_containers(::Coreset, containers, n_threads)
+    # This is the formula of step 6, rewritten.
+    # Multiplying both sides of the equation by $c_\phi / \alpha$ gives
+    # $s(x) <- d_A(x, B) + 2 \sum d_A(x, B) / |B_i| + 4 c_\phi |\Chi| / (|B_i| * \alpha)$
+    # Since $c_\phi = 1/|\Chi| \sum d_A(x', B) = J / |\Chi|$ this becomes
+    # $s(x) <- d_A(x, B) + 2 \sum d_A(x, B) / |B_i| + 4 J / \alpha * 1/ |B_i|$
+
+    cnt_total = containers.centroids_cnt[1]
+    dist_total = containers.centroids_dist[1]
+    cost = containers.J[1]
+
+    # slot 1 is updated in place with the contributions of the other threads
+    @inbounds for t in 2:n_threads
+        cnt_total .+= containers.centroids_cnt[t]
+        dist_total .+= containers.centroids_dist[t]
+        cost += containers.J[t]
+    end
+
+    shared_term = 4 * cost / containers.alpha
+    cluster_const = containers.centroids_const
+
+    for c in eachindex(dist_total)
+        cluster_const[c] = 2 * dist_total[c] / cnt_total[c] +
+            shared_term / cnt_total[c]
+    end
+end
+
+# Complete the sensitivity of every point in `r` by adding the constant of the
+# cluster the point is assigned to.
+function chunk_update_sensitivity(alg::Coreset, containers, r, idx)
+    sens = containers.s
+    assigned = containers.labels
+    cluster_const = containers.centroids_const
+
+    @inbounds for i in r
+        sens[i] = sens[i] + cluster_const[assigned[i]]
+    end
+end
+
+# Final pass over the columns in `r`: label each point with its closest centroid and
+# store this chunk's cost in `containers.totalcost[idx]`. The cost of a point is its
+# distance to the closest centroid, multiplied by `weights[i]` unless `weights` is
+# `nothing`.
+function chunk_apply(alg::Coreset, containers, centroids, X, weights, r, idx)
+    labels = containers.labels
+    T = eltype(X)
+
+    J = zero(T)
+    for i in r
+        dist = distance(X, centroids, i, 1)
+        label = 1
+        for j in 2:size(centroids, 2)
+            new_dist = distance(X, centroids, i, j)
+
+            # keep the closest center seen so far
+            label = new_dist < dist ? j : label
+            dist = new_dist < dist ? new_dist : dist
+        end
+        labels[i] = label
+        J += isnothing(weights) ? dist : weights[i] * dist
+    end
+
+    containers.totalcost[idx] = J
+end
@@ -18,15 +18,15 @@ kmeans(Elkan(), X, 3) # 3 clusters, Elkan algorithm
 """
 struct Elkan <: AbstractKMeansAlg end
 
-function kmeans!(alg::Elkan, containers, X, k;
+function kmeans!(alg::Elkan, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
                 tol = eltype(X)(1e-6), verbose = false, init = nothing)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
 
     update_containers(alg, containers, centroids, n_threads)
-    @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X)
+    @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
 
     T = eltype(X)
     converged = false
@@ -37,7 +37,7 @@ function kmeans!(alg::Elkan, containers, X, k;
     while niters < max_iters
         niters += 1
         # Core iteration
-        @parallelize n_threads ncol chunk_update_centroids(alg, containers, centroids, X)
+        @parallelize n_threads ncol chunk_update_centroids(alg, containers, centroids, X, weights)
 
         # Collect distributed containers (such as centroids_new, centroids_cnt)
         # in paper it is step 4
@@ -70,7 +70,7 @@ function kmeans!(alg::Elkan, containers, X, k;
         J_previous = J
     end
 
-    @parallelize n_threads ncol sum_of_squares(containers, X, containers.labels, centroids)
+    @parallelize n_threads ncol sum_of_squares(containers, X, containers.labels, centroids, weights)
     totalcost = sum(containers.sum_of_squares)
 
     # Terminate algorithm with the assumption that K-means has converged
@@ -127,7 +127,7 @@ function create_containers(alg::Elkan, X, k, nrow, ncol, n_threads)
     )
 end
 
-function chunk_initialize(::Elkan, containers, centroids, X, r, idx)
+function chunk_initialize(::Elkan, containers, centroids, X, weights, r, idx)
     ub = containers.ub
     lb = containers.lb
     centroids_dist = containers.centroids_dist
@@ -153,9 +153,9 @@ function chunk_initialize(::Elkan, containers, centroids, X, r, idx)
         end
         ub[i] = min_dist
         labels[i] = label
-        centroids_cnt[label] += one(T)
+        centroids_cnt[label] += isnothing(weights) ? one(T) : weights[i]
         for j in axes(X, 1)
-            centroids_new[j, label] += X[j, i]
+            centroids_new[j, label] += isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
         end
     end
 end
@@ -188,7 +188,7 @@ function update_containers(::Elkan, containers, centroids, n_threads)
     return centroids_dist
 end
 
-function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
+function chunk_update_centroids(::Elkan, containers, centroids, X, weights, r, idx)
     # unpack
     ub = containers.ub
     lb = containers.lb
@@ -231,11 +231,11 @@ function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
 
         if label != label_old
             labels[i] = label
-            centroids_cnt[label_old] -= one(T)
-            centroids_cnt[label] += one(T)
+            centroids_cnt[label_old] -= isnothing(weights) ? one(T) : weights[i]
+            centroids_cnt[label] += isnothing(weights) ? one(T) : weights[i]
             for j in axes(X, 1)
-                centroids_new[j, label_old] -= X[j, i]
-                centroids_new[j, label] += X[j, i]
+                centroids_new[j, label_old] -= isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
+                centroids_new[j, label] += isnothing(weights) ? X[j, i] : weights[i] * X[j, i]
             end
         end
     end