
Commit 3cab42a

Merge pull request #101 from PyDataBlog/minibatch
* Added MiniBatch algorithm
2 parents 9c221c9 + 6010166 commit 3cab42a

11 files changed: +336 -40 lines changed

Project.toml
+1 -1

@@ -1,7 +1,7 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

README.md
+9 -6

@@ -2,8 +2,8 @@
 
 [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/stable)
 [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/dev)
-[![Build Status](https://www.travis-ci.org/PyDataBlog/ParallelKMeans.jl.svg?branch=master)](https://www.travis-ci.org/PyDataBlog/ParallelKMeans.jl)
-[![Coverage Status](https://coveralls.io/repos/github/PyDataBlog/ParallelKMeans.jl/badge.svg?branch=master)](https://coveralls.io/github/PyDataBlog/ParallelKMeans.jl?branch=master)
+[![Build Status](https://github.com/PyDataBlog/ParallelKMeans.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/PyDataBlog/ParallelKMeans.jl/actions/workflows/CI.yml/badge.svg)
+[![codecov](https://codecov.io/gh/PyDataBlog/ParallelKMeans.jl/branch/master/graph/badge.svg?token=799USS6BPH)](https://codecov.io/gh/PyDataBlog/ParallelKMeans.jl)
 [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FPyDataBlog%2FParallelKMeans.jl.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2FPyDataBlog%2FParallelKMeans.jl?ref=badge_shield)
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/PyDataBlog/ParallelKMeans.jl/master)
 _________________________________________________________________________________________________________

@@ -22,10 +22,13 @@ ________________________________________________________________________________
 
 ## Table Of Content
 
-1. [Documentation](#Documentation)
-2. [Installation](#Installation)
-3. [Features](#Features)
-4. [License](#License)
+- [ParallelKMeans](#parallelkmeans)
+- [Table Of Content](#table-of-content)
+- [Documentation](#documentation)
+- [Installation](#installation)
+- [Features](#features)
+- [Benchmarks](#benchmarks)
+- [License](#license)
 
 _________________________________________________________________________________________________________
 

docs/src/index.md
+8 -6

@@ -78,11 +78,11 @@ pkg> free ParallelKMeans
 - [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
 - [X] Support for weighted K-means.
 - [X] Support of MLJ Random generation hyperparameter.
+- [X] Implementation of [Mini-batch KMeans variant](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
-- [ ] Refactoring and finalization of API design.
-- [ ] GPU support.
+- [ ] GPU support?
 - [ ] Distributed calculations support.
 - [ ] Optimization of code base.
 - [ ] Improved Documentation

@@ -127,8 +127,8 @@ r.converged # whether the procedure converged
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
 - [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
+- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets, when extreme accuracy is not important.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
-- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
 
 ### Practical Usage Examples
 

@@ -175,9 +175,9 @@ Currently, this package is benchmarked against similar implementations in both P
 
 *Note*: All benchmark tests are made on the same computer to help eliminate any bias.
 
-|PC Name                      |CPU                       |Ram               |
-|:---------------------------:|:------------------------:|:----------------:|
-|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|8 GB 2667 MHz DDR4|
+|PC Name                      |CPU                       |Ram                |
+|:---------------------------:|:------------------------:|:-----------------:|
+|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|24 GB 2667 MHz DDR4|
 
 Currently, the benchmark speed tests are based on the search for optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practioners employing the K-Means algorithm.

@@ -213,6 +213,8 @@ ________________________________________________________________________________
 - 0.1.7 Added `Yinyang` and `Coreset` support in MLJ interface; added `weights` support in MLJ; added RNG seed support in MLJ interface and through all algorithms; added metric support.
 - 0.1.8 Minor cleanup
 - 0.1.9 Added travis support for Julia 1.5
+- 0.2.0 Updated MLJ Interface
+- 0.2.1 Mini-batch implementation
 
 ## Contributing
 
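With this change, `MiniBatch()` moves from "(Coming soon)" to a drop-in choice alongside the other algorithms listed above. A minimal sketch of its use (the data matrix and cluster count here are illustrative, not part of the commit; field access follows the `r.converged` example shown in the docs above):

```julia
using ParallelKMeans

X = rand(30, 100_000)  # 30 features, 100_000 observations (one per column)

# Swap the algorithm argument to pick a variant from the list above
r = kmeans(MiniBatch(100), X, 3)  # batch of 100 samples per iteration

r.converged  # whether the procedure converged
r.totalcost  # final objective evaluated on the full dataset
```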

src/ParallelKMeans.jl
+2 -1

@@ -15,9 +15,10 @@ include("hamerly.jl")
 include("elkan.jl")
 include("yinyang.jl")
 include("coreset.jl")
+include("mini_batch.jl")
 include("mlj_interface.jl")
 
 export kmeans
-export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
+export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset, MiniBatch
 
 end # module
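With the new `include` and export in place, the algorithm is reachable directly from the package namespace. A quick illustrative check (the default constructor and batch size come from `src/mini_batch.jl` below):

```julia
using ParallelKMeans

alg = MiniBatch()  # default constructor, b = 100
alg.b              # => 100
alg isa ParallelKMeans.AbstractKMeansAlg  # => true
```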

src/kmeans.jl
+17 -7

@@ -115,7 +115,7 @@ Allocationless calculation of square eucledean distance between vectors X1[:, i1
 @inline function distance(metric::Euclidean, X1, X2, i1, i2)
     # here goes my definition
     d = zero(eltype(X1))
-    # TODO: break of the loop if d is larger than threshold (known minimum disatnce)
+    # TODO: break of the loop if d is larger than threshold (known minimum distance)
     @inbounds @simd for i in axes(X1, 1)
         d += (X1[i, i1] - X2[i, i2])^2
     end

@@ -170,20 +170,30 @@ alternatively one can use `rand` to choose random points for init.
 
 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k; weights = nothing,
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
-                k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false,
-                init = nothing, rng = Random.GLOBAL_RNG, metric = Euclidean())
+                k_init = "k-means++",
+                max_iters = 300,
+                tol = eltype(design_matrix)(1e-6),
+                verbose = false,
+                init = nothing,
+                rng = Random.GLOBAL_RNG,
+                metric = Euclidean())
 
     nrow, ncol = size(design_matrix)
 
     # Create containers based on the dimensions and specifications
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)
 
     return kmeans!(alg, containers, design_matrix, k, weights, metric;
-                   n_threads = n_threads, k_init = k_init, max_iters = max_iters,
-                   tol = tol, verbose = verbose, init = init, rng = rng)
+                   n_threads = n_threads,
+                   k_init = k_init,
+                   max_iters = max_iters,
+                   tol = tol,
+                   verbose = verbose,
+                   init = init,
+                   rng = rng)
 
 end
 
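For reference, a sketch of a call exercising the reformatted keyword interface (values are illustrative; `Lloyd()` stands in for any exported algorithm, and `tol` defaults to `eltype(design_matrix)(1e-6)`, so it tracks the input's element type):

```julia
using Random
using Distances: Euclidean
using ParallelKMeans

X = rand(Float32, 10, 5_000)  # Float32 design matrix, one observation per column

r = kmeans(Lloyd(), X, 4;
           n_threads = 2,
           k_init = "k-means++",
           max_iters = 200,
           tol = Float32(1e-6),          # matches the Float32 element type
           verbose = false,
           rng = MersenneTwister(2020),
           metric = Euclidean())
```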

src/lloyd.jl
+1 -1

@@ -6,7 +6,7 @@ Basic algorithm for k-means calculation.
 struct Lloyd <: AbstractKMeansAlg end
 
 """
-    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
+    kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
 
 Mutable version of `kmeans` function. Definition of arguments and results can be
 found in `kmeans`.

src/mini_batch.jl
+183

@@ -0,0 +1,183 @@
+"""
+    MiniBatch(b::Int)
+    `b` represents the size of the batch which should be sampled.
+
+Sculley et al. 2007 Mini batch k-means algorithm implementation.
+
+```julia
+X = rand(30, 100_000) # 100_000 random points in 30 dimensions
+
+kmeans(MiniBatch(100), X, 3) # 3 clusters, MiniBatch algorithm with 100 batch samples at each iteration
+```
+"""
+struct MiniBatch <: AbstractKMeansAlg
+    b::Int # batch size
+end
+
+
+MiniBatch() = MiniBatch(100)
+
+function kmeans!(alg::MiniBatch, containers, X, k,
+                 weights = nothing, metric = Euclidean(); n_threads = Threads.nthreads(),
+                 k_init = "k-means++", init = nothing, max_iters = 300,
+                 tol = eltype(X)(1e-6), max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)
+
+    # Retrieve initialized artifacts from the container
+    centroids = containers.centroids_new
+    batch_rand_idx = containers.batch_rand_idx
+    labels = containers.labels
+
+    # Get the type and dimensions of design matrix, X - (Step 1)
+    T = eltype(X)
+    nrow, ncol = size(X)
+
+    # Initiate cluster centers - (Step 2) in paper
+    centroids .= isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : deepcopy(init)
+
+    # Initialize counter for the no. of data in each cluster - (Step 3) in paper
+    N = zeros(T, k)
+
+    # Initialize various artifacts
+    converged = false
+    niters = 1
+    counter = 0
+    J_previous = zero(T)
+    J = zero(T)
+    totalcost = zero(T)
+
+    # Main Steps. Batch update centroids until convergence
+    while niters <= max_iters # Step 4 in paper
+
+        # b examples picked randomly from X (Step 5 in paper)
+        isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)
+
+        # Cache/label the batch samples nearest to the centers (Step 6 & 7)
+        @inbounds for i in batch_rand_idx
+            min_dist = distance(metric, X, centroids, i, 1)
+            label = 1
+
+            for j in 2:size(centroids, 2)
+                dist = distance(metric, X, centroids, i, j)
+                label = dist < min_dist ? j : label
+                min_dist = dist < min_dist ? dist : min_dist
+            end
+
+            labels[i] = label
+
+            ##### Batch gradient step #####
+            # iterate over examples (each column) ==> (Step 9)
+            # Get cached center/label for each example label = labels[i] => (Step 10)
+
+            # Update per-center counts
+            N[label] += isnothing(weights) ? 1 : weights[i] # (Step 11)
+
+            # Get per-center learning rate (Step 12)
+            lr = 1 / N[label]
+
+            # Take gradient step (Step 13) # TODO: Replace with faster loop?
+            @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ (lr .* X[:, i])
+        end
+
+        # Reassign all labels based on new centres generated from the latest sample
+        labels .= reassign_labels(X, metric, labels, centroids)
+
+        # Calculate cost on whole dataset after reassignment and check for convergence
+        @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
+        J = sum(containers.sum_of_squares)
+
+        if verbose
+            # Show progress and terminate if J stopped decreasing.
+            println("Iteration $niters: Jclust = $J")
+        end
+
+        # Check for early stopping convergence
+        if (niters > 1) & (abs(J - J_previous) < (tol * J))
+            counter += 1
+
+            # Declare convergence if max_no_improvement criterion is met
+            if counter >= max_no_improvement
+                converged = true
+                # Compute label assignment for the complete dataset
+                labels .= reassign_labels(X, metric, labels, centroids)
+
+                # Compute totalcost for the complete dataset
+                @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
+                totalcost = sum(containers.sum_of_squares)
+
+                # Print convergence message to user
+                if verbose
+                    println("Successfully terminated with convergence.")
+                end
+
+                break
+            end
+        else
+            counter = 0
+        end
+
+        # Warn users if model doesn't converge at max iterations
+        if (niters >= max_iters) & (!converged)
+
+            if verbose
+                println("Clustering model failed to converge. Labelling data with latest centroids.")
+            end
+
+            labels .= reassign_labels(X, metric, labels, centroids)
+
+            # Compute totalcost for unconverged model
+            @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
+            totalcost = sum(containers.sum_of_squares)
+
+            break
+        end
+
+        J_previous = J
+        niters += 1
+    end
+
+    # Push learned artifacts to KmeansResult
+    return KmeansResult(centroids, labels, T[], Int[], T[], totalcost, niters, converged)
+end
+
+"""
+    reassign_labels(DMatrix, metric, labels, centres)
+
+An internal function to relabel DMatrix based on centres and metric.
+"""
+function reassign_labels(DMatrix, metric, labels, centres)
+    @inbounds for i in axes(DMatrix, 2)
+        min_dist = distance(metric, DMatrix, centres, i, 1)
+        label = 1
+
+        for j in 2:size(centres, 2)
+            dist = distance(metric, DMatrix, centres, i, j)
+            label = dist < min_dist ? j : label
+            min_dist = dist < min_dist ? dist : min_dist
+        end
+
+        labels[i] = label
+    end
+    return labels
+end
+
+"""
+    create_containers(::MiniBatch, k, nrow, ncol, n_threads)
+
+Internal function for the creation of all necessary intermidiate structures.
+
+- `centroids_new` - container which holds new positions of centroids
+- `labels` - vector which holds labels of corresponding points
+- `sum_of_squares` - vector which holds the sum of squares values for each thread
+- `batch_rand_idx` - vector which holds the selected batch indices
+"""
+function create_containers(alg::MiniBatch, X, k, nrow, ncol, n_threads)
+    # Initiate placeholders to avoid allocations
+    T = eltype(X)
+    labels = Vector{Int}(undef, ncol) # labels vector
+    sum_of_squares = Vector{T}(undef, 1) # total_sum_calculation
+    batch_rand_idx = Vector{Int}(undef, alg.b) # selected batch indices
+    centroids_new = Matrix{T}(undef, nrow, k) # centroids
+
+    return (batch_rand_idx = batch_rand_idx, centroids_new = centroids_new,
+            labels = labels, sum_of_squares = sum_of_squares)
+end
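Two details of the new file are worth spelling out. First, the gradient step (Steps 11-13) keeps each centroid equal to the running mean of every batch sample ever assigned to it: with `lr = 1 / N[label]`, the update `(1 - lr) * c + lr * x` is exactly the incremental-mean formula. A standalone sketch of that identity (illustrative, independent of the package):

```julia
using Statistics

# Apply the MiniBatch-style gradient step (Steps 12-13) to a stream of samples
function stream_center(samples)
    center = zeros(length(first(samples)))
    N = 0
    for x in samples
        N += 1
        lr = 1 / N                               # per-center learning rate (Step 12)
        center .= (1 - lr) .* center .+ lr .* x  # gradient step (Step 13)
    end
    return center
end

samples = [rand(3) for _ in 1:50]
stream_center(samples) ≈ mean(samples)  # => true: the center is the mean of its samples
```

Second, `max_no_improvement` (default 10) is a keyword of this `kmeans!` method only; the top-level `kmeans` wrapper in `src/kmeans.jl` forwards a fixed set of keywords (`n_threads`, `k_init`, `max_iters`, `tol`, `verbose`, `init`, `rng`), so calls routed through `kmeans` use the default early-stopping window.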
