
Commit 6010166

Reduced minibatch algo allocations & whitespace removal in various modules

1 parent baa3150

File tree: 5 files changed, +50 -39 lines

docs/src/index.md (+6 -6)
@@ -78,11 +78,11 @@ pkg> free ParallelKMeans
 - [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
 - [X] Support for weighted K-means.
 - [X] Support of MLJ Random generation hyperparameter.
+- [X] Implementation of [Mini-batch KMeans variant](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
-- [ ] Refactoring and finalization of API design.
-- [ ] GPU support.
+- [ ] GPU support?
 - [ ] Distributed calculations support.
 - [ ] Optimization of code base.
 - [ ] Improved Documentation
@@ -127,7 +127,7 @@ r.converged # whether the procedure converged
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
 - [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
-- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets.
+- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets, when extreme accuracy is not important.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)

 ### Practical Usage Examples
@@ -175,9 +175,9 @@ Currently, this package is benchmarked against similar implementations in both P

 *Note*: All benchmark tests are made on the same computer to help eliminate any bias.

-|PC Name                      |CPU                       |Ram               |
-|:---------------------------:|:------------------------:|:----------------:|
-|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|8 GB 2667 MHz DDR4|
+|PC Name                      |CPU                       |Ram                |
+|:---------------------------:|:------------------------:|:-----------------:|
+|iMac (Retina 5K 27-inch 2019)|3 GHz 6-Core Intel Core i5|24 GB 2667 MHz DDR4|

 Currently, the benchmark speed tests are based on the search for optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practitioners employing the K-Means algorithm.
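
As a quick orientation for the recommendation list above, a minimal sketch of how an algorithm choice maps onto the `kmeans` entry point; the data, `k = 3`, and the `MiniBatch(100)` batch size are illustrative assumptions, not values from this commit:

```julia
using ParallelKMeans

# Illustrative data: 5 features × 10_000 observations (columns are points).
X = rand(5, 10_000)

# Per the list above, MiniBatch targets extremely large datasets when
# extreme accuracy is not important; the batch size of 100 is an assumption.
r = kmeans(MiniBatch(100), X, 3)

r.converged  # whether the procedure converged (field shown in the hunk context above)
```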

src/kmeans.jl (+9 -9)

(whitespace-only cleanup: each -/+ pair below differs only in trailing whitespace)
@@ -170,15 +170,15 @@ alternatively one can use `rand` to choose random points for init.

 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
-                weights = nothing,
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
-                k_init = "k-means++",
+                k_init = "k-means++",
                 max_iters = 300,
                 tol = eltype(design_matrix)(1e-6),
                 verbose = false,
                 init = nothing,
-                rng = Random.GLOBAL_RNG,
+                rng = Random.GLOBAL_RNG,
                 metric = Euclidean())

     nrow, ncol = size(design_matrix)
@@ -187,12 +187,12 @@ function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)

     return kmeans!(alg, containers, design_matrix, k, weights, metric;
-                   n_threads = n_threads,
-                   k_init = k_init,
+                   n_threads = n_threads,
+                   k_init = k_init,
                    max_iters = max_iters,
-                   tol = tol,
-                   verbose = verbose,
-                   init = init,
+                   tol = tol,
+                   verbose = verbose,
+                   init = init,
                    rng = rng)

 end
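
This hunk is whitespace-only, but it conveniently displays the full keyword surface of `kmeans`. A sketch exercising those keywords with the defaults visible in the signature above (the `Lloyd()` choice, the seeded RNG, and the `Distances` import are assumptions):

```julia
using ParallelKMeans, Random
using Distances: Euclidean

X = rand(Float64, 3, 1_000)  # illustrative design matrix

# Each keyword mirrors a default from the signature above.
r = kmeans(Lloyd(), X, 4;
           weights   = nothing,
           n_threads = Threads.nthreads(),
           k_init    = "k-means++",
           max_iters = 300,
           tol       = 1e-6,
           verbose   = false,
           rng       = MersenneTwister(2020),
           metric    = Euclidean())
```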

src/mini_batch.jl (+31 -20)
@@ -22,30 +22,34 @@ function kmeans!(alg::MiniBatch, containers, X, k,
                  k_init = "k-means++", init = nothing, max_iters = 300,
                  tol = eltype(X)(1e-6), max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)

+    # Retrieve initialized artifacts from the container
+    centroids = containers.centroids_new
+    batch_rand_idx = containers.batch_rand_idx
+    labels = containers.labels
+
     # Get the type and dimensions of design matrix, X - (Step 1)
     T = eltype(X)
     nrow, ncol = size(X)

     # Initiate cluster centers - (Step 2) in paper
-    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : deepcopy(init)
+    centroids .= isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init = k_init).centroids : deepcopy(init)

     # Initialize counter for the no. of data in each cluster - (Step 3) in paper
     N = zeros(T, k)

-    # Initialize nearest centers for both batch and whole dataset labels
+    # Initialize various artifacts
     converged = false
     niters = 1
     counter = 0
     J_previous = zero(T)
     J = zero(T)
     totalcost = zero(T)
-    batch_rand_idx = containers.batch_rand_idx
-
+
     # Main Steps. Batch update centroids until convergence
     while niters <= max_iters # Step 4 in paper

         # b examples picked randomly from X (Step 5 in paper)
-        batch_rand_idx = isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)
+        isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)

         # Cache/label the batch samples nearest to the centers (Step 6 & 7)
         @inbounds for i in batch_rand_idx
@@ -58,12 +62,12 @@ function kmeans!(alg::MiniBatch, containers, X, k,
                 min_dist = dist < min_dist ? dist : min_dist
             end

-            containers.labels[i] = label
+            labels[i] = label

             ##### Batch gradient step #####
             # iterate over examples (each column) ==> (Step 9)
-            # Get cached center/label for each example label = labels[i] => (Step 10)
-
+            # Get cached center/label for each example label = labels[i] => (Step 10)
+
             # Update per-center counts
             N[label] += isnothing(weights) ? 1 : weights[i] # (Step 11)

@@ -75,10 +79,10 @@ function kmeans!(alg::MiniBatch, containers, X, k,
         end

         # Reassign all labels based on new centres generated from the latest sample
-        containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+        labels .= reassign_labels(X, metric, labels, centroids)

         # Calculate cost on whole dataset after reassignment and check for convergence
-        @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+        @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
         J = sum(containers.sum_of_squares)

         if verbose
@@ -94,12 +98,12 @@ function kmeans!(alg::MiniBatch, containers, X, k,
             if counter >= max_no_improvement
                 converged = true
                 # Compute label assignment for the complete dataset
-                containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+                labels .= reassign_labels(X, metric, labels, centroids)

                 # Compute totalcost for the complete dataset
-                @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+                @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
                 totalcost = sum(containers.sum_of_squares)
-
+
                 # Print convergence message to user
                 if verbose
                     println("Successfully terminated with convergence.")
@@ -117,11 +121,13 @@ function kmeans!(alg::MiniBatch, containers, X, k,
             if verbose
                 println("Clustering model failed to converge. Labelling data with latest centroids.")
             end
-            containers.labels .= reassign_labels(X, metric, containers.labels, centroids)
+
+            labels .= reassign_labels(X, metric, labels, centroids)

             # Compute totalcost for unconverged model
-            @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+            @parallelize 1 ncol sum_of_squares(containers, X, labels, centroids, weights, metric)
             totalcost = sum(containers.sum_of_squares)
+
             break
         end

@@ -130,10 +136,14 @@ function kmeans!(alg::MiniBatch, containers, X, k,
     end

     # Push learned artifacts to KmeansResult
-    return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+    return KmeansResult(centroids, labels, T[], Int[], T[], totalcost, niters, converged)
 end

+"""
+    reassign_labels(DMatrix, metric, labels, centres)

+An internal function to relabel DMatrix based on centres and metric.
+"""
 function reassign_labels(DMatrix, metric, labels, centres)
     @inbounds for i in axes(DMatrix, 2)
         min_dist = distance(metric, DMatrix, centres, i, 1)
@@ -156,17 +166,18 @@ end
 Internal function for the creation of all necessary intermediate structures.

 - `centroids_new` - container which holds new positions of centroids
-- `centroids_cnt` - container which holds number of points for each centroid
 - `labels` - vector which holds labels of corresponding points
 - `sum_of_squares` - vector which holds the sum of squares values for each thread
+- `batch_rand_idx` - vector which holds the selected batch indices
 """
 function create_containers(alg::MiniBatch, X, k, nrow, ncol, n_threads)
     # Initiate placeholders to avoid allocations
-    T = eltype(X)
+    T = eltype(X)
     labels = Vector{Int}(undef, ncol) # labels vector
     sum_of_squares = Vector{T}(undef, 1) # total_sum_calculation
-    batch_rand_idx = Vector{Int}(undef, alg.b)
+    batch_rand_idx = Vector{Int}(undef, alg.b) # selected batch indices
+    centroids_new = Matrix{T}(undef, nrow, k) # centroids

-    return (batch_rand_idx = batch_rand_idx,
+    return (batch_rand_idx = batch_rand_idx, centroids_new = centroids_new,
             labels = labels, sum_of_squares = sum_of_squares)
 end
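
The allocation savings in this file come down to one pattern: `centroids_new`, `batch_rand_idx`, and `labels` are allocated once in `create_containers` and refilled in place (`centroids .= …`, `rand!(rng, batch_rand_idx, 1:ncol)`) instead of being rebound to fresh arrays on every iteration. A self-contained sketch of that pattern, assuming nothing beyond the `Random` stdlib (this is not the package's internal code):

```julia
using Random

# Reuse one index buffer across iterations instead of allocating per pass,
# mirroring how batch_rand_idx is recycled in the loop above.
function sample_batches!(rng::AbstractRNG, ncol::Int, b::Int, iters::Int)
    batch_idx = Vector{Int}(undef, b)  # allocated once (cf. create_containers)
    touched = falses(ncol)
    for _ in 1:iters
        rand!(rng, batch_idx, 1:ncol)  # in-place refill: no new vector per iteration
        touched[batch_idx] .= true
    end
    return count(touched)              # how many columns were ever sampled
end

sample_batches!(MersenneTwister(1), 10_000, 100, 300)
```

The same reasoning favours `centroids .= …` over `centroids = …`: broadcasting writes into the preallocated `containers.centroids_new` buffer, so the gradient step keeps updating one stable matrix rather than a freshly allocated one each call.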

src/mlj_interface.jl (+1 -1)

(the -/+ pair below differs only in trailing whitespace)
@@ -10,7 +10,7 @@ const MLJDICT = Dict(:Lloyd => Lloyd(),
                      :Elkan => Elkan(),
                      :Yinyang => Yinyang(),
                      :Coreset => Coreset(),
-                     :阴阳 => Coreset(),
+                     :阴阳 => Coreset(),
                      :MiniBatch => MiniBatch())

 ####
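
`MLJDICT` maps user-facing `Symbol`s (including the `:阴阳` alias) to algorithm instances for the MLJ wrapper. A hedged sketch of that lookup; the qualified `ParallelKMeans.MLJDICT` access and the error handling are assumptions, not the wrapper's actual code:

```julia
using ParallelKMeans

# Illustrative: resolve a Symbol to a concrete algorithm via the table above.
algo = :MiniBatch
alg = get(ParallelKMeans.MLJDICT, algo, nothing)
alg === nothing && throw(ArgumentError("unsupported algorithm: $algo"))

r = kmeans(alg, rand(3, 500), 2)  # illustrative data
```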

test/test80_mlj_interface.jl (+3 -3)

(whitespace-only cleanup: each -/+ pair below differs only in trailing whitespace)
@@ -200,15 +200,15 @@ end
 @test report.totalcost ≈ 18.03007733451847

 params = fitted_params(model, results)
-@test all(params.cluster_centers .≈ [0.39739206832613827 0.4818900563319951;
-                                     0.7695625526281311 0.30986081763964723;
+@test all(params.cluster_centers .≈ [0.39739206832613827 0.4818900563319951;
+                                     0.7695625526281311 0.30986081763964723;
                                      0.6175496080776439 0.3911138270823586])

 # Use trained model to cluster new data X_test
 preds = transform(model, results, X_test)
 @test preds[:x1][1] ≈ 0.48848842207123555
 @test preds[:x2][1] ≈ 0.08355805256372761
-
+
 # Make predictions on new data X_test with fitted params
 yhat = predict(model, results, X_test)
 @test yhat == report.assignments[1:2]
