
Commit e3c921b

Tweaks to K-Means code, text, and images
Vector is a struct instead of a class.
1 parent 591fa45 commit e3c921b


9 files changed: 217 additions, 199 deletions


Diff for: K-Means/Images/k_means_bad1.png (-4.22 KB)

Diff for: K-Means/Images/k_means_bad2.png (-4.59 KB)

Diff for: K-Means/Images/k_means_good.png (-4 KB)

Diff for: K-Means/KMeans.swift

31 additions, 34 deletions

````diff
@@ -1,28 +1,22 @@
-//
-//  KMeans.swift
-//
-//  Created by John Gill on 2/25/16.
-
 import Foundation
 
 class KMeans<Label: Hashable> {
   let numCenters: Int
-  let labels: Array<Label>
-  private(set) var centroids: Array<Vector>
-
-  init(labels: Array<Label>) {
+  let labels: [Label]
+  private(set) var centroids = [Vector]()
+
+  init(labels: [Label]) {
     assert(labels.count > 1, "Exception: KMeans with less than 2 centers.")
     self.labels = labels
     self.numCenters = labels.count
-    centroids = []
   }
-
-  private func nearestCenterIndex(x: Vector, centers: [Vector]) -> Int {
+
+  private func indexOfNearestCenter(x: Vector, centers: [Vector]) -> Int {
     var nearestDist = DBL_MAX
     var minIndex = 0
 
     for (idx, center) in centers.enumerate() {
-      let dist = x.distTo(center)
+      let dist = x.distanceTo(center)
       if dist < nearestDist {
         minIndex = idx
         nearestDist = dist
@@ -31,43 +25,46 @@ class KMeans<Label: Hashable> {
     return minIndex
   }
 
-
-
-  func trainCenters(points: [Vector], convergeDist: Double) {
-
-    var centerMoveDist = 0.0
-    let zeroVector = Vector(d: [Double](count: points[0].length, repeatedValue: 0.0))
+  func trainCenters(points: [Vector], convergeDistance: Double) {
+    let zeroVector = Vector([Double](count: points[0].length, repeatedValue: 0))
 
-    var kCenters = reservoirSample(points, k: numCenters)
+    // Randomly take k objects from the input data to make the initial centroids.
+    var centers = reservoirSample(points, k: numCenters)
 
+    var centerMoveDist = 0.0
     repeat {
+      // This array keeps track of which data points belong to which centroids.
+      var classification: [[Vector]] = .init(count: numCenters, repeatedValue: [])
 
-      var classification: Array<[Vector]> = Array(count: numCenters, repeatedValue: [])
-
+      // For each data point, find the centroid that it is closest to.
       for p in points {
-        let classIndex = nearestCenterIndex(p, centers: kCenters)
+        let classIndex = indexOfNearestCenter(p, centers: centers)
        classification[classIndex].append(p)
       }
-
+
+      // Take the average of all the data points that belong to each centroid.
+      // This moves the centroid to a new position.
       let newCenters = classification.map { assignedPoints in
         assignedPoints.reduce(zeroVector, combine: +) / Double(assignedPoints.count)
       }
 
+      // Find out how far each centroid moved since the last iteration. If it's
+      // only a small distance, then we're done.
       centerMoveDist = 0.0
       for idx in 0..<numCenters {
-        centerMoveDist += kCenters[idx].distTo(newCenters[idx])
+        centerMoveDist += centers[idx].distanceTo(newCenters[idx])
       }
 
-      kCenters = newCenters
-    } while centerMoveDist > convergeDist
-
-    centroids = kCenters
+      centers = newCenters
+    } while centerMoveDist > convergeDistance
+
+    centroids = centers
   }
 
   func fit(point: Vector) -> Label {
     assert(!centroids.isEmpty, "Exception: KMeans tried to fit on a non trained model.")
 
-    let centroidIndex = nearestCenterIndex(point, centers: centroids)
+    let centroidIndex = indexOfNearestCenter(point, centers: centroids)
     return labels[centroidIndex]
   }
 
@@ -79,20 +76,20 @@ class KMeans<Label: Hashable> {
 }
 
 // Pick k random elements from samples
-func reservoirSample<T>(samples:[T], k:Int) -> [T] {
+func reservoirSample<T>(samples: [T], k: Int) -> [T] {
   var result = [T]()
 
   // Fill the result array with first k elements
   for i in 0..<k {
     result.append(samples[i])
   }
-  // randomly replace elements from remaining pool
+
+  // Randomly replace elements from remaining pool
   for i in (k+1)..<samples.count {
-    let j = random() % (i+1)
+    let j = Int(arc4random_uniform(UInt32(i + 1)))
    if j < k {
      result[j] = samples[i]
    }
  }
  return result
 }
-
````
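The `Vector` type that `KMeans.swift` relies on lives in a separate file that is not shown in this excerpt; the commit message only notes that it is now a struct instead of a class. As orientation, here is a minimal sketch of a `Vector` struct that would satisfy the members the code above uses (`length`, `distanceTo`, an unlabeled `[Double]` initializer, `+`, and `/`). This is an illustrative assumption, not the actual contents of Vector.swift in this commit:

```swift
import Foundation

// Hypothetical minimal Vector, covering only the members KMeans.swift uses.
// Not the Vector.swift from this commit; an illustration only.
struct Vector {
  let data: [Double]
  var length: Int { return data.count }

  init(_ data: [Double]) {
    self.data = data
  }

  // Euclidean distance to another vector of the same length.
  func distanceTo(other: Vector) -> Double {
    var total = 0.0
    for i in 0..<length {
      let diff = data[i] - other.data[i]
      total += diff * diff
    }
    return sqrt(total)
  }
}

// Component-wise addition, used when summing up the points of a cluster.
func + (left: Vector, right: Vector) -> Vector {
  var result = [Double]()
  for i in 0..<left.length {
    result.append(left.data[i] + right.data[i])
  }
  return Vector(result)
}

// Scalar division, used when averaging the points of a cluster.
func / (left: Vector, right: Double) -> Vector {
  var result = [Double]()
  for i in 0..<left.length {
    result.append(left.data[i] / right)
  }
  return Vector(result)
}
```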
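To show how the renamed API is meant to be called, here is a usage sketch. It assumes the `KMeans` class from the diff above and a `Vector` like the one sketched previously are in scope; the cluster labels, sample coordinates, and the 0.01 convergence distance are invented for illustration:

```swift
// Three clusters, labeled "A", "B" and "C". KMeans only uses labels.count to
// decide how many centroids to look for; the labels themselves are arbitrary.
let kmm = KMeans<String>(labels: ["A", "B", "C"])

// Some two-dimensional sample points, made up for this example.
let points = [
  Vector([1.0, 1.0]), Vector([1.2, 0.8]), Vector([0.9, 1.1]),
  Vector([5.0, 5.0]), Vector([5.2, 4.9]), Vector([4.8, 5.1]),
  Vector([9.0, 1.0]), Vector([9.1, 0.9]), Vector([8.9, 1.2])
]

// Train until the centroids move less than 0.01 between iterations.
kmm.trainCenters(points, convergeDistance: 0.01)

// Ask which cluster a new point belongs to.
let label = kmm.fit(Vector([1.1, 0.9]))
print("This point belongs to cluster \(label)")
```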
Diff for: K-Means/README.markdown

90 additions, 59 deletions

````diff
@@ -1,101 +1,132 @@
 # k-Means Clustering
 
-Goal: Partition data into **k** clusters based on nearest means.
+Goal: Partition data into two or more clusters.
 
-The idea behind k-Means is to take data that has no formal classification to it and determine if there are any natural clusters (groups of related objects) within the data.
+The idea behind k-Means Clustering is to take a bunch of data and determine if there are any natural clusters (groups of related objects) within the data.
 
-k-Means assumes that there are **k-centers** within the data. The data that is closest to these *centroids* become classified or grouped together. k-Means doesn't tell you what the classifier is for that particular data group, but it assists in trying to find what clusters potentially exist.
+The k-Means algorithm is a so-called *unsupervised* learning algorithm. We don't know in advance what patterns exist in the data -- it has no formal classification to it -- but we would like to see if we can divide the data into groups somehow.
 
-## The algorithm
+For example, you can use k-Means to find what are the 3 most prominent colors in an image by telling it to group pixels into 3 clusters based on their color value. Or you can use it to group related news articles together, without deciding beforehand what categories to use. The algorithm will automatically figure out what the best groups are.
 
-The k-Means algorithm is really quite simple at its core:
+The "k" in k-Means is a number. The algorithm assumes that there are **k** centers within the data that the various data elements are scattered around. The data that is closest to these so-called **centroids** become classified or grouped together.
 
-1. Choose **k** random points to be the initial centers
-2. Repeat the following two steps until the *centroids* reach convergence:
-   1. Assign each point to its nearest *centroid*
-   2. Update the *centroid* to the mean of its nearest points
+k-Means doesn't tell you what the classifier is for each particular data group. After dividing news articles into groups, it doesn't say that group 1 is about science, group 2 is about celebrities, group 3 is about the upcoming election, etc. You only know that related news stories are now together, but not necessarily what that relationship signifies. k-Means only assists in trying to find what clusters potentially exist.
 
-Convergence is said to be reached when all of the *centroids* have not changed.
+## The algorithm
 
-This brings about a few of the parameters that are required for k-Means:
+The k-Means algorithm is really quite simple at its core.
 
-- **k**: This is the number of *centroids* to attempt to locate.
-- **convergence distance**: The minimum distance that the centers are allowed to move after a particular update step.
-- **distance function**: There are a number of distance functions that can be used, but mostly commonly the Euclidean distance function is adequate. But often that can lead to convergence not being reached in higher dimensionally.
+First, we choose **k** random data points to be the initial centroids. Then, we repeat the following two steps until we've found our clusters:
 
-This is what the algorithm would look like in Swift:
+1. For each data point, find which centroid it is closest to. We assign each point to its nearest centroid.
+2. Update the centroid to the mean (i.e. the average) of its nearest data points. We move the centroid so that it really sits in the center of the cluster.
 
-```swift
-func kMeans(numCenters: Int, convergeDist: Double, points: [Vector]) -> [Vector] {
-  var centerMoveDist = 0.0
-  let zeros = [Double](count: points[0].length, repeatedValue: 0.0)
-
-  var kCenters = reservoirSample(points, k: numCenters)
-
-  repeat {
-    var cnts = [Double](count: numCenters, repeatedValue: 0.0)
-    var newCenters = [Vector](count:numCenters, repeatedValue: Vector(d:zeros))
+We need to repeat this multiple times because moving the centroid changes which data points belong to it. This goes back and forth for a bit until everything stabilizes. That's when we reach "convergence", i.e. when the centroids no longer move around.
 
-    for p in points {
-      let c = nearestCenter(p, centers: kCenters)
-      cnts[c] += 1
-      newCenters[c] += p
-    }
-
-    for idx in 0..<numCenters {
-      newCenters[idx] /= cnts[idx]
-    }
-
-    centerMoveDist = 0.0
-    for idx in 0..<numCenters {
-      centerMoveDist += euclidean(kCenters[idx], newCenters[idx])
-    }
-
-    kCenters = newCenters
-  } while centerMoveDist > convergeDist
+A few of the parameters that are required for k-Means:
 
-  return kCenters
-}
-```
+- **k**: This is the number of centroids to attempt to locate. If you want to group news articles, this is the number of groups to look for.
+- **convergence distance**: If all the centroids move less than this distance after a particular update step, we're done.
+- **distance function**: This calculates how far data points are from the centroids, to find which centroid they are closest to. There are a number of distance functions that can be used, but most commonly the Euclidean distance function is adequate (you know, Pythagoras). But often that can lead to convergence not being reached in higher dimensionally.
 
-## Example
+Let's look at an example.
 
-These examples are contrived to show the exact nature of k-Means and finding clusters. These clusters are very easily identified by human eyes: we see there is one in the lower left corner, one in the upper right corner, and maybe one in the middle.
+#### Good clusters
 
-In all these examples the squares represent the data points and the stars represent the *centroids*.
+This first example shows k-Means finding all three clusters. In all these examples the circles represent the data points and the stars represent the centroids.
 
-##### Good clusters
-
-This first example shows k-Means finding all three clusters:
+In the first iteration, we choose three data points at random and put our centroids on top of them. Then in each subsequent iteration, we figure out which data points are closest to these centroids, and move the centroids to the average position of those data points. This repeats until we reach equilibrium and the centroids stop moving.
 
 ![Good Clustering](Images/k_means_good.png)
 
-The selection of initial centroids found the lower left cluster (indicated by red) and did pretty good on the center and upper left clusters.
+The selection of initial centroids was fortuitous! We found the lower left cluster (indicated by red) and did pretty good on the center and upper left clusters.
+
+> **Note:** These examples are contrived to show the exact nature of k-Means and finding clusters. The clusters in these examples are very easily identified by human eyes: we see there is one in the lower left corner, one in the upper right corner, and maybe one in the middle. In practice, however, data may have many dimensions and may be impossible to visualize. In such cases, k-Means is much better at this job than human eyes!
 
-#### Bad Clustering
+#### Bad clustering
 
 The next two examples highlight the unpredictability of k-Means and how it not always finds the best clustering.
 
 ![Bad Clustering 1](Images/k_means_bad1.png)
 
-As you can see in this one, the initial *centroids* were all a little too close and the 'blue' didn't quite get to a good place. By adjusting the convergence distance we should be able to get it better.
+As you can see in this one, the initial centroids were all a little too close to one another, and the blue one didn't quite get to a good place. By adjusting the convergence distance we should be able to get it better.
 
 ![Bad Clustering 1](Images/k_means_bad2.png)
 
 In this example, the blue cluster never really could separate from the red cluster and as such sort of got stuck down there.
 
+#### Improving bad clustering
+
+In these examples of "bad" clustering, the algorithm got stuck in a local optimum. It does find clusters but they're not the best way to divide up the data. To increase your chances of success, you can run the algorithm several times, each time with different points as the initial centroids. You choose the clustering that gives the best results.
+
+To calculate how "good" the clustering is, you find the distance of each data point to its cluster, and add up all these distances. The lower this number, the better! That means each cluster is really in the center of a group of data points, and all clusters are roughly the same size and are spaced evenly apart.
+
+## The code
+
+This is what the algorithm could look like in Swift. The `points` array contains the input data as `Vector` objects. The output is an array of `Vector` objects representing the clusters that were found.
+
+```swift
+func kMeans(numCenters: Int, convergeDistance: Double, points: [Vector]) -> [Vector] {
+
+  // Randomly take k objects from the input data to make the initial centroids.
+  var centers = reservoirSample(points, k: numCenters)
+
+  // This loop repeats until we've reached convergence, i.e. when the centroids
+  // have moved less than convergeDistance since the last iteration.
+  var centerMoveDist = 0.0
+  repeat {
+    // In each iteration of the loop, we move the centroids to a new position.
+    // The newCenters array contains those new positions.
+    let zeros = [Double](count: points[0].length, repeatedValue: 0)
+    var newCenters = [Vector](count: numCenters, repeatedValue: Vector(zeros))
+
+    // We keep track of how many data points belong to each centroid, so we
+    // can calculate the average later.
+    var counts = [Double](count: numCenters, repeatedValue: 0)
+
+    // For each data point, find the centroid that it is closest to. We also
+    // add up the data points that belong to that centroid, in order to compute
+    // that average.
+    for p in points {
+      let c = indexOfNearestCenter(p, centers: centers)
+      newCenters[c] += p
+      counts[c] += 1
+    }
+
+    // Take the average of all the data points that belong to each centroid.
+    // This moves the centroid to a new position.
+    for idx in 0..<numCenters {
+      newCenters[idx] /= counts[idx]
+    }
+
+    // Find out how far each centroid moved since the last iteration. If it's
+    // only a small distance, then we're done.
+    centerMoveDist = 0.0
+    for idx in 0..<numCenters {
+      centerMoveDist += centers[idx].distanceTo(newCenters[idx])
+    }
+
+    centers = newCenters
+  } while centerMoveDist > convergeDistance
+
+  return centers
+}
+```
+
+> **Note:** The code in [KMeans.swift](KMeans.swift) is slightly more advanced than the above listing. It also assigns labels to the clusters and has a few other tricks up its sleeve. Check it out!
+
 ## Performance
 
-The first thing to recognize is that k-Means is classified as an NP-Hard type of problem. The selection of the initial *centroids* has a big effect on how the resulting clusters may end up. This means that trying to find an exact solution is not likely -- even in 2 dimensional space.
+k-Means is classified as an NP-Hard type of problem. That means it's almost impossible to find the optimal solution. The selection of the initial centroids has a big effect on how the resulting clusters may end up. Finding an exact solution is not likely -- even in 2 dimensional space.
 
-As seen from the steps above the complexity really isn't that bad -- it is often considered to be on the order of **O(kndi)**, where **k** is the number of *centroids*, **n** is the number of **d**-dimensional vectors, and **i** is the number of iterations for convergence.
+As seen from the steps above the complexity really isn't that bad -- it is often considered to be on the order of **O(kndi)**, where **k** is the number of centroids, **n** is the number of **d**-dimensional vectors, and **i** is the number of iterations for convergence.
 
-The amount of data has a big linear effect on the running time of k-Means, but tuning how far you want the *centroids* to converge can have a big impact how many iterations will be done. As a general rule, **k** should be relatively small compared to the number of vectors.
+The amount of data has a linear effect on the running time of k-Means, but tuning how far you want the centroids to converge can have a big impact how many iterations will be done. As a general rule, **k** should be relatively small compared to the number of vectors.
 
-Often times as more data is added certain points may lie in the boundary between two *centroids* and as such those centroids would continue to bounce back and forth and the **convergence** distance would need to be tuned to prevent that.
+Often times as more data is added certain points may lie in the boundary between two centroids and as such those centroids would continue to bounce back and forth and the convergence distance would need to be tuned to prevent that.
 
 ## See Also
 
 [K-Means Clustering on Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)
 
-*Written by John Gill*
+*Written by John Gill and Matthijs Hollemans*
````
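The "Improving bad clustering" section added to the README describes scoring a clustering by adding up each point's distance to its centroid and keeping the run with the lowest total. A rough sketch of that idea, reusing the `KMeans` class and `Vector` type from this commit; the helper names `clusteringScore` and `bestOfRuns`, the integer labels, and the 0.01 convergence distance are made up for illustration:

```swift
import Foundation

// Sum of the distances from every point to its nearest trained centroid.
// Lower is better: it means the centroids sit in the middle of tight clusters.
func clusteringScore(kmm: KMeans<Int>, points: [Vector]) -> Double {
  var total = 0.0
  for p in points {
    var nearest = DBL_MAX
    for centroid in kmm.centroids {
      nearest = min(nearest, p.distanceTo(centroid))
    }
    total += nearest
  }
  return total
}

// Run k-Means several times with different random starting centroids and keep
// the run with the lowest score. Assumes k >= 2 and runs >= 1.
func bestOfRuns(points: [Vector], k: Int, runs: Int) -> KMeans<Int> {
  var best: KMeans<Int>! = nil
  var bestScore = DBL_MAX
  for _ in 0..<runs {
    let kmm = KMeans<Int>(labels: Array(0..<k))
    kmm.trainCenters(points, convergeDistance: 0.01)
    let score = clusteringScore(kmm, points: points)
    if score < bestScore {
      bestScore = score
      best = kmm
    }
  }
  return best
}
```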
