-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathHelpFunctions.py
237 lines (209 loc) · 9.62 KB
/
HelpFunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# ********************************************************************************************************
# HELPER FUNCTIONS DEFINITION
# ********************************************************************************************************
# ********************************************
# Imports
# ********************************************
import numpy as np
import dash
from sklearn import datasets
import plotly.graph_objs as go
# ********************************************
# Dataset Creation Function
# ********************************************
def create_dataset(shape='gmm', sampleSize=200, n_clusters=3):
"""
This function creates a dataset to apply K-means on.
Inputs :
- shape : shape of the dataset. Possible values : 'gmm', 'circle', 'moon', 'anisotropic', 'No Structure'. Default : 'gmm'.
- sampleSize : number of data points. Default : 200.
- n_clusters : number of clusters in the dataset. This is applicable only for shapes 'gmm' and 'anisotropic'.
Output :
- (sampleSize, 2) numpy array.
"""
clusterStd = [0.5, 1, 1.3]*3
clusterStd = clusterStd[:n_clusters]
if shape=='gmm':
X = datasets.make_blobs(n_samples=sampleSize, n_features=2, centers=n_clusters, cluster_std=clusterStd)[0]
X[:,0] = 30*(X[:,0]-min(X[:,0]))/(max(X[:,0])-min(X[:,0])) - 15
X[:,1] = 30*(X[:,1]-min(X[:,1]))/(max(X[:,1])-min(X[:,1])) - 15
elif shape=='circle':
X = datasets.make_circles(n_samples=sampleSize, factor=.5, noise=.05)[0]
X[:,0] = 30*(X[:,0]-min(X[:,0]))/(max(X[:,0])-min(X[:,0])) - 15
X[:,1] = 30*(X[:,1]-min(X[:,1]))/(max(X[:,1])-min(X[:,1])) - 15
elif shape=='moon':
X = datasets.make_moons(n_samples=sampleSize, noise=.1)[0]
X[:,0] = 30*(X[:,0]-min(X[:,0]))/(max(X[:,0])-min(X[:,0])) - 15
X[:,1] = 30*(X[:,1]-min(X[:,1]))/(max(X[:,1])-min(X[:,1])) - 15
elif shape=='anisotropic':
transformations = {0:[[0.6, -0.6], [-0.4, 0.8]], 1:[[-0.7, -0.6], [0.6, 0.8]], 2:[[0.8, -0.1], [0.8, 0.1]]}
X, y = datasets.make_blobs(n_samples=sampleSize, n_features=2, centers=n_clusters, cluster_std=clusterStd)
for i in range(n_clusters):
X[y==i] = np.dot(X[y==i], transformations[i%3])
X = 5*X
X[:,0] = 30*(X[:,0]-min(X[:,0]))/(max(X[:,0])-min(X[:,0])) - 15
X[:,1] = 30*(X[:,1]-min(X[:,1]))/(max(X[:,1])-min(X[:,1])) - 15
else:
X = 30*np.random.rand(sampleSize, 2)-15
return X
# ********************************************
# Centroid Initialization Function
# ********************************************
def init_centroids(X, k=3, initMethod='random'):
"""
This function initializes the centroids for the K-means algorithm.
Inputs :
- X : numpy matrix representing the dataset used for clustering. This is needed for the K-means++ centroid initialization.
- k : number of centroids (or prototypes) to create. Default : 3.
- initMethod : initialization method. Possible values : 'random' and 'kmeans++'. Default : 'random'.
Output :
- (k, 2) numpy array.
"""
if initMethod == 'random':
# Random Initialization
centroids = 30*np.random.rand(k, 2)-15
else:
# Kmeans++ Initialization
indices = list(range(X.shape[0]))
centroids = np.empty((k,2))
centroids[0, :] = X[np.random.choice(indices), :]
D = np.power(np.linalg.norm(X-centroids[0], axis=1), 2)
P = D/D.sum()
for i in range(k-1):
centroidIdx = np.random.choice(indices, size=1, p=P)
centroids[i+1, :] = X[centroidIdx, :]
Dtemp = np.power(np.linalg.norm(X-centroids[-1, :], axis=1), 2)
D = np.min(np.vstack((Dtemp, D)), axis=0)
P = D/D.sum()
return centroids
# ********************************************
# K-means Expectation Step Function
# ********************************************
def Kmeans_EStep(X, centroids):
"""
This function performs the Expectation Step of the K-means algorithm. It assigns each data point to its closest centroid.
Inputs :
- X : numpy array of the dataset such as the one returned by the create_dataset function.
- centroids : numpy array of centroids such as the one returned by the init_centroids function.
"""
k = centroids.shape[0]
# Initialize Points-Centroid Distance Matrix
centroidDistMat = np.empty((X.shape[0], k))
# Compute Points-Centroid Distance Matrix
for i in range(k):
centroidDistMat[:, i] = np.linalg.norm(X-centroids[i,:], axis=1)
# Infer Labels
labels = centroidDistMat.argmin(axis=1)
return labels
# ********************************************
# K-means Maximization Step Function
# ********************************************
def Kmeans_MStep(X, centroids, labels):
"""
This function performs the Maximization Step of K-means. It computes the new centroids given the current centroid assignment of data points.
K-Means learning is done by iterating the Expectation and Maximization steps until convergence, or until max_iter is reached.
Inputs :
- X : numpy array of the dataset.
- centroids : numpy array of the centroids.
- labels : list or numpy array of the current cluster assignment of each data point.
Output :
- Numpy array of the same shape of centroids.
"""
k = centroids.shape[0]
Newcentroids = centroids
# Compute values for New Centroids
for i in range(k):
if sum(labels==i)>0:
Newcentroids[i, :] = X[labels==i, :].mean(axis=0)
return Newcentroids
# ********************************************
# Make K-means Visualization Data Function
# ********************************************
def make_kmeans_viz_data(X, labels, centroids, clusColorMap):
"""
This function creates the Plotly traces of a given K-means setting (Data points, Centroids, labels of data points).
Inputs :
- X : numpy array of the dataset.
- labels : list or 1D numpy array representing the current cluster assignment of data points.
- centroids : 2D numpy array representing the centroids.
- clusColorMap : a dictionary mapping a centroid number to its desired color.
Output :
- a list of traces to plot using Plotly.
"""
clusLabelColors = list(map(clusColorMap.get, labels))
# Num Clusters
k = centroids.shape[0]
data = []
# IF WE'RE IN THE INITIALIZATION STEP (Coloring changes)
if sum(labels)==-X.shape[0]:
# Data Points
tracePoints = go.Scatter(
x = X[:, 0],
y = X[:, 1],
mode = 'markers',
marker = dict(color='gray', size=10,
line = dict(width=1, color='white')),
name = 'Data Points'
)
data.append(tracePoints)
# Centroid-Point Lines
centroidPoints = X.copy()
for idx,i in enumerate(range(1, 3*centroidPoints.shape[0], 3)):
centroidPoints = np.insert(centroidPoints, i, X.mean(axis=0), axis=0)
centroidPoints = np.insert(centroidPoints, i+1, np.array([None, None]), axis=0)
traceLines = go.Scatter(
x = centroidPoints[:, 0],
y = centroidPoints[:, 1],
mode = 'lines',
line = dict(color='white', width=.5),
name = 'Memberships'
)
data.append(traceLines)
# Centroids
traceCentroid = go.Scatter(
x = centroids[:, 0],
y = centroids[:, 1],
name = 'Centroids',
mode='markers',
marker = dict(color='gray',
size=20, symbol='circle', opacity=.8, line = dict(width=3, color='black'))
)
data.append(traceCentroid)
return data
# ELSE
## FIRST TRACE TYPE : DATA POINTS
tracePoints = go.Scatter(
x = X[:, 0],
y = X[:, 1],
mode = 'markers',
marker = dict(color=[clusColorMap[i] for i in labels], size=10,
line = dict(width=1, color='white')),
name = 'Data Points'
)
data.append(tracePoints)
## SECOND TRACE TYPE : LINES BETWEEN DATA POINTS AND CENTROIDS
# Compute Array with Nones to link centroids to their respective points with lines
centroidPoints = X.copy()
for idx,i in enumerate(range(1, 3*centroidPoints.shape[0], 3)):
centroidPoints = np.insert(centroidPoints, i, centroids[labels[idx], :], axis=0)
centroidPoints = np.insert(centroidPoints, i+1, np.array([None, None]), axis=0)
# Trace Lines
traceLines = go.Scatter(
x = centroidPoints[:, 0],
y = centroidPoints[:, 1],
mode = 'lines',
line = dict(color='white', width=.5),
name = 'Cluster Lines'
)
data.append(traceLines)
## THIRD TRACE TYPE : CENTROIDS
traceCentroid = go.Scatter(
x = centroids[:, 0],
y = centroids[:, 1],
name = 'Centroids',
mode='markers',
marker = dict(color=list(clusColorMap.values()),
size=20, symbol='circle', opacity=.8, line = dict(width=3, color='black'))
)
data.append(traceCentroid)
return data