Commit 4c6ffb1

python 3
1 parent 794ae3f commit 4c6ffb1

9 files changed: +140 −75 lines changed
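The commit's substance is the same in every file: prepend a Python 2/3 compatibility header (from the future package) and apply the mechanical Python 3 rewrites, namely xrange to range, print statements to print() calls, dict.iteritems() to dict.items(), and / to // wherever an integer result is required. A minimal sketch of the Python 3 semantics behind each rewrite (the values are illustrative):

    n = 7
    print(n / 2)    # 3.5: "/" is true division in Python 3, even on ints
    print(n // 2)   # 3: floor division, what Python 2's "/" did on ints

    # xrange() no longer exists; range() is already lazy in Python 3
    total = sum(i for i in range(n))
    print(total)    # 21

    # dict.iteritems() no longer exists; items() returns a view
    d = {'a': 1, 'b': 2}
    for key, value in d.items():
        print(key, "->", value)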

Diff for: unsupervised_class/books.py

+29 −19

@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import networkx as nx
 import nltk
 import numpy as np

@@ -38,9 +45,12 @@ def my_tokenizer(s):
 all_tokens = []
 all_titles = []
 index_word_map = []
+print("num titles:", len(titles))
+print("first title:", titles[0])
 for title in titles:
     try:
         title = title.encode('ascii', 'ignore') # this will throw exception if bad characters
+        title = title.decode('utf-8')
         all_titles.append(title)
         tokens = my_tokenizer(title)
         all_tokens.append(tokens)

@@ -49,8 +59,8 @@ def my_tokenizer(s):
                 word_index_map[token] = current_index
                 current_index += 1
                 index_word_map.append(token)
-    except:
-        pass
+    except Exception as e:
+        print(e)

@@ -76,9 +86,9 @@ def d(u, v):
 
 def cost(X, R, M):
     cost = 0
-    for k in xrange(len(M)):
+    for k in range(len(M)):
         # method 1
-        # for n in xrange(len(X)):
+        # for n in range(len(X)):
         #     cost += R[n,k]*d(M[k], X[n])
 
         # method 2

@@ -94,22 +104,22 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
     exponents = np.empty((N, K))
 
     # initialize M to random
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
 
     costs = np.zeros(max_iter)
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
         # is this inefficient?
-        for k in xrange(K):
-            for n in xrange(N):
-                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in xrange(K) )
+        for k in range(K):
+            for n in range(N):
+                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in range(K) )
                 exponents[n,k] = np.exp(-beta*d(M[k], X[n]))
 
         R = exponents / exponents.sum(axis=1, keepdims=True)
 
         # step 2: recalculate means
-        for k in xrange(K):
+        for k in range(K):
             M[k] = R[:,k].dot(X) / R[:,k].sum()
 
         costs[i] = cost(X, R, M)

@@ -135,16 +145,16 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
     hard_responsibilities = np.argmax(R, axis=1) # is an N-size array of cluster identities
     # let's "reverse" the order so it's cluster identity -> word index
     cluster2word = {}
-    for i in xrange(len(hard_responsibilities)):
+    for i in range(len(hard_responsibilities)):
         word = index_word_map[i]
         cluster = hard_responsibilities[i]
         if cluster not in cluster2word:
             cluster2word[cluster] = []
         cluster2word[cluster].append(word)
 
     # print out the words grouped by cluster
-    for cluster, wordlist in cluster2word.iteritems():
-        print "cluster", cluster, "->", wordlist
+    for cluster, wordlist in cluster2word.items():
+        print("cluster", cluster, "->", wordlist)
 
     return M, R
 

@@ -155,7 +165,7 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
     # G = nx.DiGraph()
     # data_nodes = []
     # init_pos = {}
-    # for i in xrange(N):
+    # for i in range(N):
     #     x, y = X[i]
     #     label = index_word_map[i]
     #     data_str = 'data_{0}'.format(label)

@@ -197,15 +207,15 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
 def annotate1(X, index_word_map, eps=0.1):
     N, D = X.shape
     placed = np.empty((N, D))
-    for i in xrange(N):
+    for i in range(N):
         x, y = X[i]
 
         # if x, y is too close to something already plotted, move it
         close = []
 
         x, y = X[i]
-        for retry in xrange(3):
-            for j in xrange(i):
+        for retry in range(3):
+            for j in range(i):
                 diff = np.array([x, y]) - placed[j]
 
                 # if something is close, append it to the close list

@@ -233,11 +243,11 @@ def annotate1(X, index_word_map, eps=0.1):
     }
 )
 
-print "vocab size:", current_index
+print("vocab size:", current_index)
 
 transformer = TfidfTransformer()
 X = transformer.fit_transform(X).toarray()
 
 reducer = TSNE()
 Z = reducer.fit_transform(X)
-plot_k_means(Z[:,:2], current_index/10, index_word_map, show_plots=True)
+plot_k_means(Z[:,:2], current_index//10, index_word_map, show_plots=True)
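The last change above is behavioral, not cosmetic: plot_k_means sizes its arrays from K, and under Python 3 current_index/10 would pass a float. A quick check (the vocabulary size is hypothetical):

    import numpy as np

    K = 210 / 10           # 21.0, a float under Python 3 true division
    # np.zeros((K, 2))     # raises TypeError: 'float' object cannot be interpreted as an integer
    K = 210 // 10          # 21, an int, safe to use as an array dimension
    print(np.zeros((K, 2)).shape)   # (21, 2)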

Diff for: unsupervised_class/choose_k.py

+8 −1

@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from kmeans import plot_k_means, get_simple_data, cost

@@ -13,7 +20,7 @@ def main():
 
     costs = np.empty(10)
     costs[0] = None
-    for k in xrange(1, 10):
+    for k in range(1, 10):
         M, R = plot_k_means(X, k, show_plots=False)
         c = cost(X, R, M)
         costs[k] = c
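A side note on costs[0] = None: assigning None into a float array stores nan, so the k=0 slot is simply left out when the costs are plotted (assuming the rest of main() plots them, as the matplotlib import suggests):

    import numpy as np

    costs = np.empty(10)
    costs[0] = None       # stored as nan in a float64 array
    print(costs[0])       # nan; matplotlib omits nan points from a line plot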

Diff for: unsupervised_class/gmm.py

+20 −13

@@ -1,46 +1,53 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 
 from scipy.stats import multivariate_normal
 
 
-def gmm(X, K, max_iter=20, smoothing=10e-3):
+def gmm(X, K, max_iter=20, smoothing=1e-2):
     N, D = X.shape
     M = np.zeros((K, D))
     R = np.zeros((N, K))
     C = np.zeros((K, D, D))
     pi = np.ones(K) / K # uniform
 
     # initialize M to random, initialize C to spherical with variance 1
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
         C[k] = np.eye(D)
 
     costs = np.zeros(max_iter)
     weighted_pdfs = np.zeros((N, K)) # we'll use these to store the PDF value of sample n and Gaussian k
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
-        for k in xrange(K):
-            for n in xrange(N):
+        for k in range(K):
+            for n in range(N):
                 weighted_pdfs[n,k] = pi[k]*multivariate_normal.pdf(X[n], M[k], C[k])
 
-        for k in xrange(K):
-            for n in xrange(N):
+        for k in range(K):
+            for n in range(N):
                 R[n,k] = weighted_pdfs[n,k] / weighted_pdfs[n,:].sum()
 
         # a faster way to do step 1: "vectorization"
-        # for k in xrange(K):
+        # for k in range(K):
         #     weighted_pdfs[:,k] = pi[k]*multivariate_normal.pdf(X, M[k], C[k])
         # R = weighted_pdfs / weighted_pdfs.sum(axis=1, keepdims=True)
 
         # step 2: recalculate params
-        for k in xrange(K):
+        for k in range(K):
             Nk = R[:,k].sum()
             pi[k] = Nk / N
             M[k] = R[:,k].dot(X) / Nk
-            C[k] = np.sum(R[n,k]*np.outer(X[n] - M[k], X[n] - M[k]) for n in xrange(N)) / Nk + np.eye(D)*smoothing
+            C[k] = np.sum(R[n,k]*np.outer(X[n] - M[k], X[n] - M[k]) for n in range(N)) / Nk + np.eye(D)*smoothing
 
 
         costs[i] = np.log(weighted_pdfs.sum(axis=1)).sum()

@@ -57,9 +64,9 @@ def gmm(X, K, max_iter=20, smoothing=10e-3):
     plt.scatter(X[:,0], X[:,1], c=colors)
     plt.show()
 
-    print "pi:", pi
-    print "means:", M
-    print "covariances:", C
+    print("pi:", pi)
+    print("means:", M)
+    print("covariances:", C)
     return R
 
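Two details in this file are easy to misread. First, smoothing=10e-3 and smoothing=1e-2 are the same value (0.01); only the notation changed. Second, the covariance update still passes a generator to np.sum, which newer NumPy versions deprecate. A vectorized equivalent of that update (a sketch with illustrative shapes, not part of the commit):

    import numpy as np

    # illustrative shapes; in gmm.py these come from the EM loop
    N, K, D, smoothing = 100, 3, 2, 1e-2
    X = np.random.randn(N, D)
    M = np.random.randn(K, D)
    R = np.random.rand(N, K)
    R /= R.sum(axis=1, keepdims=True)
    C = np.zeros((K, D, D))

    for k in range(K):
        Nk = R[:, k].sum()
        delta = X - M[k]   # (N, D) deviations from mean k
        # same as sum over n of R[n,k] * outer(delta[n], delta[n])
        C[k] = (R[:, k] * delta.T).dot(delta) / Nk + np.eye(D) * smoothing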

Diff for: unsupervised_class/gmm_mnist.py

+9 −3

@@ -5,6 +5,12 @@
 # each image is a D = 28x28 = 784 dimensional vector
 # there are N = 42000 samples
 # you can plot an image by reshaping to (28,28) and using plt.imshow()
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import numpy as np
 import pandas as pd

@@ -18,15 +24,15 @@
 
 def main():
     X, Y = get_data(10000)
-    print "Number of data points:", len(Y)
+    print("Number of data points:", len(Y))
 
     model = GaussianMixture(n_components=10)
     model.fit(X)
     M = model.means_
     R = model.predict_proba(X)
 
-    print "Purity:", purity(Y, R) # max is 1, higher is better
-    print "DBI:", DBI(X, M, R) # lower is better
+    print("Purity:", purity(Y, R)) # max is 1, higher is better
+    print("DBI:", DBI(X, M, R)) # lower is better
 
 
 if __name__ == "__main__":
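purity and DBI are course helpers whose definitions are not part of this diff; R here is the (N, K) matrix of soft responsibilities from predict_proba. For orientation, a common hard-assignment version of purity looks like the sketch below; treat it as illustrative, not as the repo's exact implementation:

    import numpy as np

    def purity_sketch(Y, R):
        # harden soft responsibilities, then credit each cluster with its
        # most common true label; the result is in (0, 1], higher is better
        assignments = np.argmax(R, axis=1)
        total = 0
        for k in np.unique(assignments):
            labels = Y[assignments == k].astype(int)
            total += np.bincount(labels).max()
        return total / len(Y)

    Y = np.array([0, 0, 1, 1])
    R = np.array([[0.9, 0.1], [0.8, 0.2], [0.2, 0.8], [0.4, 0.6]])
    print(purity_sketch(Y, R))   # 1.0 on this perfectly separated toy example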

Diff for: unsupervised_class/hcluster.py

+8 −1

@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 

@@ -20,7 +27,7 @@ def main():
     X[600:, :] = np.random.randn(300, D) + mu3
 
     Z = linkage(X, 'ward')
-    print "Z.shape:", Z.shape
+    print("Z.shape:", Z.shape)
     # Z has the format [idx1, idx2, dist, sample_count]
     # therefore, its size will be (N-1, 4)
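Given the Z format described in the comments (one merge per row), the usual next step is a dendrogram; this sketch assumes main() goes on to plot one, which the imports suggest:

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.cluster.hierarchy import linkage, dendrogram

    X = np.random.randn(30, 2)   # illustrative data
    Z = linkage(X, 'ward')       # same call as in the diff; Z has shape (N-1, 4)
    dendrogram(Z)                # each row of Z becomes one junction in the tree
    plt.show()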

Diff for: unsupervised_class/kmeans.py

+16 −9

@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 

@@ -11,9 +18,9 @@ def d(u, v):
 
 def cost(X, R, M):
     cost = 0
-    for k in xrange(len(M)):
+    for k in range(len(M)):
         # method 1
-        # for n in xrange(len(X)):
+        # for n in range(len(X)):
         #     cost += R[n,k]*d(M[k], X[n])
 
         # method 2

@@ -30,28 +37,28 @@ def plot_k_means(X, K, max_iter=20, beta=1.0, show_plots=True):
     exponents = np.empty((N, K))
 
     # initialize M to random
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
 
     costs = np.zeros(max_iter)
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
         # is this inefficient?
-        for k in xrange(K):
-            for n in xrange(N):
-                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in xrange(K) )
+        for k in range(K):
+            for n in range(N):
+                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in range(K) )
                 exponents[n,k] = np.exp(-beta*d(M[k], X[n]))
 
         R = exponents / exponents.sum(axis=1, keepdims=True)
         # assert(np.abs(R - R2).sum() < 10e-10)
 
         # step 2: recalculate means
-        for k in xrange(K):
+        for k in range(K):
             M[k] = R[:,k].dot(X) / R[:,k].sum()
 
         costs[i] = cost(X, R, M)
         if i > 0:
-            if np.abs(costs[i] - costs[i-1]) < 10e-5:
+            if np.abs(costs[i] - costs[i-1]) < 1e-5:
                 break
 
     if show_plots:
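Unlike the smoothing default in gmm.py, the stopping threshold here really changes value: 10e-5 is 1e-4, so the new 1e-5 criterion is ten times stricter. Separately, the loop marked "is this inefficient?" can be vectorized; a sketch of an alternative (not what the commit does):

    import numpy as np
    from scipy.spatial.distance import cdist

    N, K, D, beta = 100, 3, 2, 1.0
    X = np.random.randn(N, D)
    M = X[np.random.choice(N, K, replace=False)]

    sqdists = cdist(X, M, 'sqeuclidean')   # (N, K) pairwise squared distances in one call
    exponents = np.exp(-beta * sqdists)
    R = exponents / exponents.sum(axis=1, keepdims=True)   # soft assignments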

Diff for: unsupervised_class/kmeans_fail.py

+11 −4

@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from kmeans import plot_k_means
 

@@ -13,12 +20,12 @@ def donut():
 
     # distance from origin is radius + random normal
     # angle theta is uniformly distributed between (0, 2pi)
-    R1 = np.random.randn(N/2) + R_inner
-    theta = 2*np.pi*np.random.random(N/2)
+    R1 = np.random.randn(N//2) + R_inner
+    theta = 2*np.pi*np.random.random(N//2)
     X_inner = np.concatenate([[R1 * np.cos(theta)], [R1 * np.sin(theta)]]).T
 
-    R2 = np.random.randn(N/2) + R_outer
-    theta = 2*np.pi*np.random.random(N/2)
+    R2 = np.random.randn(N//2) + R_outer
+    theta = 2*np.pi*np.random.random(N//2)
     X_outer = np.concatenate([[R2 * np.cos(theta)], [R2 * np.sin(theta)]]).T
 
     X = np.concatenate([ X_inner, X_outer ])
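The N//2 edits here are required for the code to run at all: under Python 3, N/2 is a float even when N is even, and NumPy's sampling functions reject float sizes. A quick check:

    import numpy as np

    N = 1000
    # np.random.randn(N / 2)        # TypeError under Python 3: size must be an integer
    samples = np.random.randn(N // 2)
    print(samples.shape)            # (500,)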
