Skip to content

Commit 71a65de

Browse files
committed
solution for exercise 04
1 parent c75b74b commit 71a65de

File tree

1 file changed

+127
-0
lines changed

1 file changed

+127
-0
lines changed
+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""Face recognition using PCA (eigenfaces) and SVM"""
2+
# License: Simplified BSD
3+
4+
import os
5+
from gzip import GzipFile
6+
7+
import sys
8+
import numpy as np
9+
import pylab as pl
10+
11+
from scikits.learn.grid_search import GridSearchCV
12+
from scikits.learn.metrics import classification_report
13+
from scikits.learn.metrics import confusion_matrix
14+
from scikits.learn.pca import RandomizedPCA
15+
from scikits.learn.svm import SVC
16+
17+
# Load dataset in memory
#
# The data folder is taken from the first command-line argument.  Two files
# are read from it: "faces.npy.gz" (a gzip-compressed NumPy array holding one
# picture per row) and "face_filenames.txt" (one picture file name per line).
if len(sys.argv) < 2:
    # Fail with a usage message instead of a bare IndexError
    sys.exit("usage: %s <data_folder>" % sys.argv[0])

folder_name = sys.argv[1]
faces_filename = os.path.join(folder_name, "faces.npy.gz")
filenames_filename = os.path.join(folder_name, "face_filenames.txt")

# Both files are closed explicitly once their content is in memory.
# try/finally is used for the gzip stream because GzipFile only became a
# context manager in Python 2.7.
faces_file = GzipFile(faces_filename)
try:
    faces = np.load(faces_file)
finally:
    faces_file.close()

with open(filenames_filename) as names_file:
    face_filenames = [line.strip() for line in names_file]

# normalize each picture by centering brightness: the mean of every row is
# subtracted from that row
# NOTE(review): the in-place subtraction assumes `faces` has a floating
# point dtype -- confirm against the data file
faces -= faces.mean(axis=1)[:, np.newaxis]
27+
28+
29+
# Index category names into integers suitable for scikit-learn
#
# Here we do a little dance to convert file names into integer indices
# (class indices in machine learning talk) that are suitable to be used
# as a target for training a classifier. Note the use of an array with
# unique entries to store the relation between class index and name,
# often called a 'Look Up Table' (LUT).
# Also, note the use of 'searchsorted' to convert an array into a set of
# integers given a second array to use as a LUT.

# The person name is the file name up to (not including) its last '_'
person_names = np.array([f.rsplit('_', 1)[0] for f in face_filenames])

# A unique integer per category: the position of a name in this array of
# distinct names is its class index
target_names = np.unique(person_names)

# Turn the person_names into their corresponding integer label
target = np.searchsorted(target_names, person_names)

# Subsample the dataset to restrict to the most frequent person_names:
# keep the 5 labels with the highest counts
selected_target = np.argsort(np.bincount(target))[-5:]

# Compare every label against every selected label in one broadcast
# operation (shape (n_samples, 1) against (5,)) rather than testing
# membership sample by sample in a Python loop
most_frequent_mask = (target[:, np.newaxis] == selected_target).any(axis=1)

X = faces[most_frequent_mask]
y = target[most_frequent_mask]
52+
53+
n_samples, n_features = X.shape
54+
55+
print "Dataset size:"
56+
print "n_samples: %d" % n_samples
57+
print "n_features: %d" % n_features
58+
59+
# Split the dataset into a training and test set
60+
61+
split = n_samples * 3 / 4
62+
63+
X_train, X_test = X[:split], X[split:]
64+
y_train, y_test = y[:split], y[split:]
65+
66+
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction

n_components = 150

print("Extracting the top %d eigenfaces" % n_components)
pca = RandomizedPCA(n_components=n_components, whiten=True)
pca = pca.fit(X_train)

# View the fitted components as n_components pictures of 64 x 64 pixels
eigenfaces = np.reshape(pca.components_.T, (n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis; only the
# training set was used to fit the projection
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
79+
80+
81+
# Train a SVM classification model
#
# Every (C, gamma) combination of the grid below is tried on an RBF-kernel
# SVC; class_weight='auto' is handed over as a fit parameter.

print("Fitting the classifier to the training set")
param_grid = dict(
    C=[1, 5, 10, 100],
    gamma=[0.0001, 0.001, 0.01, 0.1]
)
clf = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    fit_params=dict(class_weight='auto'),
    n_jobs=-1
)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)
94+
95+
96+
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)

# Both the per-class report and the confusion matrix are restricted to the
# selected labels, the report showing the matching person names
selected_names = target_names[selected_target]
print(classification_report(y_test, y_pred, labels=selected_target,
                            class_names=selected_names))

print(confusion_matrix(y_test, y_pred, labels=selected_target))


# Qualitative evaluation of the predictions using matplotlib

# Layout of the picture gallery: n_row rows of n_col pictures each
n_row, n_col = 3, 4
109+
110+
def title(y_pred, y_test, target_names, i):
    """Return the two-line caption shown above the i-th test picture.

    Parameters
    ----------
    y_pred : sequence of int
        Predicted class index of every test sample.
    y_test : sequence of int
        True class index of every test sample.
    target_names : sequence of str
        Look-up table giving the name of each class index.
    i : int
        Position of the sample in `y_pred` / `y_test`.

    Returns
    -------
    str
        'predicted: <name>\\ntrue: <name>', where each name is shortened to
        the part after its last underscore (the whole name when it contains
        no underscore).
    """
    pred_name = target_names[y_pred[i]].rsplit('_', 1)[-1]
    true_name = target_names[y_test[i]].rsplit('_', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
114+
115+
116+
# Show the first test pictures in an n_row x n_col grid, each captioned with
# its predicted and true name
pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)

# Never index past the end of a test set smaller than the grid
n_shown = min(n_row * n_col, len(X_test))
for i in range(n_shown):
    pl.subplot(n_row, n_col, i + 1)
    # Each sample is a flattened picture, displayed as 64 x 64 pixels
    pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)
    pl.title(title(y_pred, y_test, target_names, i), size=12)
    # Hide the axis ticks: they carry no information for a picture
    pl.xticks(())
    pl.yticks(())

pl.show()
126+
127+

0 commit comments

Comments
 (0)