-
Notifications
You must be signed in to change notification settings - Fork 103
/
Copy pathexercise_04_face_recognition.py
127 lines (89 loc) · 3.88 KB
/
exercise_04_face_recognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Face recognition using PCA (eigenfaces) and SVM"""
# License: Simplified BSD
import os
from gzip import GzipFile
import sys
import numpy as np
import pylab as pl
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report
from scikits.learn.metrics import confusion_matrix
from scikits.learn.pca import RandomizedPCA
from scikits.learn.svm import SVC
# Load dataset in memory
folder_name = sys.argv[1]
faces_filename = os.path.join(folder_name, "faces.npy.gz")
filenames_filename = os.path.join(folder_name, "face_filenames.txt")
faces = np.load(GzipFile(faces_filename))
face_filenames = [l.strip() for l in file(filenames_filename).readlines()]
# normalize each picture by centering brightness
faces -= faces.mean(axis=1)[:, np.newaxis]
# Index category names into integers suitable for scikit-learn
#
# Here we do a little dance to convert file names in integer indices
# (class indices in machine learning talk) that are suitable to be used
# as a target for training a classifier. Note the use of an array with
# unique entries to store the relation between class index and name,
# often called a 'Look Up Table' (LUT).
# Also, note the use of 'searchsorted' to convert an array in a set of
# integers given a second array to use as a LUT.
person_names = np.array([f.rsplit('_', 1)[0] for f in face_filenames])
# A unique integer per category
target_names = np.unique(person_names)
# Turn the person_names in their corresponding integer label
target = np.searchsorted(target_names, person_names)
# Subsample the dataset to restrict to the most frequent person_names
selected_target = np.argsort(np.bincount(target))[-5:]
most_frequent_mask = np.array([item in selected_target for item in target])
X = faces[most_frequent_mask]
y = target[most_frequent_mask]
n_samples, n_features = X.shape
print "Dataset size:"
print "n_samples: %d" % n_samples
print "n_features: %d" % n_features
# Split the dataset into a training and test set
split = n_samples * 3 / 4
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
print "Extracting the top %d eigenfaces" % n_components
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
eigenfaces = pca.components_.T.reshape((n_components, 64, 64))
# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# Train a SVM classification model
print "Fitting the classifier to the training set"
param_grid = {
'C': [1, 5, 10, 100],
'gamma': [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
fit_params={'class_weight': 'auto'},
n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator
# Quantitative evaluation of the model quality on the test set
y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target,
class_names=target_names[selected_target])
print confusion_matrix(y_test, y_pred, labels=selected_target)
# Qualitative evaluation of the predictions using matplotlib
n_row = 3
n_col = 4
def title(y_pred, y_test, target_names, i):
pred_name = target_names[y_pred[i]].rsplit('_', 1)[-1]
true_name = target_names[y_test[i]].rsplit('_', 1)[-1]
return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)
for i in range(n_row * n_col):
pl.subplot(n_row, n_col, i + 1)
pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)
pl.title(title(y_pred, y_test, target_names, i), size=12)
pl.xticks(())
pl.yticks(())
pl.show()