|
| 1 | +"""Face recognition using PCA (eigenfaces) and SVM""" |
| 2 | +# License: Simplified BSD |
| 3 | + |
| 4 | +import os |
| 5 | +from gzip import GzipFile |
| 6 | + |
| 7 | +import sys |
| 8 | +import numpy as np |
| 9 | +import pylab as pl |
| 10 | + |
| 11 | +from scikits.learn.grid_search import GridSearchCV |
| 12 | +from scikits.learn.metrics import classification_report |
| 13 | +from scikits.learn.metrics import confusion_matrix |
| 14 | +from scikits.learn.pca import RandomizedPCA |
| 15 | +from scikits.learn.svm import SVC |
| 16 | + |
# Load dataset in memory
#
# The script takes one command-line argument: the folder that holds
# "faces.npy.gz" (a gzip-compressed numpy array with one row per picture)
# and "face_filenames.txt" (one file name per line).
if len(sys.argv) < 2:
    # Fail with a usage message instead of a bare IndexError
    sys.exit("usage: %s <dataset_folder>" % sys.argv[0])

folder_name = sys.argv[1]
faces_filename = os.path.join(folder_name, "faces.npy.gz")
filenames_filename = os.path.join(folder_name, "face_filenames.txt")

# Both handles are closed explicitly instead of being leaked. try/finally
# is used rather than `with` so the script keeps working on interpreters
# where GzipFile is not a context manager.
faces_file = GzipFile(faces_filename)
try:
    faces = np.load(faces_file)
finally:
    faces_file.close()

names_file = open(filenames_filename)
try:
    face_filenames = [line.strip() for line in names_file]
finally:
    names_file.close()

# normalize each picture by centering brightness
# NOTE(review): the in-place subtraction assumes `faces` has a floating
# point dtype -- TODO confirm against the dataset file.
faces -= faces.mean(axis=1)[:, np.newaxis]
| 27 | + |
| 28 | + |
# Index category names into integers suitable for scikit-learn
#
# Here we do a little dance to convert file names into integer indices
# (class indices in machine learning talk) that are suitable to be used
# as a target for training a classifier. Note the use of an array with
# unique entries to store the relation between class index and name,
# often called a 'Look Up Table' (LUT).
# Also, note the use of 'searchsorted' to convert an array into a set of
# integers given a second array to use as a LUT.

# The person name is everything before the last '_' of the file name
person_names = np.array([f.rsplit('_', 1)[0] for f in face_filenames])

# Sorted unique names: the position of a name in this array is its label
target_names = np.unique(person_names)

# Turn the person_names into their corresponding integer label
target = np.searchsorted(target_names, person_names)

# Subsample the dataset to restrict to the 5 most frequent person_names
selected_target = np.argsort(np.bincount(target))[-5:]

# Vectorized membership test; yields the same boolean mask as checking
# `item in selected_target` for every item of `target` in a Python loop.
most_frequent_mask = np.in1d(target, selected_target)

X = faces[most_frequent_mask]
y = target[most_frequent_mask]
| 52 | + |
n_samples, n_features = X.shape

print("Dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)

# Split the dataset into a training and test set (first 3/4 / last 1/4)
#
# Floor division keeps `split` an integer, so it stays a valid slice bound
# even when true division is in effect.
# NOTE(review): the split is positional (no shuffling); if the rows are
# grouped by person the test set may not cover every class -- verify the
# ordering of the dataset.
split = n_samples * 3 // 4

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
| 65 | + |
# Unsupervised feature extraction / dimensionality reduction: fit a PCA
# (eigenfaces) on the training pictures, treated as an unlabeled dataset.

n_components = 150

print("Extracting the top %d eigenfaces" % n_components)
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

# One 64x64 image per component.
# NOTE(review): the transpose assumes `components_` is laid out as
# (n_features, n_components) in this scikits.learn version -- confirm.
eigenface_shape = (n_components, 64, 64)
eigenfaces = pca.components_.T.reshape(eigenface_shape)

# Project both splits on the eigenfaces basis learned from the training set
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
| 79 | + |
| 80 | + |
# Train a SVM classification model
#
# An RBF-kernel SVC is tuned by grid search over C and gamma on the
# PCA-projected training set.

print("Fitting the classifier to the training set")
param_grid = dict(
    C=[1, 5, 10, 100],
    gamma=[0.0001, 0.001, 0.01, 0.1]
)
grid_search = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    fit_params={'class_weight': 'auto'},
    n_jobs=-1)
# Rebind `clf` to whatever fit() returns, exactly as the script did before
clf = grid_search
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)
| 94 | + |
| 95 | + |
# Quantitative evaluation of the model quality on the test set:
# print a per-class report followed by the confusion matrix, both
# restricted to the selected labels.

y_pred = clf.predict(X_test_pca)

report = classification_report(
    y_test, y_pred,
    labels=selected_target,
    class_names=target_names[selected_target])
print(report)

confusion = confusion_matrix(y_test, y_pred, labels=selected_target)
print(confusion)
| 103 | + |
| 104 | + |
| 105 | +# Qualitative evaluation of the predictions using matplotlib |
| 106 | + |
# Layout of the gallery of test pictures (rows x columns)
n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    """Return the two-line plot caption for test sample ``i``.

    ``y_pred`` and ``y_test`` are indexable sequences of integer labels and
    ``target_names`` maps a label to a name. Only the part of each name
    after its last underscore is shown, predicted name first.
    """
    shown = tuple(target_names[labels[i]].rsplit('_', 1)[-1]
                  for labels in (y_pred, y_test))
    return 'predicted: %s\ntrue: %s' % shown
| 114 | + |
| 115 | + |
# Show the first test pictures in an n_row x n_col grid, each captioned
# with its predicted and true name.

# Never ask for more pictures than the test set holds; indexing past the
# end of X_test would raise an IndexError on small datasets.
n_plots = min(n_row * n_col, len(X_test))

pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)
for i in range(n_plots):
    pl.subplot(n_row, n_col, i + 1)
    # Each sample is reshaped to a 64x64 image for display
    pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)
    pl.title(title(y_pred, y_test, target_names, i), size=12)
    # Hide the axis ticks
    pl.xticks(())
    pl.yticks(())

pl.show()
| 126 | + |
| 127 | + |
0 commit comments