@@ -4,9 +4,11 @@
 
 import sys
 
-from scikits.learn.feature_extraction.text.sparse import Vectorizer
+from scikits.learn.feature_extraction.text.sparse import CountVectorizer
+from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
 from scikits.learn.feature_extraction.text import CharNGramAnalyzer
 from scikits.learn.svm.sparse import LinearSVC
+from scikits.learn.pipeline import Pipeline
 from scikits.learn.datasets import load_files
 from scikits.learn import metrics
 
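For reference, the `scikits.learn` namespace was later renamed to `sklearn`, and the sparse text-extraction classes were merged into the main `sklearn.feature_extraction.text` module. A rough present-day equivalent of the imports added above (a sketch against current scikit-learn, not part of this commit) would be:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC
    from sklearn.datasets import load_files
    from sklearn import metrics

Here `TfidfVectorizer` rolls the `CountVectorizer` + `TfidfTransformer` pair into a single step, so separate imports for those two classes are no longer needed.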
@@ -34,61 +36,59 @@ def __repr__(self):
 dataset = load_files(languages_data_folder)
 
 # split the dataset in training and test set:
+n_samples_total = dataset.filenames.shape[0]
 
-# TODO: define variables 'filenames_train' and 'filenames_test'
-# TODO: define variables 'y_train' and 'y_test'
+docs_train = [open(f).read()
+              for f in dataset.filenames[:n_samples_total/2]]
+docs_test = [open(f).read()
+             for f in dataset.filenames[n_samples_total/2:]]
 
 
-# Build a an analyzer that split strings into sequence of 1 to 3 characters
-# using the previous preprocessor
-
-# TODO: define a variable named analyzer
-
-
-# Build a vectorizer using the analyzer, learn the mapping from feature name to
-# feature id on the training data and then transform it into feature vectors.
-# Then use the fitted vectorizer on the test data
+y_train = dataset.target[:n_samples_total/2]
+y_test = dataset.target[n_samples_total/2:]
 
-# TODO: define a variable named 'vectorizer'
-# TODO: define a variable named 'X_train'
-# TODO: define a variable named 'X_test'
 
-# XXX: Don't forget to read the content of the text files before feeding it to
-# the vectorizer
-
-# Build a linear classifier and train it on the training set
-
-# TODO: define a variable named 'clf'
+# Build an analyzer that splits strings into sequences of 1 to 3 characters
+# after using the previous preprocessor
+analyzer = CharNGramAnalyzer(
+    min_n=1,
+    max_n=3,
+    preprocessor=LowerCasePreprocessor(),
+)
+
+# Build a vectorizer / classifier pipeline using the previous analyzer
+clf = Pipeline([
+    ('vec', CountVectorizer(analyzer=analyzer)),
+    ('tfidf', TfidfTransformer()),
+    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
+])
+
+# Fit the pipeline on the training set
+clf.fit(docs_train, y_train)
 
 # Predict the outcome on the testing set
+y_predicted = clf.predict(docs_test)
 
-# TODO: define a variable named 'y_predicted'
+# Print the classification report
+print metrics.classification_report(y_test, y_predicted,
+                                    class_names=dataset.target_names)
 
+# Print the confusion matrix
+cm = metrics.confusion_matrix(y_test, y_predicted)
+print cm
 
-#
-# Evaluation of the quality of the predictions: uncomment the following when all
-# of the above as been implemented
-#
+# import pylab as pl
+#pl.matshow(cm)
+#pl.show()
 
-## Print the classification report
-#
-#print metrics.classification_report(y_test, y_predicted,
-#                                    class_names=dataset.target_names)
-#
-## Print the confusion matrix
-#
-#cm = metrics.confusion_matrix(y_test, y_predicted)
-#print cm
-#
 # Predict the result on some short new sentences:
-#sentences = [
-#    u'This is a language detection test.',
-#    u'Ceci est un test de d\xe9tection de la langue.',
-#    u'Dies ist ein Test, um die Sprache zu erkennen.',
-#]
-#vectors = vectorizer.transform(sentences)
-#predicted = clf.predict(vectors)
-#
-#for s, p in zip(sentences, predicted):
-#    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
+sentences = [
+    u'This is a language detection test.',
+    u'Ceci est un test de d\xe9tection de la langue.',
+    u'Dies ist ein Test, um die Sprache zu erkennen.',
+]
+predicted = clf.predict(sentences)
+
+for s, p in zip(sentences, predicted):
+    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
 
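The completed script above targets the 2010-era `scikits.learn` API (Python 2 `print` statements, `CharNGramAnalyzer`, the `class_names=` keyword). A minimal sketch of the same exercise against a current scikit-learn, assuming the same `languages_data_folder` layout that `load_files` expects (one sub-folder per language):

    from sklearn import metrics
    from sklearn.datasets import load_files
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    dataset = load_files(languages_data_folder)

    # load_files already reads the raw bytes into dataset.data;
    # decode before vectorizing
    docs = [d.decode('utf-8', 'ignore') for d in dataset.data]

    # same 50/50 split as the version above, but shuffled
    docs_train, docs_test, y_train, y_test = train_test_split(
        docs, dataset.target, test_size=0.5, random_state=0)

    # TfidfVectorizer(analyzer='char') subsumes CharNGramAnalyzer,
    # CountVectorizer and TfidfTransformer; lowercase=True plays the
    # role of the LowerCasePreprocessor
    clf = Pipeline([
        ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3),
                                lowercase=True)),
        ('clf', LinearSVC(penalty='l1', dual=False, C=100)),
    ])

    clf.fit(docs_train, y_train)
    y_predicted = clf.predict(docs_test)

    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))
    print(metrics.confusion_matrix(y_test, y_predicted))

Note that `load_files` makes the explicit `open(f).read()` loop unnecessary, and that `class_names=` became `target_names=` in the modern `classification_report` signature.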