Skip to content

Commit 7101ddd

Browse files
committed
use the pipeline in exercise 02
1 parent 19b1256 commit 7101ddd

File tree

2 files changed

+89
-88
lines changed

2 files changed

+89
-88
lines changed

skeletons/exercise_02_language_train_model.py

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44

55
import sys
66

7-
from scikits.learn.feature_extraction.text.sparse import Vectorizer
7+
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
8+
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
89
from scikits.learn.feature_extraction.text import CharNGramAnalyzer
910
from scikits.learn.svm.sparse import LinearSVC
11+
from scikits.learn.pipeline import Pipeline
1012
from scikits.learn.datasets import load_files
1113
from scikits.learn import metrics
1214

@@ -34,61 +36,59 @@ def __repr__(self):
3436
dataset = load_files(languages_data_folder)
3537

3638
# split the dataset in training and test set:
39+
n_samples_total = dataset.filenames.shape[0]
3740

38-
# TODO: define variables 'filenames_train' and 'filenames_test'
39-
# TODO: define variables 'y_train' and 'y_test'
41+
docs_train = [open(f).read()
42+
for f in dataset.filenames[:n_samples_total/2]]
43+
docs_test = [open(f).read()
44+
for f in dataset.filenames[n_samples_total/2:]]
4045

4146

42-
# Build an analyzer that splits strings into sequences of 1 to 3 characters
43-
# using the previous preprocessor
44-
45-
# TODO: define a variable named analyzer
46-
47-
48-
# Build a vectorizer using the analyzer, learn the mapping from feature name to
49-
# feature id on the training data and then transform it into feature vectors.
50-
# Then use the fitted vectorizer on the test data
47+
y_train = dataset.target[:n_samples_total/2]
48+
y_test = dataset.target[n_samples_total/2:]
5149

52-
# TODO: define a variable named 'vectorizer'
53-
# TODO: define a variable named 'X_train'
54-
# TODO: define a variable named 'X_test'
5550

56-
# XXX: Don't forget to read the content of the text files before feeding it to
57-
# the vectorizer
58-
59-
# Build a linear classifier and train it on the training set
60-
61-
# TODO: define a variable named 'clf'
51+
# Build an analyzer that splits strings into sequences of 1 to 3 characters
52+
# after using the previous preprocessor
53+
analyzer = CharNGramAnalyzer(
54+
min_n=1,
55+
max_n=3,
56+
preprocessor=LowerCasePreprocessor(),
57+
)
58+
59+
# Build a vectorizer / classifier pipeline using the previous analyzer
60+
clf = Pipeline([
61+
('vec', CountVectorizer(analyzer=analyzer)),
62+
('tfidf', TfidfTransformer()),
63+
('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
64+
])
65+
66+
# Fit the pipeline on the training set
67+
clf.fit(docs_train, y_train)
6268

6369
# Predict the outcome on the testing set
70+
y_predicted = clf.predict(docs_test)
6471

65-
# TODO: define a variable named 'y_predicted'
72+
# Print the classification report
73+
print metrics.classification_report(y_test, y_predicted,
74+
class_names=dataset.target_names)
6675

76+
# Plot the confusion matrix
77+
cm = metrics.confusion_matrix(y_test, y_predicted)
78+
print cm
6779

68-
#
69-
# Evaluation of the quality of the predictions: uncomment the following when all
70-
# of the above has been implemented
71-
#
80+
# import pylab as pl
81+
#pl.matshow(cm)
82+
#pl.show()
7283

73-
## Print the classification report
74-
#
75-
#print metrics.classification_report(y_test, y_predicted,
76-
# class_names=dataset.target_names)
77-
#
78-
## Print the confusion matrix
79-
#
80-
#cm = metrics.confusion_matrix(y_test, y_predicted)
81-
#print cm
82-
#
8384
# Predict the result on some short new sentences:
84-
#sentences = [
85-
# u'This is a language detection test.',
86-
# u'Ceci est un test de d\xe9tection de la langue.',
87-
# u'Dies ist ein Test, um die Sprache zu erkennen.',
88-
#]
89-
#vectors = vectorizer.transform(sentences)
90-
#predicted = clf.predict(vectors)
91-
#
92-
#for s, p in zip(sentences, predicted):
93-
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
85+
sentences = [
86+
u'This is a language detection test.',
87+
u'Ceci est un test de d\xe9tection de la langue.',
88+
u'Dies ist ein Test, um die Sprache zu erkennen.',
89+
]
90+
predicted = clf.predict(sentences)
91+
92+
for s, p in zip(sentences, predicted):
93+
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
9494

solutions/exercise_02_language_train_model.py

Lines changed: 43 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44

55
import sys
66

7-
from scikits.learn.feature_extraction.text.sparse import Vectorizer
7+
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
8+
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
89
from scikits.learn.feature_extraction.text import CharNGramAnalyzer
910
from scikits.learn.svm.sparse import LinearSVC
11+
from scikits.learn.pipeline import Pipeline
1012
from scikits.learn.datasets import load_files
1113
from scikits.learn import metrics
1214

@@ -36,57 +38,56 @@ def __repr__(self):
3638
# split the dataset in training and test set:
3739
n_samples_total = dataset.filenames.shape[0]
3840

39-
filenames_train = dataset.filenames[:n_samples_total/2]
40-
filenames_test = dataset.filenames[n_samples_total/2:]
41+
docs_train = [open(f).read()
42+
for f in dataset.filenames[:n_samples_total/2]]
43+
docs_test = [open(f).read()
44+
for f in dataset.filenames[n_samples_total/2:]]
45+
4146

4247
y_train = dataset.target[:n_samples_total/2]
4348
y_test = dataset.target[n_samples_total/2:]
4449

4550

4651
# Build an analyzer that splits strings into sequences of 1 to 3 characters
4752
# after using the previous preprocessor
48-
analyzer = CharNGramAnalyzer(
49-
min_n=1,
50-
max_n=3,
51-
preprocessor=LowerCasePreprocessor(),
52-
)
5353

54-
# Build a vectorizer using the analyzer, learn the mapping from feature name to
55-
# feature id on the training data while transforming it. Then use the fitted
56-
# vectorizer on the test data
57-
vectorizer = Vectorizer(analyzer=analyzer, use_idf=False)
54+
# TODO
55+
56+
# Build a vectorizer / classifier pipeline using the previous analyzer
57+
58+
# TODO: the pipeline instance must be named 'clf'
5859

59-
X_train = vectorizer.fit_transform((open(f) for f in filenames_train))
60-
X_test = vectorizer.transform((open(f) for f in filenames_test))
60+
# Fit the pipeline on the training set
6161

62-
# Build a linear classifier and train it on the training set
63-
clf = LinearSVC(loss='l2', penalty='l1', dual=False, C=100)
64-
clf.fit(X_train, y_train)
62+
# TODO
6563

6664
# Predict the outcome on the testing set
67-
y_predicted = clf.predict(X_test)
68-
69-
# Print the classification report
70-
print metrics.classification_report(y_test, y_predicted,
71-
class_names=dataset.target_names)
72-
73-
# Plot the confusion matrix
74-
cm = metrics.confusion_matrix(y_test, y_predicted)
75-
print cm
76-
77-
# import pylab as pl
78-
#pl.matshow(cm)
79-
#pl.show()
80-
81-
# Predict the result on some short new sentences:
82-
sentences = [
83-
u'This is a language detection test.',
84-
u'Ceci est un test de d\xe9tection de la langue.',
85-
u'Dies ist ein Test, um die Sprache zu erkennen.',
86-
]
87-
vectors = vectorizer.transform(sentences)
88-
predicted = clf.predict(vectors)
89-
90-
for s, p in zip(sentences, predicted):
91-
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
65+
66+
# TODO: the predicted outcome must be named 'y_predicted'
67+
68+
69+
# TODO: uncomment the following once all of the above is implemented
70+
71+
## Print the classification report
72+
#print metrics.classification_report(y_test, y_predicted,
73+
# class_names=dataset.target_names)
74+
#
75+
## Plot the confusion matrix
76+
#cm = metrics.confusion_matrix(y_test, y_predicted)
77+
#print cm
78+
#
79+
## import pylab as pl
80+
##pl.matshow(cm)
81+
##pl.show()
82+
#
83+
## Predict the result on some short new sentences:
84+
#sentences = [
85+
# u'This is a language detection test.',
86+
# u'Ceci est un test de d\xe9tection de la langue.',
87+
# u'Dies ist ein Test, um die Sprache zu erkennen.',
88+
#]
89+
#predicted = clf.predict(sentences)
90+
#
91+
#for s, p in zip(sentences, predicted):
92+
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
9293

0 commit comments

Comments
 (0)