Skip to content

Commit b179503

Browse files
committed
inversed solution and skeleton
1 parent 7101ddd commit b179503

File tree

2 files changed

+69
-69
lines changed

2 files changed

+69
-69
lines changed

skeletons/exercise_02_language_train_model.py

Lines changed: 34 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -50,45 +50,44 @@ def __repr__(self):
5050

5151
# Build an analyzer that splits strings into sequences of 1 to 3 characters
5252
# after using the previous preprocessor
53-
analyzer = CharNGramAnalyzer(
54-
min_n=1,
55-
max_n=3,
56-
preprocessor=LowerCasePreprocessor(),
57-
)
53+
54+
# TODO
5855

5956
# Build a vectorizer / classifier pipeline using the previous analyzer
60-
clf = Pipeline([
61-
('vec', CountVectorizer(analyzer=analyzer)),
62-
('tfidf', TfidfTransformer()),
63-
('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
64-
])
57+
58+
# TODO: the pipeline instance must be named 'clf'
6559

6660
# Fit the pipeline on the training set
67-
clf.fit(docs_train, y_train)
61+
62+
# TODO
6863

6964
# Predict the outcome on the testing set
70-
y_predicted = clf.predict(docs_test)
71-
72-
# Print the classification report
73-
print metrics.classification_report(y_test, y_predicted,
74-
class_names=dataset.target_names)
75-
76-
# Plot the confusion matrix
77-
cm = metrics.confusion_matrix(y_test, y_predicted)
78-
print cm
79-
80-
# import pylab as pl
81-
#pl.matshow(cm)
82-
#pl.show()
83-
84-
# Predict the result on some short new sentences:
85-
sentences = [
86-
u'This is a language detection test.',
87-
u'Ceci est un test de d\xe9tection de la langue.',
88-
u'Dies ist ein Test, um die Sprache zu erkennen.',
89-
]
90-
predicted = clf.predict(sentences)
91-
92-
for s, p in zip(sentences, predicted):
93-
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
65+
66+
# TODO: the predicted outcome must be named 'y_predicted'
67+
68+
69+
# TODO: uncomment the following once all of the above is implemented
70+
71+
## Print the classification report
72+
#print metrics.classification_report(y_test, y_predicted,
73+
# class_names=dataset.target_names)
74+
#
75+
## Plot the confusion matrix
76+
#cm = metrics.confusion_matrix(y_test, y_predicted)
77+
#print cm
78+
#
79+
## import pylab as pl
80+
##pl.matshow(cm)
81+
##pl.show()
82+
#
83+
## Predict the result on some short new sentences:
84+
#sentences = [
85+
# u'This is a language detection test.',
86+
# u'Ceci est un test de d\xe9tection de la langue.',
87+
# u'Dies ist ein Test, um die Sprache zu erkennen.',
88+
#]
89+
#predicted = clf.predict(sentences)
90+
#
91+
#for s, p in zip(sentences, predicted):
92+
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
9493

solutions/exercise_02_language_train_model.py

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -50,44 +50,45 @@ def __repr__(self):
5050

5151
# Build an analyzer that splits strings into sequences of 1 to 3 characters
5252
# after using the previous preprocessor
53-
54-
# TODO
53+
analyzer = CharNGramAnalyzer(
54+
min_n=1,
55+
max_n=3,
56+
preprocessor=LowerCasePreprocessor(),
57+
)
5558

5659
# Build a vectorizer / classifier pipeline using the previous analyzer
57-
58-
# TODO: the pipeline instance must be named 'clf'
60+
clf = Pipeline([
61+
('vec', CountVectorizer(analyzer=analyzer)),
62+
('tfidf', TfidfTransformer()),
63+
('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
64+
])
5965

6066
# Fit the pipeline on the training set
61-
62-
# TODO
67+
clf.fit(docs_train, y_train)
6368

6469
# Predict the outcome on the testing set
65-
66-
# TODO: the predicted outcome must be named 'y_predicted'
67-
68-
69-
# TODO: uncomment the following once all of the above is implemented
70-
71-
## Print the classification report
72-
#print metrics.classification_report(y_test, y_predicted,
73-
# class_names=dataset.target_names)
74-
#
75-
## Plot the confusion matrix
76-
#cm = metrics.confusion_matrix(y_test, y_predicted)
77-
#print cm
78-
#
79-
## import pylab as pl
80-
##pl.matshow(cm)
81-
##pl.show()
82-
#
83-
## Predict the result on some short new sentences:
84-
#sentences = [
85-
# u'This is a language detection test.',
86-
# u'Ceci est un test de d\xe9tection de la langue.',
87-
# u'Dies ist ein Test, um die Sprache zu erkennen.',
88-
#]
89-
#predicted = clf.predict(sentences)
90-
#
91-
#for s, p in zip(sentences, predicted):
92-
# print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
70+
y_predicted = clf.predict(docs_test)
71+
72+
# Print the classification report
73+
print metrics.classification_report(y_test, y_predicted,
74+
class_names=dataset.target_names)
75+
76+
# Plot the confusion matrix
77+
cm = metrics.confusion_matrix(y_test, y_predicted)
78+
print cm
79+
80+
# import pylab as pl
81+
#pl.matshow(cm)
82+
#pl.show()
83+
84+
# Predict the result on some short new sentences:
85+
sentences = [
86+
u'This is a language detection test.',
87+
u'Ceci est un test de d\xe9tection de la langue.',
88+
u'Dies ist ein Test, um die Sprache zu erkennen.',
89+
]
90+
predicted = clf.predict(sentences)
91+
92+
for s, p in zip(sentences, predicted):
93+
print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
9394

0 commit comments

Comments
 (0)