Swap the solution and skeleton contents, which had been inverted

ogrisel · ogrisel · commit b179503d73ec · 2011-03-10T01:05:58.000-05:00
diff --git a/skeletons/exercise_02_language_train_model.py b/skeletons/exercise_02_language_train_model.py
@@ -50,45 +50,44 @@ def __repr__(self):
 
 # Build a an analyzer that split strings into sequence of 1 to 3 characters
 # after using the previous preprocessor
-analyzer = CharNGramAnalyzer(
-    min_n=1,
-    max_n=3,
-    preprocessor=LowerCasePreprocessor(),
-)
+
+# TODO
 
 # Build a vectorizer / classifier pipeline using the previous analyzer
-clf = Pipeline([
-    ('vec', CountVectorizer(analyzer=analyzer)),
-    ('tfidf', TfidfTransformer()),
-    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
-])
+
+# TODO: the pipeline instance must be named 'clf'
 
 # Fit the pipeline on the training set
-clf.fit(docs_train, y_train)
+
+# TODO
 
 # Predict the outcome on the testing set
-y_predicted = clf.predict(docs_test)
-
-# Print the classification report
-print metrics.classification_report(y_test, y_predicted,
-                                    class_names=dataset.target_names)
-
-# Plot the confusion matrix
-cm = metrics.confusion_matrix(y_test, y_predicted)
-print cm
-
-# import pylab as pl
-#pl.matshow(cm)
-#pl.show()
-
-# Predict the result on some short new sentences:
-sentences = [
-    u'This is a language detection test.',
-    u'Ceci est un test de d\xe9tection de la langue.',
-    u'Dies ist ein Test, um die Sprache zu erkennen.',
-]
-predicted = clf.predict(sentences)
-
-for s, p in zip(sentences, predicted):
-    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
+
+# TODO: the predicted outcome must be named 'y_predicted'
+
+
+# TODO: uncomment the following once all of the above is implemented
+
+## Print the classification report
+#print metrics.classification_report(y_test, y_predicted,
+#                                    class_names=dataset.target_names)
+#
+## Plot the confusion matrix
+#cm = metrics.confusion_matrix(y_test, y_predicted)
+#print cm
+#
+## import pylab as pl
+##pl.matshow(cm)
+##pl.show()
+#
+## Predict the result on some short new sentences:
+#sentences = [
+#    u'This is a language detection test.',
+#    u'Ceci est un test de d\xe9tection de la langue.',
+#    u'Dies ist ein Test, um die Sprache zu erkennen.',
+#]
+#predicted = clf.predict(sentences)
+#
+#for s, p in zip(sentences, predicted):
+#    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
 
diff --git a/solutions/exercise_02_language_train_model.py b/solutions/exercise_02_language_train_model.py
@@ -50,44 +50,45 @@ def __repr__(self):
 
 # Build a an analyzer that split strings into sequence of 1 to 3 characters
 # after using the previous preprocessor
-
-# TODO
+analyzer = CharNGramAnalyzer(
+    min_n=1,
+    max_n=3,
+    preprocessor=LowerCasePreprocessor(),
+)
 
 # Build a vectorizer / classifier pipeline using the previous analyzer
-
-# TODO: the pipeline instance must be named 'clf'
+clf = Pipeline([
+    ('vec', CountVectorizer(analyzer=analyzer)),
+    ('tfidf', TfidfTransformer()),
+    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
+])
 
 # Fit the pipeline on the training set
-
-# TODO
+clf.fit(docs_train, y_train)
 
 # Predict the outcome on the testing set
-
-# TODO: the predicted outcome must be named 'y_predicted'
-
-
-# TODO: uncomment the following once all of the above is implemented
-
-## Print the classification report
-#print metrics.classification_report(y_test, y_predicted,
-#                                    class_names=dataset.target_names)
-#
-## Plot the confusion matrix
-#cm = metrics.confusion_matrix(y_test, y_predicted)
-#print cm
-#
-## import pylab as pl
-##pl.matshow(cm)
-##pl.show()
-#
-## Predict the result on some short new sentences:
-#sentences = [
-#    u'This is a language detection test.',
-#    u'Ceci est un test de d\xe9tection de la langue.',
-#    u'Dies ist ein Test, um die Sprache zu erkennen.',
-#]
-#predicted = clf.predict(sentences)
-#
-#for s, p in zip(sentences, predicted):
-#    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
+y_predicted = clf.predict(docs_test)
+
+# Print the classification report
+print metrics.classification_report(y_test, y_predicted,
+                                    class_names=dataset.target_names)
+
+# Plot the confusion matrix
+cm = metrics.confusion_matrix(y_test, y_predicted)
+print cm
+
+# import pylab as pl
+#pl.matshow(cm)
+#pl.show()
+
+# Predict the result on some short new sentences:
+sentences = [
+    u'This is a language detection test.',
+    u'Ceci est un test de d\xe9tection de la langue.',
+    u'Dies ist ein Test, um die Sprache zu erkennen.',
+]
+predicted = clf.predict(sentences)
+
+for s, p in zip(sentences, predicted):
+    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])