@@ -50,45 +50,44 @@ def __repr__(self):
50
50
51
51
# Build an analyzer that splits strings into sequences of 1 to 3 characters
52
52
# after using the previous preprocessor
53
- analyzer = CharNGramAnalyzer (
54
- min_n = 1 ,
55
- max_n = 3 ,
56
- preprocessor = LowerCasePreprocessor (),
57
- )
53
+
54
+ # TODO
58
55
59
56
# Build a vectorizer / classifier pipeline using the previous analyzer
60
- clf = Pipeline ([
61
- ('vec' , CountVectorizer (analyzer = analyzer )),
62
- ('tfidf' , TfidfTransformer ()),
63
- ('clf' , LinearSVC (loss = 'l2' , penalty = 'l1' , dual = False , C = 100 )),
64
- ])
57
+
58
+ # TODO: the pipeline instance must be named 'clf'
65
59
66
60
# Fit the pipeline on the training set
67
- clf .fit (docs_train , y_train )
61
+
62
+ # TODO
68
63
69
64
# Predict the outcome on the testing set
70
- y_predicted = clf .predict (docs_test )
71
-
72
- # Print the classification report
73
- print metrics .classification_report (y_test , y_predicted ,
74
- class_names = dataset .target_names )
75
-
76
- # Plot the confusion matrix
77
- cm = metrics .confusion_matrix (y_test , y_predicted )
78
- print cm
79
-
80
- # import pylab as pl
81
- #pl.matshow(cm)
82
- #pl.show()
83
-
84
- # Predict the result on some short new sentences:
85
- sentences = [
86
- u'This is a language detection test.' ,
87
- u'Ceci est un test de d\xe9 tection de la langue.' ,
88
- u'Dies ist ein Test, um die Sprache zu erkennen.' ,
89
- ]
90
- predicted = clf .predict (sentences )
91
-
92
- for s , p in zip (sentences , predicted ):
93
- print u'The language of "%s" is "%s"' % (s , dataset .target_names [p ])
65
+
66
+ # TODO: the predicted outcome must be named 'y_predicted'
67
+
68
+
69
+ # TODO: uncomment the following once all of the above is implemented
70
+
71
+ ## Print the classification report
72
+ #print metrics.classification_report(y_test, y_predicted,
73
+ # class_names=dataset.target_names)
74
+ #
75
+ ## Plot the confusion matrix
76
+ #cm = metrics.confusion_matrix(y_test, y_predicted)
77
+ #print cm
78
+ #
79
+ ## import pylab as pl
80
+ ##pl.matshow(cm)
81
+ ##pl.show()
82
+ #
83
+ ## Predict the result on some short new sentences:
84
+ #sentences = [
85
+ # u'This is a language detection test.',
86
+ # u'Ceci est un test de d\xe9tection de la langue.',
87
+ # u'Dies ist ein Test, um die Sprache zu erkennen.',
88
+ #]
89
+ #predicted = clf.predict(sentences)
90
+ #
91
+ #for s, p in zip(sentences, predicted):
92
+ # print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])
94
93
0 commit comments