"""Build a language detector model"""
# Author: Olivier Grisel <[email protected]>
# License: Simplified BSD
import sys
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.feature_extraction.text import CharNGramAnalyzer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.datasets import load_files
from scikits.learn import metrics

#
# New preprocessor better suited for language id than the default
# preprocessor
#

class LowerCasePreprocessor(object):
    """Simple preprocessor that should be available by default"""

    def preprocess(self, unicode_content):
        return unicode_content.lower()

    def __repr__(self):
        return "LowerCasePreprocessor()"

#
# The real code starts here
#

# the training data folder must be passed as first argument
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder)

# split the dataset into a training and a test set:
n_samples_total = dataset.filenames.shape[0]

docs_train = [open(f).read()
              for f in dataset.filenames[:n_samples_total / 2]]
docs_test = [open(f).read()
             for f in dataset.filenames[n_samples_total / 2:]]

y_train = dataset.target[:n_samples_total / 2]
y_test = dataset.target[n_samples_total / 2:]
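
# Note: this straight half split assumes the samples are not ordered by
# language; if load_files does not shuffle them (an assumption to verify on
# your version), a stratified split would be safer.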

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the previous preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)
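
# For intuition, a plain-Python sketch of character n-gram extraction
# (assumed to approximate what CharNGramAnalyzer computes; the exact token
# order may differ from the library's output):
#   char_ngrams_sketch(u'abc') -> [u'a', u'b', u'c', u'ab', u'bc', u'abc']
def char_ngrams_sketch(text, min_n=1, max_n=3):
    text = text.lower()  # mirror LowerCasePreprocessor
    return [text[i:i + n]
            for n in range(min_n, max_n + 1)
            for i in range(len(text) - n + 1)]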

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])
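
# Note: Pipeline exposes each step's parameters under the
# '<step_name>__<param>' naming convention (e.g. 'vec__analyzer', 'clf__C'),
# which is what grid search tools use to tune the whole pipeline at once.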

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)
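
# A quick overall accuracy figure can be computed by hand (sketch; numpy is
# already a dependency of scikits.learn):
# import numpy as np
# print "Accuracy: %0.3f" % np.mean(y_predicted == y_test)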

# Print the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# Uncomment the following lines to plot the confusion matrix as an image:
# import pylab as pl
# pl.matshow(cm)
# pl.show()

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

for s, p in zip(sentences, predicted):
    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])