
Commit 1556aaf

sentiment analysis
1 parent b179503 commit 1556aaf

File tree

1 file changed: +67 -0 lines changed


solutions/exercise_01_sentiment.py

+67
@@ -0,0 +1,67 @@
"""Build a sentiment analysis / polarity model"""
# Author: Olivier Grisel <[email protected]>
# License: Simplified BSD

import sys
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.datasets import load_files
from scikits.learn import metrics

#
# The real code starts here
#


# the training data folder must be passed as first argument
movie_reviews_data_folder = sys.argv[1]
dataset = load_files(movie_reviews_data_folder)

# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

split = (n_samples_total * 3) / 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# import pylab as pl
#pl.matshow(cm)
#pl.show()
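
Note on the committed script: it targets the old scikits.learn package namespace and Python 2 (print statements, integer division for the 3/4 split), and it expects the movie review dataset folder as its only command line argument, e.g. python solutions/exercise_01_sentiment.py <path-to-movie-reviews>. For readers following along with a current scikit-learn release, the sketch below shows roughly how the same pipeline could be expressed today. It is an illustration under stated assumptions, not part of this commit: TfidfVectorizer stands in for the CountVectorizer + TfidfTransformer pair, train_test_split replaces the manual 3/4 slice, and ngram_range replaces the old analyzer__max_n parameter.

# Hypothetical modern equivalent of the committed script; not part of this commit.
import sys

from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

if __name__ == "__main__":
    # The training data folder is still passed as the first argument;
    # load_files decodes the text files directly when an encoding is given.
    dataset = load_files(sys.argv[1], encoding="utf-8", decode_error="replace")

    # Hold out a quarter of the documents, mirroring the 3/4 split above.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=0)

    # TfidfVectorizer bundles the old CountVectorizer + TfidfTransformer pair.
    pipeline = Pipeline([
        ("vect", TfidfVectorizer(max_features=100000)),
        ("clf", LinearSVC(C=1000)),
    ])

    # ngram_range is the modern counterpart of the old analyzer__max_n knob.
    parameters = {
        "vect__ngram_range": [(1, 1), (1, 2)],
        "vect__max_df": (0.95,),
    }

    # GridSearchCV refits the best parameter set on the full training set
    # (refit=True by default), so best_estimator_ can be used directly.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    y_predicted = grid_search.best_estimator_.predict(docs_test)

    # Classification report and confusion matrix, as in the committed script.
    print(metrics.classification_report(
        y_test, y_predicted, target_names=dataset.target_names))
    print(metrics.confusion_matrix(y_test, y_predicted))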
