"""Build a sentiment analysis / polarity model"""
# Author: Olivier Grisel <[email protected]>
# License: Simplified BSD

import sys
from scikits.learn.feature_extraction.text.sparse import CountVectorizer
from scikits.learn.feature_extraction.text.sparse import TfidfTransformer
from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.datasets import load_files
from scikits.learn import metrics

#
# The real code starts here
#


# The training data folder must be passed as the first command line argument
movie_reviews_data_folder = sys.argv[1]
dataset = load_files(movie_reviews_data_folder)
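# load_files treats each sub-folder of the data folder as one category
# (e.g. "neg" and "pos" for the movie reviews): the file paths are stored
# in dataset.filenames and the matching integer labels in dataset.target.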

# Split the dataset into a training set (first 75%) and a test set (last 25%)
n_samples_total = dataset.filenames.shape[0]

split = (n_samples_total * 3) / 4

# Load the raw text of each document in memory
docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

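# The pipeline chains three stages: CountVectorizer builds sparse token count
# vectors (vocabulary capped at 100000 features), TfidfTransformer re-weights
# the counts with TF-IDF, and LinearSVC fits a linear support vector
# classifier (C=1000, i.e. relatively little regularization).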
parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}
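# The grid search compares unigram-only features (max_n=1) with
# unigram + bigram features (max_n=2); max_df=0.95 drops tokens that occur
# in more than 95% of the documents.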

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])
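# Note: the grid search is restricted to the first 200 training documents,
# presumably to keep this example fast; search over the full training set
# for a more reliable parameter selection.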

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Print the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# Uncomment the following lines to display the confusion matrix as an image
# import pylab as pl
# pl.matshow(cm)
# pl.show()

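# Example invocation (illustrative -- both the script name and the data path
# are assumptions; adapt them to your own checkout of the movie reviews
# dataset, laid out with one sub-folder per class):
#
#   $ python sentiment_analysis.py path/to/movie_reviews/txt_sentoken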