# -*- coding: utf-8 -*-
#########################################################################
############## Semeval - Sentiment Analysis in Twitter #################
#########################################################################
####
#### Authors: Pedro Paulo Balage Filho and Lucas Avanço
#### Version: 2.0
#### Date: 26/03/14
####
# Python 3 compatibility
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import with_statement
# Requires NLTK. I used the nltk 3.0a
from nltk import bigrams
from nltk import trigrams
# Import classifier libraries
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
import re
# polarity from lexicon used in the feature set
from LexiconClassifier import LexiconClassifier
#### Provides a Machine Learning Sentiment Analysis classifier
class MachineLearningClassifier(object):

    # Constructor
    def __init__(self, trainset=[]):
        print('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = LinearSVC(C=0.005)
        self.train(trainset)
    # Extract the features for the ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w, t in tweet_tokens]
        # copy the list so the n-gram extensions below do not mutate unigrams
        tokens = list(unigrams)
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        # non-contiguous trigrams: keep the outer tokens, wildcard the middle one
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count of each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in ['N', 'O', '^', 'S', 'Z', 'V', 'A', 'R', '!', 'D', 'P', '&',
                    'T', 'X', '#', '@', '~', 'U', 'E', '$', ',', 'G', 'L', 'M', 'Y']:
            feature_set['num_' + tag] = sum(1 for t in tweet_tags if t == tag)

        # 3rd feature: is a negation word present?
        negators = set(self.lexicon_classifier.read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: elongated words (a letter repeated three or more times)
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: repeated punctuation (e.g. '!!!' or '???')
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all-uppercase words (at least 3 characters long)
        feature_set['num_all_caps'] = sum(1 for token, tag in tweet_tokens
                                          if token.isupper() and len(token) >= 3)

        # 7th and 8th features: the positive and negative scores from the lexicon
        # classifier (i.e., the number of positive and negative words from the lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set
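
    # Illustration (not from the original source): for the tagged tweet
    # [('i', 'O'), ('loooove', 'V'), ('it', 'O'), ('!!!', ',')] the method
    # would set, among others:
    #   feature_set['num_V'] = 1                a single verb tag
    #   feature_set['has_char_ngrams'] = True   'ooo' in 'loooove'
    #   feature_set['has_punct_ngrams'] = True  '!!!'
    # plus a 'has_...' entry for every n-gram also present in the bag-of-words.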
    # Train the classifier.
    # The tweets argument must be a list of (tweet_tokens, label) pairs, where
    # tweet_tokens is the list of (token, PoS-tag) tuples for a message and
    # label is its sentiment class.
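    # A hypothetical example of the expected input shape (the PoS tags follow
    # the CMU Tweet NLP tagset):
    #   tweets = [([('nice', 'A'), ('movie', 'N')], 'positive'),
    #             ([('bad', 'A'), ('plot', 'N')], 'negative')]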
    def train(self, tweets):

        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens, label in tweets]

        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w, t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        # build the bag-of-words set using all the tokens
        self.bag_of_words = set(tokens)

        # 2nd step: extract the features for each training message
        total_tweets = len(tweets)
        features_list = list()
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index + 1, total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # 3rd step: train the SVM classifier
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform([label for tweet_tokens, label in tweets])

        print('Building the model')
        self.classifier.fit(data, target)
    # Classify a new message. Returns a dict mapping each classification class
    # to its SVM decision score (a margin, not a probability).
    def classify(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        scores = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): scores.item(i) for i in range(len(classes))}
    # Return the decision score for each classification class. Kept for
    # interface compatibility; it is identical to classify().
    def decision_function(self, tweet_tokens):
        return self.classify(tweet_tokens)
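
# A minimal usage sketch, not part of the original file. It assumes the
# LexiconClassifier resources are available and that messages have already
# been tokenized and PoS-tagged with the CMU Tweet NLP tagset; the tiny
# trainset below is made up for illustration only.
if __name__ == '__main__':
    trainset = [
        ([('i', 'O'), ('loooove', 'V'), ('this', 'D'), ('movie', 'N'), ('!!!', ',')], 'positive'),
        ([('worst', 'A'), ('plot', 'N'), ('ever', 'R')], 'negative'),
        ([('the', 'D'), ('film', 'N'), ('opens', 'V'), ('friday', '^')], 'neutral'),
    ]
    classifier = MachineLearningClassifier(trainset)

    # scores maps each class to an SVM margin; the largest margin wins
    scores = classifier.classify([('i', 'O'), ('loooove', 'V'), ('it', 'O'), ('!!!', ',')])
    print(max(scores, key=scores.get), scores)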