naivebayes.py
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
import random
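
# The movie_reviews and stopwords corpora are assumed to be downloaded already;
# if they are not, the one-time downloads below fetch them.
# nltk.download('movie_reviews')
# nltk.download('stopwords')
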
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
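
# For illustration, the returned dict has the shape (keys depend on word_features):
# {'contains(film)': True, 'contains(awful)': False, ...}
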
def cleanUp(words):
    stop_words = set(stopwords.words('english'))
    punctuation = {".", ",", "(", ")", "\"", "-", "'", ":"}
    words = [w for w in words if w.lower() not in stop_words]
    words = [w for w in words if w.lower() not in punctuation]
    return words
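
# For example, cleanUp(['the', 'film', ',', 'great']) returns ['film', 'great'],
# since 'the' is an English stop word and ',' is in the punctuation list.
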
# Build a (word list, label) pair for every review in the corpus ('pos' or 'neg')
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Shuffle so positive and negative reviews are interleaved before the split below
random.shuffle(documents)
# Count how often each (lower-cased) word occurs across the whole corpus
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 3,000 most frequent words, then drop stop words and punctuation
word_features = [word for word, count in all_words.most_common(3000)]
word_features = cleanUp(word_features)
# For each review, record True/False for whether it contains each feature word.
# NLTK's Naive Bayes uses both values, so the absence of a word that is frequent
# in negative reviews counts as evidence toward the positive label, and vice versa.
featuresets = [(document_features(d), c) for (d, c) in documents]
# Naive Bayes is estimated in a single pass over the feature counts, so repeating
# the data adds nothing (it only doubles every count) and, with the split below,
# would leak test reviews into the training set; each feature set is used once.
# Hold out the first 100 reviews for testing and train NLTK's Naive Bayes on the rest
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Print accuracy on the held-out test set
print(nltk.classify.accuracy(classifier, test_set))
# Show the feature words whose presence/absence most strongly separates pos from neg
classifier.show_most_informative_features(5)
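
# Each printed line pairs a feature with the ratio of its likelihoods under the two
# labels, e.g. "contains(word) = True    pos : neg = 8.0 : 1.0"; the exact words and
# ratios shown depend on the shuffle above.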