-
Notifications
You must be signed in to change notification settings - Fork 6.4k
/
Copy pathnb.py
44 lines (32 loc) · 1.35 KB
/
nb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Naive Bayes spam detection for NLP class, which can be found at:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python
# dataset: https://archive.ics.uci.edu/ml/datasets/Spambase
# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
# Note: technically multinomial NB is meant for integer "counts", but the
# scikit-learn documentation says that fractional counts, like tf-idf, may
# also work in practice, so it should also work for our "word proportions"
# Load the Spambase dataset as a plain 2-D array.
# spambase.data ships without a header row, so header=None is required:
# with the default header inference the first sample would be consumed as
# column names and silently dropped from the data.
data = pd.read_csv('spambase.data', header=None).values # use pandas for convenience

# Shuffle the rows in-place: the order of the rows changes, but the contents
# of each row are left intact, so every sample keeps its own label.
np.random.shuffle(data)

# Only the first 48 columns are used as inputs; the last column is the target.
N_FEATURES = 48
X = data[:, :N_FEATURES]
Y = data[:, -1]

# last 100 rows will be test, everything before them is training data
N_TEST = 100
Xtrain = X[:-N_TEST]
Ytrain = Y[:-N_TEST]
Xtest = X[-N_TEST:]
Ytest = Y[-N_TEST:]

# Fit multinomial Naive Bayes and report its accuracy on the held-out rows.
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("Classification rate for NB:", model.score(Xtest, Ytest))

##### you can use ANY model! #####
# Same train/test split, different classifier, for comparison.
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(Xtrain, Ytrain)
print("Classification rate for AdaBoost:", model.score(Xtest, Ytest))