Skip to content

Commit ecc8cb0

Browse files
author
André Almada
committed
Finished training and classification
1 parent be9118f commit ecc8cb0

File tree

2 files changed

+66
-41
lines changed

2 files changed

+66
-41
lines changed

Diff for: bag-of-words.py

-41
This file was deleted.

Diff for: data-mining.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import glob
2+
import os
3+
import re
4+
import pandas as pd
5+
import nltk
6+
nltk.download('punkt')
7+
nltk.download('stopwords')
8+
from nltk.tokenize import word_tokenize
9+
from nltk.corpus import stopwords
10+
from nltk.stem.porter import PorterStemmer
11+
from sklearn.feature_extraction.text import CountVectorizer
12+
from sklearn.model_selection import StratifiedKFold
13+
from sklearn.naive_bayes import GaussianNB
14+
from sklearn.metrics import classification_report, accuracy_score
15+
16+
path = './txt/'
17+
stemmer = PorterStemmer()
18+
cv = CountVectorizer(max_features=1000, encoding='latin1')
19+
skf = StratifiedKFold(n_splits=10, shuffle=True)
20+
nb = GaussianNB()
21+
22+
# Extracts the article's theme from its file name; files are named
# "<theme>-<id>.txt", e.g. "./txt/sport-001.txt" -> "sport".
def get_class (filename):
    """Return the theme encoded in *filename*.

    Uses os.path.basename instead of splitting on a literal backslash,
    which only worked with Windows-style path separators and raised
    IndexError on POSIX paths produced by glob.
    """
    return os.path.basename(filename).split('-')[0]
25+
26+
def get_data (filename):
    """Read one article file and return its preprocessed text.

    Pipeline: strip non-alphabetic characters, lowercase, tokenize,
    drop English stop words, stem each remaining token, and re-join
    into a single space-separated string.

    Parameters
    ----------
    filename : str
        Path to a latin-1 encoded article text file.

    Returns
    -------
    str
        Stemmed, stop-word-free tokens joined by single spaces.
    """
    with open(filename, 'r', encoding='latin1') as f:
        data = f.read()

    data = re.sub('[^A-Za-z]', ' ', data)  # Keep letters only
    data = data.lower()                    # Case-fold everything

    tokens = word_tokenize(data)

    # Build the stop-word set once: O(1) membership tests instead of a
    # list scan per token, and it fixes the original bug of calling
    # list.remove() while iterating the same list, which silently kept
    # any stop word that immediately followed a removed one.
    stop_words = set(stopwords.words('english'))

    # Filter stop words and stem in a single pass.
    stemmed = [stemmer.stem(token) for token in tokens
               if token not in stop_words]

    # One plain string per article, as expected by CountVectorizer.
    return " ".join(stemmed)
43+
44+
# ---------------------------------------------------------------------------
# Build the dataset: one row per article with its theme label and cleaned
# text. (glob.glob already returns a list; no copy-comprehension needed.)
# ---------------------------------------------------------------------------
names = glob.glob(os.path.join(path, '*.txt'))

dataset = pd.DataFrame({'themes': [get_class(f) for f in names],
                        'data': [get_data(f) for f in names]})

# Dense bag-of-words matrix for every article.
X = cv.fit_transform(dataset.data).toarray()
# Select the label column by name rather than by position (iloc[:, 0]),
# which silently depended on dict insertion order above.
y = dataset['themes']

# ---------------------------------------------------------------------------
# Stratified cross-validation: train, predict, and report on each fold.
# ---------------------------------------------------------------------------
accuracy = 0
for train_index, test_index in skf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    nb.fit(X_train, y_train)        # Training phase

    y_pred = nb.predict(X_test)     # Test phase

    print(classification_report(y_test, y_pred))

    accuracy = accuracy + accuracy_score(y_test, y_pred)

# Mean accuracy across folds; use the configured split count rather than a
# hard-coded 10 so the divisor stays correct if n_splits ever changes.
print (accuracy / skf.get_n_splits())

0 commit comments

Comments
 (0)