1
+ import glob
2
+ import os
3
+ import re
4
+ import pandas as pd
5
+ import nltk
6
# Fetch the NLTK resources required below: 'punkt' for word_tokenize
# and 'stopwords' for the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
8
+ from nltk .tokenize import word_tokenize
9
+ from nltk .corpus import stopwords
10
+ from nltk .stem .porter import PorterStemmer
11
+ from sklearn .feature_extraction .text import CountVectorizer
12
+ from sklearn .model_selection import StratifiedKFold
13
+ from sklearn .naive_bayes import GaussianNB
14
+ from sklearn .metrics import classification_report , accuracy_score
15
+
16
# Shared configuration and model objects used by the rest of the script.
path = './txt/'                                            # directory holding the article .txt files
stemmer = PorterStemmer()                                  # Porter stemmer for token normalization
cv = CountVectorizer(max_features=1000, encoding='latin1') # bag-of-words, top-1000 terms
skf = StratifiedKFold(n_splits=10, shuffle=True)           # 10-fold stratified cross-validation
nb = GaussianNB()                                          # Gaussian Naive Bayes classifier
21
+
22
# Gets the article's theme from the file name.
def get_class(filename):
    """Return the article's theme, parsed from its file name.

    Files are named '<theme>-<id>.txt'; the theme is everything before
    the first hyphen. ``os.path.basename`` strips the directory part on
    the current platform — the original ``split('\\')[1]`` only worked
    with Windows backslash paths containing exactly one separator.
    """
    return os.path.basename(filename).split('-')[0]
25
+
26
def get_data(filename):
    """Read an article file and return its preprocessed plain text.

    Pipeline: keep alphabetic characters only, lowercase, tokenize,
    drop English stop words, apply Porter stemming, and join the
    surviving tokens back into a single space-separated string.
    """
    with open(filename, 'r', encoding='latin1') as f:
        data = f.read()

    data = re.sub('[^A-Za-z]', ' ', data)  # remove non-alphabetic characters
    data = data.lower()                    # lowercase every word

    # Build the stop-word set once instead of re-fetching the list for
    # every token. The original also removed items from the list while
    # iterating over it, which silently skips consecutive stop words.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in word_tokenize(data) if t not in stop_words]

    # Stemming step.
    stemmed = [stemmer.stem(t) for t in tokens]

    # Turn the token list back into a single string per article.
    return " ".join(stemmed)
43
+
44
# Collect every article file under `path` (no need to wrap glob in a
# list comprehension — it already returns a list).
names = glob.glob(os.path.join(path, '*.txt'))

# One row per article: its theme label and its preprocessed text.
dataset = pd.DataFrame({'themes': [get_class(f) for f in names],
                        'data': [get_data(f) for f in names]})

X = cv.fit_transform(dataset.data).toarray()  # bag-of-words matrix, one row per article
y = dataset['themes']                         # class labels — select by name, not position

n_splits = skf.get_n_splits()  # don't hard-code the fold count below
accuracy = 0.0
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # Use positional indexing (.iloc): skf yields positions, and plain
    # y[...] only works here by coincidence of the default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    nb.fit(X_train, y_train)      # training phase

    y_pred = nb.predict(X_test)   # test phase

    print(classification_report(y_test, y_pred))

    accuracy += accuracy_score(y_test, y_pred)

# Mean accuracy across all folds.
print(accuracy / n_splits)
0 commit comments