Skip to content

Commit bb624f0

Browse files
author
André Almada
committed
Tests with Weka algorithms
1 parent ecc8cb0 commit bb624f0

8 files changed

+10467
-0
lines changed

data-to-csv.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import glob
2+
import os
3+
import re
4+
import pandas as pd
5+
import nltk
6+
nltk.download('punkt')
7+
nltk.download('stopwords')
8+
from nltk.tokenize import word_tokenize
9+
from nltk.corpus import stopwords
10+
from nltk.stem.porter import PorterStemmer
11+
12+
# Directory that holds the input .txt article files.
path = './txt/'

# Single shared Porter stemmer instance, reused for every document.
stemmer = PorterStemmer()
14+
15+
# Pega o tema do artigo através do nome do arquivo
16+
# Extracts the article's theme from its file name.
def get_class(filename):
    """Return the theme label encoded in *filename*.

    File names look like ``<dir><sep><THEME>-<id>.txt``; the theme is the
    part of the base name before the first ``-``.

    The original implementation indexed ``split('\\')[1]``, which raised
    IndexError for POSIX-style paths ('/' separators) and for any directory
    name containing '-'. Splitting on either separator and taking the last
    component handles both, while returning the same value for the
    Windows-style paths the script was written against.
    """
    basename = re.split(r'[\\/]', filename)[-1]
    return basename.split('-')[0]
18+
19+
def get_data(filename):
    """Read one article file and return its cleaned, comma-joined tokens.

    Pipeline: keep only alphabetic characters, lower-case, tokenize,
    drop English stop words, apply Porter stemming, then join the
    surviving tokens with commas into a single string.
    """
    with open(filename, 'r', encoding='latin1') as f:
        data = f.read()

    data = re.sub('[^A-Za-z]', ' ', data)  # Strip non-alphabetic characters
    data = data.lower()                    # Normalize to lower case

    tokens = word_tokenize(data)

    # Stop-word removal. Two fixes over the original:
    #  * the original called list.remove() while iterating the same list,
    #    which silently skips the token immediately after each removed
    #    stop word;
    #  * stopwords.words('english') returns a list and was scanned once
    #    per token (O(n*m)); build a set once and filter in one pass.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]

    # Stemming
    tokens = [stemmer.stem(t) for t in tokens]

    # Collapse the token list into one comma-separated string.
    return ",".join(tokens)
36+
37+
# Gather every article file in the input directory.
names = glob.glob(os.path.join(path, '*.txt'))

# One row per article: its theme label and its preprocessed token string.
dataset = pd.DataFrame({
    'themes': [get_class(f) for f in names],
    'data': [get_data(f) for f in names],
})

# Persist the table for the downstream (Weka) experiments.
dataset.to_csv('data.csv')

results/result.txt

+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
(base) D:\Projetos\data-mining-py>python data-mining.py
2+
[nltk_data] Downloading package punkt to
3+
[nltk_data] C:\Users\andre\AppData\Roaming\nltk_data...
4+
[nltk_data] Package punkt is already up-to-date!
5+
[nltk_data] Downloading package stopwords to
6+
[nltk_data] C:\Users\andre\AppData\Roaming\nltk_data...
7+
[nltk_data] Package stopwords is already up-to-date!
8+
precision recall f1-score support
9+
10+
CBR 0.96 0.96 0.96 28
11+
ILP 0.92 0.92 0.92 12
12+
RI 1.00 1.00 1.00 18
13+
14+
avg / total 0.97 0.97 0.97 58
15+
16+
precision recall f1-score support
17+
18+
CBR 0.90 0.96 0.93 28
19+
ILP 0.90 0.75 0.82 12
20+
RI 1.00 1.00 1.00 18
21+
22+
avg / total 0.93 0.93 0.93 58
23+
24+
precision recall f1-score support
25+
26+
CBR 1.00 1.00 1.00 28
27+
ILP 1.00 1.00 1.00 12
28+
RI 1.00 1.00 1.00 18
29+
30+
avg / total 1.00 1.00 1.00 58
31+
32+
precision recall f1-score support
33+
34+
CBR 1.00 1.00 1.00 28
35+
ILP 1.00 1.00 1.00 12
36+
RI 1.00 1.00 1.00 18
37+
38+
avg / total 1.00 1.00 1.00 58
39+
40+
precision recall f1-score support
41+
42+
CBR 1.00 1.00 1.00 28
43+
ILP 1.00 0.92 0.96 12
44+
RI 0.95 1.00 0.97 18
45+
46+
avg / total 0.98 0.98 0.98 58
47+
48+
precision recall f1-score support
49+
50+
CBR 0.90 1.00 0.95 28
51+
ILP 1.00 0.83 0.91 12
52+
RI 1.00 0.94 0.97 18
53+
54+
avg / total 0.95 0.95 0.95 58
55+
56+
precision recall f1-score support
57+
58+
CBR 0.93 0.96 0.95 27
59+
ILP 1.00 0.92 0.96 12
60+
RI 0.94 0.94 0.94 18
61+
62+
avg / total 0.95 0.95 0.95 57
63+
64+
precision recall f1-score support
65+
66+
CBR 1.00 0.96 0.98 27
67+
ILP 0.92 0.92 0.92 12
68+
RI 0.95 1.00 0.97 18
69+
70+
avg / total 0.97 0.96 0.96 57
71+
72+
precision recall f1-score support
73+
74+
CBR 1.00 1.00 1.00 27
75+
ILP 1.00 1.00 1.00 12
76+
RI 1.00 1.00 1.00 18
77+
78+
avg / total 1.00 1.00 1.00 57
79+
80+
precision recall f1-score support
81+
82+
CBR 0.93 1.00 0.96 27
83+
ILP 1.00 0.91 0.95 11
84+
RI 1.00 0.94 0.97 17
85+
86+
avg / total 0.97 0.96 0.96 55
87+
88+
0.9703503272287302

0 commit comments

Comments
 (0)