Skip to content

Commit bb624f0

Browse files
author
André Almada
committed
Tests with Weka algorithms
1 parent ecc8cb0 commit bb624f0

8 files changed

+10467
-0
lines changed

data-to-csv.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import glob
2+
import os
3+
import re
4+
import pandas as pd
5+
import nltk
6+
nltk.download('punkt')
7+
nltk.download('stopwords')
8+
from nltk.tokenize import word_tokenize
9+
from nltk.corpus import stopwords
10+
from nltk.stem.porter import PorterStemmer
11+
12+
# Directory that holds the input .txt article files.
path = './txt/'

# Single shared Porter stemmer instance, reused for every document.
stemmer = PorterStemmer()
14+
15+
# Pega o tema do artigo através do nome do arquivo
16+
# Extracts the article's theme from its file name.
def get_class(filename):
    """Return the theme label encoded in *filename*.

    File names look like ``<dir><sep><THEME>-<id>.txt``; the theme is the
    part of the base name before the first ``-``.

    The original implementation indexed ``split('\\')[1]``, which raised
    IndexError for POSIX-style paths ('/' separators) and for any directory
    name containing '-'. Splitting on either separator and taking the last
    component handles both, while returning the same value for the
    Windows-style paths the script was written against.
    """
    basename = re.split(r'[\\/]', filename)[-1]
    return basename.split('-')[0]
18+
19+
def get_data(filename):
    """Read one article file and return its cleaned, comma-joined tokens.

    Pipeline: keep only alphabetic characters, lower-case, tokenize,
    drop English stop words, apply Porter stemming, then join the
    surviving tokens with commas into a single string.
    """
    with open(filename, 'r', encoding='latin1') as f:
        data = f.read()

    data = re.sub('[^A-Za-z]', ' ', data)  # Strip non-alphabetic characters
    data = data.lower()                    # Normalize to lower case

    tokens = word_tokenize(data)

    # Stop-word removal. Two fixes over the original:
    #  * the original called list.remove() while iterating the same list,
    #    which silently skips the token immediately after each removed
    #    stop word;
    #  * stopwords.words('english') returns a list and was scanned once
    #    per token (O(n*m)); build a set once and filter in one pass.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]

    # Stemming
    tokens = [stemmer.stem(t) for t in tokens]

    # Collapse the token list into one comma-separated string.
    return ",".join(tokens)
36+
37+
# Gather every article file in the input directory.
names = glob.glob(os.path.join(path, '*.txt'))

# One row per article: its theme label and its preprocessed token string.
dataset = pd.DataFrame({
    'themes': [get_class(f) for f in names],
    'data': [get_data(f) for f in names],
})

# Persist the table for the downstream (Weka) experiments.
dataset.to_csv('data.csv')

results/result.txt

+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
(base) D:\Projetos\data-mining-py>python data-mining.py
2+
[nltk_data] Downloading package punkt to
3+
[nltk_data] C:\Users\andre\AppData\Roaming\nltk_data...
4+
[nltk_data] Package punkt is already up-to-date!
5+
[nltk_data] Downloading package stopwords to
6+
[nltk_data] C:\Users\andre\AppData\Roaming\nltk_data...
7+
[nltk_data] Package stopwords is already up-to-date!
8+
precision recall f1-score support
9+
10+
CBR 0.96 0.96 0.96 28
11+
ILP 0.92 0.92 0.92 12
12+
RI 1.00 1.00 1.00 18
13+
14+
avg / total 0.97 0.97 0.97 58
15+
16+
precision recall f1-score support
17+
18+
CBR 0.90 0.96 0.93 28
19+
ILP 0.90 0.75 0.82 12
20+
RI 1.00 1.00 1.00 18
21+
22+
avg / total 0.93 0.93 0.93 58
23+
24+
precision recall f1-score support
25+
26+
CBR 1.00 1.00 1.00 28
27+
ILP 1.00 1.00 1.00 12
28+
RI 1.00 1.00 1.00 18
29+
30+
avg / total 1.00 1.00 1.00 58
31+
32+
precision recall f1-score support
33+
34+
CBR 1.00 1.00 1.00 28
35+
ILP 1.00 1.00 1.00 12
36+
RI 1.00 1.00 1.00 18
37+
38+
avg / total 1.00 1.00 1.00 58
39+
40+
precision recall f1-score support
41+
42+
CBR 1.00 1.00 1.00 28
43+
ILP 1.00 0.92 0.96 12
44+
RI 0.95 1.00 0.97 18
45+
46+
avg / total 0.98 0.98 0.98 58
47+
48+
precision recall f1-score support
49+
50+
CBR 0.90 1.00 0.95 28
51+
ILP 1.00 0.83 0.91 12
52+
RI 1.00 0.94 0.97 18
53+
54+
avg / total 0.95 0.95 0.95 58
55+
56+
precision recall f1-score support
57+
58+
CBR 0.93 0.96 0.95 27
59+
ILP 1.00 0.92 0.96 12
60+
RI 0.94 0.94 0.94 18
61+
62+
avg / total 0.95 0.95 0.95 57
63+
64+
precision recall f1-score support
65+
66+
CBR 1.00 0.96 0.98 27
67+
ILP 0.92 0.92 0.92 12
68+
RI 0.95 1.00 0.97 18
69+
70+
avg / total 0.97 0.96 0.96 57
71+
72+
precision recall f1-score support
73+
74+
CBR 1.00 1.00 1.00 27
75+
ILP 1.00 1.00 1.00 12
76+
RI 1.00 1.00 1.00 18
77+
78+
avg / total 1.00 1.00 1.00 57
79+
80+
precision recall f1-score support
81+
82+
CBR 0.93 1.00 0.96 27
83+
ILP 1.00 0.91 0.95 11
84+
RI 1.00 0.94 0.97 17
85+
86+
avg / total 0.97 0.96 0.96 55
87+
88+
0.9703503272287302

0 commit comments

Comments
 (0)