Commit be9118f

André Almada committed
Bag of words
1 parent 5d54114 commit be9118f

File tree

1 file changed: +6 -7 lines changed


bag-of-words.py

+6 -7
@@ -8,9 +8,11 @@
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem.porter import PorterStemmer
+from sklearn.feature_extraction.text import CountVectorizer
 
 path = './txt/'
 stemmer = PorterStemmer()
+cv = CountVectorizer(max_features=1000, encoding='latin1')
 
 def get_class (filename):
     return filename.split('-')[0].split('\\')[1]
@@ -29,14 +31,11 @@ def get_data (filename):
     for i in range(len(data)):
         data[i] = stemmer.stem(data[i])
 
-    return data
+    plain_text = " ".join(data)
+    return plain_text
 
 names = [f for f in glob.glob(os.path.join(path, '*.txt'))]
 
-dataset = pd.DataFrame({'journal' : [get_class(f) for f in names], 'data' : [get_data(f) for f in names]})
-
-# Remove non-alphabetic characters and lowercase the whole text
-#dataset.data = dataset.data.map(lambda x: re.sub('[^A-Za-z]', ' ', x).lower())
-
-#dataset.data = pre_processing (dataset.data)
+dataset = pd.DataFrame({'themes' : [get_class(f) for f in names], 'data' : [get_data(f) for f in names]})
 
+bag_of_words = cv.fit_transform(dataset.data).toarray()
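
For context, a minimal runnable sketch (not the repository's script) of what the added lines compute. The two sample documents below are hypothetical stand-ins for the stemmed, space-joined strings that get_data() returns in the committed code.

# Sketch of the bag-of-words step introduced in this commit.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical data standing in for the ./txt/ corpus.
dataset = pd.DataFrame({
    'themes': ['sports', 'politics'],
    'data': ['goal match team win', 'vote elect parliament win'],
})

cv = CountVectorizer(max_features=1000, encoding='latin1')

# fit_transform learns the vocabulary and returns a sparse document-term
# matrix; .toarray() densifies it, as in the committed bag_of_words line.
bag_of_words = cv.fit_transform(dataset.data).toarray()

print(cv.get_feature_names_out())  # learned vocabulary (scikit-learn >= 1.0)
print(bag_of_words)                # one row per document, one column per term

Each row of bag_of_words corresponds to one document and each column to one of the at most 1000 most frequent terms kept by max_features.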
