File tree Expand file tree Collapse file tree 1 file changed +6
-7
lines changed
Expand file tree Collapse file tree 1 file changed +6
-7
lines changed Original file line number Diff line number Diff line change 88from nltk .tokenize import word_tokenize
99from nltk .corpus import stopwords
1010from nltk .stem .porter import PorterStemmer
11+ from sklearn .feature_extraction .text import CountVectorizer
1112
1213path = './txt/'
1314stemmer = PorterStemmer ()
15+ cv = CountVectorizer (max_features = 1000 , encoding = 'latin1' )
1416
def get_class(filename):
    """Return the class label encoded in a document's file name.

    The label is the part of the file's base name that precedes the first
    '-' (for example ``'./txt\\sports-001.txt'`` -> ``'sports'``).

    Args:
        filename: Path to the document; may use '/' or '\\' as separator.

    Returns:
        The text of the base name up to (not including) the first '-'.
        If the base name has no '-', the whole base name is returned.
    """
    # Normalise separators so the result does not depend on the OS that
    # produced the path (glob yields '\\' on Windows and '/' elsewhere).
    base_name = filename.replace('\\', '/').rsplit('/', 1)[-1]
    # Split the base name only, so a '-' in a directory name is harmless.
    return base_name.split('-')[0]
@@ -29,14 +31,11 @@ def get_data (filename):
2931 for i in range (len (data )):
3032 data [i ] = stemmer .stem (data [i ])
3133
32- return data
34+ plain_text = " " .join (data )
35+ return plain_text
3336
3437names = [f for f in glob .glob (os .path .join (path , '*.txt' ))]
3538
36- dataset = pd .DataFrame ({'journal' : [get_class (f ) for f in names ], 'data' : [get_data (f ) for f in names ]})
37-
38- # Tira caracteres não alfabéticos e deixa o texto inteiro na minúscula
39- #dataset.data = dataset.data.map(lambda x: re.sub('[^A-Za-z]', ' ', x).lower())
40-
41- #dataset.data = pre_processing (dataset.data)
39+ dataset = pd .DataFrame ({'themes' : [get_class (f ) for f in names ], 'data' : [get_data (f ) for f in names ]})
4240
41+ bag_of_words = cv .fit_transform (dataset .data ).toarray ()
You can’t perform that action at this time.
0 commit comments