File tree 1 file changed +6
-7
lines changed
1 file changed +6
-7
lines changed Original file line number Diff line number Diff line change 8
8
from nltk .tokenize import word_tokenize
9
9
from nltk .corpus import stopwords
10
10
from nltk .stem .porter import PorterStemmer
11
+ from sklearn .feature_extraction .text import CountVectorizer
11
12
12
13
path = './txt/'
13
14
stemmer = PorterStemmer ()
15
+ cv = CountVectorizer (max_features = 1000 , encoding = 'latin1' )
14
16
15
17
def get_class(filename):
    """Return the class label encoded in a document's file name.

    The label is the part of the file's base name that precedes the first
    hyphen, e.g. ``'./txt\\sports-01.txt'`` -> ``'sports'``.

    Args:
        filename: Path to a text file, using either ``/`` or ``\\`` as the
            directory separator.

    Returns:
        The base-name prefix before the first ``'-'``. If the base name has
        no hyphen, the whole base name is returned.
    """
    # glob returns backslash-separated paths on Windows and slash-separated
    # paths elsewhere; normalize so the base name is found on both.
    base_name = filename.replace('\\', '/').rsplit('/', 1)[-1]
    # Split the base name only, so a hyphen in a directory name cannot
    # truncate the path before the file name is reached.
    return base_name.split('-')[0]
@@ -29,14 +31,11 @@ def get_data (filename):
29
31
for i in range (len (data )):
30
32
data [i ] = stemmer .stem (data [i ])
31
33
32
- return data
34
+ plain_text = " " .join (data )
35
+ return plain_text
33
36
34
37
names = [f for f in glob .glob (os .path .join (path , '*.txt' ))]
35
38
36
- dataset = pd .DataFrame ({'journal' : [get_class (f ) for f in names ], 'data' : [get_data (f ) for f in names ]})
37
-
38
- # Tira caracteres não alfabéticos e deixa o texto inteiro na minúscula
39
- #dataset.data = dataset.data.map(lambda x: re.sub('[^A-Za-z]', ' ', x).lower())
40
-
41
- #dataset.data = pre_processing (dataset.data)
39
+ dataset = pd .DataFrame ({'themes' : [get_class (f ) for f in names ], 'data' : [get_data (f ) for f in names ]})
42
40
41
+ bag_of_words = cv .fit_transform (dataset .data ).toarray ()
You can’t perform that action at this time.
0 commit comments