-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_history_push.py
More file actions
30 lines (27 loc) · 962 Bytes
/
process_history_push.py
File metadata and controls
30 lines (27 loc) · 962 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#coding:utf-8
import glob
import os
import sys

import jieba
# Module-level dict; no active code in the visible part of this file reads
# or writes it.
history = {}
def stopWords_(path='stops.txt'):
    """Load the stop-word list, one entry per line of the file.

    Args:
        path: File to read. Defaults to ``'stops.txt'``, resolved against
            the current working directory, so existing zero-argument calls
            keep working.

    Returns:
        A list holding one whitespace-stripped string per line, in file
        order. Blank lines produce ``''`` entries and duplicates are kept.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark. str.strip() does not remove U+FEFF, so with plain
    # 'utf-8' a BOM would stay glued to the first stop word and that word
    # would never match a token.
    with open(path, encoding='utf-8-sig') as fr:
        return [line.strip() for line in fr]
# Stop words are loaded once, when this module is imported or run; getCuts
# filters segmented tokens against this list.
stopWords = stopWords_()
def getCuts(pathfilename, cutfile):
    """Segment every file matching a glob pattern and write one line per file.

    Each matched file is read as UTF-8, its non-blank lines are stripped and
    joined with single spaces, and the text is segmented with ``jieba.cut``.
    Tokens found in the module-level ``stopWords`` are dropped. The output
    line is ``<doc_id>\\t<tokens joined by spaces>\\n``, where ``doc_id`` is
    the file's base name up to its first ``'.'``.

    Args:
        pathfilename: Glob pattern selecting the input files.
        cutfile: Path of the output file. It is truncated and written as
            UTF-8.

    Raises:
        OSError: If the output file or an input file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Set membership is O(1) per token; build it once rather than scanning
    # the stop-word list for every token.
    stop_set = set(stopWords)

    # Inputs are read as UTF-8, so the output is written as UTF-8 too. The
    # locale default encoding can fail on some characters and lose records.
    # The context manager guarantees the file is flushed and closed.
    with open(cutfile, 'w', encoding='utf-8') as fw:
        for filename in glob.glob(pathfilename):
            with open(filename, encoding='utf-8') as fr:
                stripped = (line.strip() for line in fr)
                lines = [text for text in stripped if text]

            # os.path.basename understands the platform's separators.
            # Splitting on '\\' only worked for Windows-style paths and
            # produced an empty id for patterns such as '../dir/*' elsewhere.
            doc_id = os.path.basename(filename).split('.')[0]

            try:
                tokens = [
                    tok for tok in jieba.cut(' '.join(lines))
                    if tok not in stop_set
                ]
                fw.write(doc_id + '\t' + ' '.join(tokens) + '\n')
            except Exception as err:
                # Stay best-effort (one bad file must not abort the batch),
                # but report the failure instead of discarding it silently.
                print('getCuts: skipped %s: %r' % (filename, err),
                      file=sys.stderr)
# Runs whenever this module is executed or imported: segment every file
# matched by ../NewsPushDir/* and write the results to push_cut.txt (both
# paths are relative to the current working directory).
getCuts('../NewsPushDir/*', 'push_cut.txt')