Skip to content

Commit d91940f

Browse files
committed
update
1 parent f563fcf commit d91940f

File tree

1 file changed

+31
-48
lines changed

1 file changed

+31
-48
lines changed

Part2_Text_Classify/src/get_data.py renamed to Part2_Text_Classify/src/get_cls.py

Lines changed: 31 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -17,42 +17,45 @@
1717
# Python 2 setup: reload sys so setdefaultencoding is visible again, then
# force UTF-8 as the process-wide default codec (API removed in Python 3).
reload(sys)
sys.setdefaultencoding("utf-8")

# base_path = "C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data_valid/"
# Project root; the data_train/ and data_test/ directories live beneath it.
base_path = '/home/kaifun/PycharmProjects/TextInfoExp/Part2_Text_Classify/'
2222

2323

def data_preprocess():
    """Aggregate the raw training texts of every category into per-category txt files."""
    for category in ('Art', 'Computer', 'Sports'):
        get_text(category)
28+
29+
30+
# 将各类文本汇总成一个txt
31+
def get_text(item):
32+
filelist = os.listdir(base_path + 'data_train/' + item)
2633
data_dict = {}
2734
for files in filelist:
2835
# print (files)
29-
f = open(base_path + files, 'r')
30-
text = (f.read().decode('GB2312', 'ignore').encode('utf-8')).replace('\n', '')
31-
36+
f = open(base_path + 'data_train/' + item + '/' + files, 'r')
37+
text = f.read().replace('\n', '')
3238
data_temp = text.decode('utf-8') # 转换为unicode编码形式
3339
data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符
34-
data2 = jieba.cut(data) # 分词
40+
data2 = jieba.cut(data.encode('utf-8')) # 分词
3541
data3 = " ".join(data2) # 结果转换为字符串(列表转换为字符串)
36-
data_dict[data3] = "Art"
42+
data_dict[data3] = item
3743

38-
f2 = open('../data/art_res2.txt', 'a+')
44+
f2 = open('%s.txt' % item, 'a+')
3945
for (k, v) in data_dict.items():
4046
f2.write(v + ',' + k + ' ' + '\n')
4147
f2.close()
4248

4349

4450
# 获取数据和标记
4551
def load_data():
46-
data = pd.read_table('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/art_res2.txt',
47-
header=None, sep=',')
48-
data2 = pd.read_table('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/computer_res2.txt',
49-
header=None, sep=',')
50-
data3 = pd.read_table('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/sports_res2.txt',
51-
header=None, sep=',')
52+
data = pd.read_table('Art.txt', header=None, sep=',')
53+
data2 = pd.read_table('Computer.txt', header=None, sep=',')
54+
data3 = pd.read_table('Sports.txt', header=None, sep=',')
5255
# print (data,data2,data3)
5356

5457
posting_list = []
55-
class_list = []
58+
class_list = [] # 方便计算转换为1,2,3
5659

5760
for i in range(len(data)):
5861
posting_list.append((data.iloc[i, 1]))
@@ -71,14 +74,16 @@ def load_data():
7174
def jieba_tokenizer(x): return jieba.cut(x, cut_all=True)
7275

7376

77+
# 将文件名进行脱敏化处理
7478
def trans_text():
7579
# salt = ''.join(random.sample(string.ascii_letters + string.digits, 8))
7680
f3 = open('id2class2.txt', 'a')
81+
filelist = os.listdir(base_path)
7782
for files in filelist:
7883
# print (files)
7984
f = open(base_path + files, 'r')
8085
text = (f.read().decode('GB2312', 'ignore').encode('utf-8'))
81-
salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数
86+
salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数
8287
f2 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test3/" + salt + '.txt', 'w')
8388
f2.write(text)
8489
f3.write(salt + ' ' + 'e' + '\n')
@@ -89,25 +94,26 @@ def trans_text():
8994
def get_classify():
9095
X_train, Y_train = load_data()
9196

97+
# 定义分类器
9298
classifier = Pipeline([
93-
('counter', CountVectorizer(tokenizer=jieba_tokenizer)),
94-
('tfidf', TfidfTransformer()),
95-
('clf', OneVsRestClassifier(LinearSVC())),
99+
('counter', CountVectorizer(tokenizer=jieba_tokenizer)), # 标记和计数,提取特征用 向量化
100+
('tfidf', TfidfTransformer()), # IF-IDF 权重
101+
('clf', OneVsRestClassifier(LinearSVC())), # 1-rest 多分类(多标签)
96102
])
97103
mlb = MultiLabelBinarizer()
98-
Y_train = mlb.fit_transform(Y_train)
104+
Y_train = mlb.fit_transform(Y_train) # 分类号数值化
99105

100106
classifier.fit(X_train, Y_train)
101107

102108
# X_test = ["数据分析"]
103109
# 把所有的测试文本存到一个list中
104110
test_list = []
105111
test_name = []
106-
filelist2 = os.listdir("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data_test/")
112+
filelist2 = os.listdir(base_path + "data_test/")
107113
for files in filelist2:
108114
# print (files)
109115
test_name.append(files)
110-
f = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data_test/" + files, 'r')
116+
f = open(base_path + "data_test/" + files, 'r')
111117
test_list.append(f.read())
112118

113119
prediction = classifier.predict(test_list)
@@ -123,29 +129,6 @@ def get_classify():
123129
print ((num_dict[('1',)] + num_dict[('2',)] + num_dict[('3',)]) / float(len(result))) # 整数除整数为0,应把其中一个改为浮点数。
124130

125131

126-
def get_text2():
127-
for files in filelist:
128-
f = open(base_path + files, 'r')
129-
data = f.read()
130-
f2 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test/" + files + '.txt', 'w')
131-
f2.write(data)
132-
133-
134-
def check():
135-
filename = []
136-
print (filelist)
137-
data = pd.read_table('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/src/result.txt', header=None,
138-
sep='\t')
139-
140-
for i in range(len(data)):
141-
filename.append(str(data.iloc[i,0]).split()[0])
142-
143-
print (filename)
144-
145-
146132
if __name__ == '__main__':
    # Run data_preprocess() first to (re)build the per-category txt files
    # that load_data()/get_classify() consume.
    # data_preprocess()
    get_classify()

0 commit comments

Comments
 (0)