17
17
# Python 2 compatibility shim: reload(sys) re-exposes sys.setdefaultencoding(),
# which site.py removes at interpreter startup.
# NOTE(review): changing the process-wide default encoding is a well-known
# Python 2 anti-pattern (it can mask real str/unicode bugs); kept as-is
# because the rest of this script relies on implicit utf-8 conversions.
reload(sys)
sys.setdefaultencoding("utf-8")

# Project root holding the data_train/ and data_test/ corpora.
# base_path = "C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data_valid/"
base_path = '/home/kaifun/PycharmProjects/TextInfoExp/Part2_Text_Classify/'
22
22
23
23
24
def data_preprocess(classes=('Art', 'Computer', 'Sports')):
    """Aggregate the raw training documents of each category into one
    '<category>.txt' file via get_text().

    Args:
        classes: iterable of category sub-directory names under
            ``base_path + 'data_train/'``.  Defaults to the three corpus
            categories used by this experiment; parameterized so the same
            preprocessing works for any category layout.
    """
    for item in classes:
        get_text(item)
30
# Merge each category's raw texts into one txt file.
def get_text(item):
    """Merge every raw training document of category *item* into a single
    '<item>.txt' file, one "<label>,<segmented text>" line per document.

    Each file under ``base_path/data_train/<item>/`` is read, stripped of
    newlines, reduced to its Chinese characters, segmented with jieba, and
    written out with the category name as the label (consumed later by
    load_data()).

    Args:
        item: category sub-directory name, e.g. ``'Art'``.
    """
    train_dir = os.path.join(base_path, 'data_train', item)
    data_dict = {}
    for files in os.listdir(train_dir):
        # BUG FIX: the original opened each file without ever closing it,
        # leaking one handle per document; 'with' guarantees release.
        with open(os.path.join(train_dir, files), 'r') as f:
            text = f.read().replace('\n', '')
        data_temp = text.decode('utf-8')  # work in unicode for the regex below
        # Keep only CJK ideographs; pattern and subject must both be unicode.
        data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp))
        data2 = jieba.cut(data.encode('utf-8'))  # word segmentation
        data3 = " ".join(data2)  # generator of tokens -> space-joined string
        # NOTE(review): documents that segment to identical text overwrite
        # each other in this dict — presumably dropping duplicates is
        # acceptable here; confirm with the corpus owner.
        data_dict[data3] = item
    # 'a+' appends, so re-running accumulates lines; kept for compatibility.
    with open('%s.txt' % item, 'a+') as f2:
        for (k, v) in data_dict.items():
            f2.write(v + ',' + k + ' ' + '\n')
42
48
43
49
44
50
# 获取数据和标记
45
51
def load_data ():
46
- data = pd .read_table ('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/art_res2.txt' ,
47
- header = None , sep = ',' )
48
- data2 = pd .read_table ('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/computer_res2.txt' ,
49
- header = None , sep = ',' )
50
- data3 = pd .read_table ('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/data/sports_res2.txt' ,
51
- header = None , sep = ',' )
52
+ data = pd .read_table ('Art.txt' , header = None , sep = ',' )
53
+ data2 = pd .read_table ('Computer.txt' , header = None , sep = ',' )
54
+ data3 = pd .read_table ('Sports.txt' , header = None , sep = ',' )
52
55
# print (data,data2,data3)
53
56
54
57
posting_list = []
55
- class_list = []
58
+ class_list = [] # 方便计算转换为1,2,3
56
59
57
60
for i in range (len (data )):
58
61
posting_list .append ((data .iloc [i , 1 ]))
@@ -71,14 +74,16 @@ def load_data():
71
74
def jieba_tokenizer(x):
    """Tokenizer hook for CountVectorizer: full-mode jieba segmentation."""
    tokens = jieba.cut(x, cut_all=True)
    return tokens
72
75
73
76
77
+ # 将文件名进行脱敏化处理
74
78
def trans_text ():
75
79
# salt = ''.join(random.sample(string.ascii_letters + string.digits, 8))
76
80
f3 = open ('id2class2.txt' , 'a' )
81
+ filelist = os .listdir (base_path )
77
82
for files in filelist :
78
83
# print (files)
79
84
f = open (base_path + files , 'r' )
80
85
text = (f .read ().decode ('GB2312' , 'ignore' ).encode ('utf-8' ))
81
- salt = '' .join (random .sample (string .ascii_letters + string .digits , 8 )) # 产生随机数
86
+ salt = '' .join (random .sample (string .ascii_letters + string .digits , 8 )) # 产生随机数
82
87
f2 = open ("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test3/" + salt + '.txt' , 'w' )
83
88
f2 .write (text )
84
89
f3 .write (salt + ' ' + 'e' + '\n ' )
@@ -89,25 +94,26 @@ def trans_text():
89
94
def get_classify ():
90
95
X_train , Y_train = load_data ()
91
96
97
+ # 定义分类器
92
98
classifier = Pipeline ([
93
- ('counter' , CountVectorizer (tokenizer = jieba_tokenizer )),
94
- ('tfidf' , TfidfTransformer ()),
95
- ('clf' , OneVsRestClassifier (LinearSVC ())),
99
+ ('counter' , CountVectorizer (tokenizer = jieba_tokenizer )), # 标记和计数,提取特征用 向量化
100
+ ('tfidf' , TfidfTransformer ()), # IF-IDF 权重
101
+ ('clf' , OneVsRestClassifier (LinearSVC ())), # 1-rest 多分类(多标签)
96
102
])
97
103
mlb = MultiLabelBinarizer ()
98
- Y_train = mlb .fit_transform (Y_train )
104
+ Y_train = mlb .fit_transform (Y_train ) # 分类号数值化
99
105
100
106
classifier .fit (X_train , Y_train )
101
107
102
108
# X_test = ["数据分析"]
103
109
# 把所有的测试文本存到一个list中
104
110
test_list = []
105
111
test_name = []
106
- filelist2 = os .listdir ("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/ data_test/" )
112
+ filelist2 = os .listdir (base_path + " data_test/" )
107
113
for files in filelist2 :
108
114
# print (files)
109
115
test_name .append (files )
110
- f = open ("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/ data_test/" + files , 'r' )
116
+ f = open (base_path + " data_test/" + files , 'r' )
111
117
test_list .append (f .read ())
112
118
113
119
prediction = classifier .predict (test_list )
@@ -123,29 +129,6 @@ def get_classify():
123
129
print ((num_dict [('1' ,)] + num_dict [('2' ,)] + num_dict [('3' ,)]) / float (len (result ))) # 整数除整数为0,应把其中一个改为浮点数。
124
130
125
131
126
- def get_text2 ():
127
- for files in filelist :
128
- f = open (base_path + files , 'r' )
129
- data = f .read ()
130
- f2 = open ("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test/" + files + '.txt' , 'w' )
131
- f2 .write (data )
132
-
133
-
134
- def check ():
135
- filename = []
136
- print (filelist )
137
- data = pd .read_table ('C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/src/result.txt' , header = None ,
138
- sep = '\t ' )
139
-
140
- for i in range (len (data )):
141
- filename .append (str (data .iloc [i ,0 ]).split ()[0 ])
142
-
143
- print (filename )
144
-
145
-
146
132
if __name__ == '__main__':
    # One-off preprocessing step; uncomment on the first run to build the
    # per-category '<class>.txt' training files that get_classify() loads.
    # data_preprocess()
    get_classify()
0 commit comments