-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbinary_classifier.py
150 lines (130 loc) · 4.64 KB
/
binary_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# coding=utf-8
# @author: cer
from __future__ import print_function
import pandas as pd
import numpy as np
import os
import pickle
import gc
import xgboost as xgb
import re
from sklearn.model_selection import train_test_split
# Number of character slots used to encode a single token.
max_num_features = 10
# Number of neighbouring tokens taken on each side of the target token.
pad_size = 1
# Marker value inserted between the tokens of a context window.
boundary_letter = -1
# Fill value for character slots that a short token leaves unused.
space_letter = 0
# max_data_size = 320000
# Classes that are collapsed into label 0; every other class becomes label 1.
self_classes = ["PLAIN", "PUNCT"]
# Parameters handed to XGBoost for both training and model loading.
param = dict(
    objective='multi:softmax',
    eta='0.3',
    max_depth=10,
    silent=1,
    nthread=-1,
    num_class=2,
    eval_metric='merror',
)
def context_window_transform(data, pad_size):
    """Attach the neighbouring tokens to every token, separated by -1.

    Args:
        data: list of 1-D numpy arrays, one encoded token per entry.
        pad_size: number of neighbours taken on each side of a token.

    Returns:
        A list with one flat Python list per input token. Each flat list
        holds the ``2 * pad_size + 1`` tokens of the window, with
        ``boundary_letter`` before the first token, between tokens and
        after the last token. Positions beyond either end of ``data`` are
        filled with all-zero rows of length ``max_num_features``.
    """
    zero_row = np.zeros(max_num_features)
    padding = [zero_row] * pad_size
    padded = padding + data + padding
    span = pad_size * 2 + 1

    windows = []
    for start in range(len(padded) - pad_size * 2):
        flat = []
        for token in padded[start:start + span]:
            flat.append(boundary_letter)
            flat.extend(token.tolist())
        # Close the window with a final boundary marker.
        flat.append(boundary_letter)
        windows.append(flat)
    return windows
def train():
    """Train the binary token classifier and save it under ``model_vars/``.

    Reads ``input/en_train.csv``, maps every class listed in
    ``self_classes`` to label 0 and every other class to label 1, encodes
    each ``before`` token as ``max_num_features`` character code points,
    adds the neighbouring tokens as context, and trains an XGBoost model
    for up to 50 rounds on a 90/10 train/validation split.

    Side effects:
        Writes ``model_vars/train.model`` and ``model_vars/dump.train.txt``
        and prints progress information to stdout.
    """
    print("open data files ...")
    train_df = pd.read_csv('input/en_train.csv')

    print("data processing...")
    labels = train_df["class"].unique()
    # Binarise the classes: 0 for the classes in self_classes, 1 otherwise.
    class2index = {label: 0 if label in self_classes else 1
                   for label in labels}
    # Build a real list: on Python 3 ``map`` is a lazy iterator, which can
    # neither be indexed nor converted into a proper label array.
    y_data = [class2index[c] for c in train_df['class'].values]
    gc.collect()

    # Represent every target token by the code points of its characters,
    # truncated / padded with space_letter to max_num_features entries.
    x_data = []
    for x in train_df['before'].values:
        x_row = np.ones(max_num_features, dtype=int) * space_letter
        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)
    del train_df
    gc.collect()

    x_data_context = np.array(context_window_transform(x_data, pad_size))
    del x_data
    gc.collect()
    y_data_a = np.array(y_data)

    print('Total number of samples:', len(x_data_context))
    print('x_data sample:')
    print(x_data_context[0])
    print('y_data sample:')
    print(y_data[0])
    print('labels:')
    print(labels)
    del y_data
    gc.collect()

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_data_context, y_data_a, test_size=0.1, random_state=2017)
    del x_data_context
    del y_data_a
    gc.collect()

    print("forming dmatrix...")
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)
    # XGBoost uses the LAST entry of the evaluation list for early
    # stopping, so the validation set has to come last.
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    del x_train
    del y_train
    gc.collect()

    print("training start...")
    model = xgb.train(param, dtrain, 50, watchlist, early_stopping_rounds=20,
                      verbose_eval=10)
    model.save_model('model_vars/train.model')
    model.dump_model('model_vars/dump.train.txt')
def test(model_file='model_vars/train.model'):
    """Predict a class index for every test token and write it to CSV.

    Reads ``input/en_test.csv``, encodes each ``before`` token as
    ``max_num_features`` character code points with neighbouring-token
    context, loads the booster stored in ``model_file`` and writes the
    columns ``id``, ``before`` and ``class_pred`` to
    ``output/class_pred.csv``.

    Args:
        model_file: path of the saved XGBoost model to load.
    """
    test_df = pd.read_csv('input/en_test.csv')

    # Represent every target token by the code points of its characters,
    # truncated / padded with space_letter to max_num_features entries.
    x_data = []
    for x in test_df['before'].values:
        x_row = np.ones(max_num_features, dtype=int) * space_letter
        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)
    x_data_context_a = np.array(context_window_transform(x_data, pad_size))
    dtest = xgb.DMatrix(x_data_context_a)

    bst = xgb.Booster(param)  # init model
    bst.load_model(model_file)
    ypred = bst.predict(dtest)
    print("ypred:", type(ypred), np.shape(ypred))
    print(test_df.shape)
    print(test_df["sentence_id"].values.shape,
          test_df["sentence_id"].values.dtype)

    # Build "<sentence_id>_<token_id>" identifiers. A list comprehension is
    # used because ``np.array(map(...))`` yields a useless 0-d object array
    # on Python 3.
    ids_a = np.array([
        str(sentence_id) + "_" + str(token_id)
        for sentence_id, token_id in zip(test_df["sentence_id"].values,
                                         test_df["token_id"].values)
    ])
    print("ids_a: ", ids_a.shape)
    test_df["id"] = ids_a

    # Copy the column subset so that adding a column does not write into a
    # slice of test_df (avoids pandas' SettingWithCopyWarning).
    class_df = test_df[["id", "before"]].copy()
    class_df["class_pred"] = ypred
    class_df.to_csv("output/class_pred.csv", index=False)
if __name__ == "__main__":
    # Only prediction runs by default; enable the call below to (re)train
    # the model first.
    # train()
    test()