sim_seq.py
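"""Similarity-retrieval QA with a seq2seq fallback.

Retrieves the closest known question via the Baidu NLP lexer and similarity
APIs; when no candidate fits (or the user rejects all of them), falls back to
a BM25 lookup and, failing that, a TensorFlow 1.x attention seq2seq generator
trained on the question/answer samples.
"""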
import random

import jieba
import numpy as np
import tensorflow as tf
from aip import AipNlp
from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq

import bm25_fitness_data
import word_token
size = 8                  # LSTM cell size
GO_ID = 1                 # start-of-output marker
EOS_ID = 2                # end-of-sequence marker
PAD_ID = 0                # padding value
min_freq = 1              # a word enters the vocabulary only above this frequency
epochs = 20000            # training iterations
batch_num = 1000          # number of QA pairs that take part in training
input_seq_len = 25        # input sequence length
output_seq_len = 50       # output sequence length
init_learning_rate = 0.5  # initial learning rate

wordToken = word_token.WordToken()  # bag-of-words vocabulary (word <-> id)
# Kept at module level so num_encoder_symbols / num_decoder_symbols can be computed dynamically.
max_token_id = wordToken.load_file_list(['./samples/question', './samples/answer'], min_freq)
num_encoder_symbols = max_token_id + 5  # leaves room for the PAD, GO and EOS markers
num_decoder_symbols = max_token_id + 5

APP_ID = '19349034'
API_KEY = 'Y5qY1MYmD2xV7LMeO5rFveXC'
SECRET_KEY = '2eMGn1PS1XofcsYam3WWkz9fBkyitg6w'
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)  # Baidu word/sentence similarity API
options = {"model": "bert"}
q_list = []     # questions
a_list = []     # answers
type_list = []  # disease types
with open('C:/Code/KnowledgeGraph-QA-master/data/question.txt', encoding="utf-8") as f:
    for line in f:
        q_list.append(line.strip())
with open('C:/Code/KnowledgeGraph-QA-master/data/answer.txt', encoding="utf-8") as f:
    for line in f:
        a_list.append(line.strip())
with open('C:/Code/KnowledgeGraph-QA-master/data/ill_type.txt', encoding="utf-8") as f:
    for line in f:
        type_list.append(line.strip())
ty_list = np.unique(type_list)  # the distinct disease types
# (question, disease type) pairs, e.g. ('老是睡不着', '失眠症'): a "can't sleep" complaint tagged as insomnia
a_type = list(zip(q_list, type_list))
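# The three files are assumed to be aligned line by line: line i of
# question.txt, answer.txt and ill_type.txt describe the same QA pair.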
class Word2vecSim:
    def sim_main(self, target):
        res = []
        target_type = ''
        result2 = client.lexer(target)
        items = result2.get('items')
        if items is not None:
            for token in items:
                # The POS tag 'nz' (proper noun) almost always marks the disease name.
                if token.get('pos') == 'nz':
                    target_type = token.get('item')
        # First check whether the question mentions a known disease type; only run
        # the expensive sentence-similarity calls when it does, to save computation.
        max_word_sim = 0
        if target_type:
            for ty in ty_list:
                result3 = client.wordSimEmbedding(target_type, ty)
                word_sim = result3.get('score')
                if word_sim is None:
                    word_sim = 0
                max_word_sim = max(max_word_sim, word_sim)
        if max_word_sim >= 0.3:
            for string in q_list:
                result = client.simnet(string, target, options)
                score = result.get('score')
                if score is None:
                    score = 0
                res.append([string, score])
        else:
            for string in q_list:
                res.append([string, 0])
        res = sorted(res, key=lambda x: x[1], reverse=True)
        for i in range(5):
            print(i + 1, res[i][0])
        print(6, "None of the above; generate an answer")
        try:
            choice = int(input("Enter the number of the question you mean: "))
        except ValueError:
            return "Invalid input, please ask again"
        if choice == 6:
            return seq_predict(target)
        elif 0 < choice < 6:
            return a_list[q_list.index(res[choice - 1][0])]
        else:
            return "Invalid input, please ask again"
    # Commented-out variant of sim_main used for evaluation: it returns the
    # top-5 candidate questions as one string instead of prompting the user.
    '''def sim_eval(self, target):
        res = []
        target_type = ''
        result2 = client.lexer(target)
        items = result2.get('items')
        if items is not None:
            for token in items:
                if token.get('pos') == 'nz':
                    target_type = token.get('item')
        max_word_sim = 0
        if target_type:
            for ty in ty_list:
                result3 = client.wordSimEmbedding(target_type, ty)
                word_sim = result3.get('score')
                if word_sim is None:
                    word_sim = 0
                max_word_sim = max(max_word_sim, word_sim)
        if max_word_sim >= 0.3:
            for string in q_list:
                result = client.simnet(string, target, options)
                score = result.get('score')
                if score is None:
                    score = 0
                res.append([string, score])
        else:
            for string in q_list:
                res.append([string, 0])
        res = sorted(res, key=lambda x: x[1], reverse=True)
        candidate = ''
        for i in range(5):
            candidate = candidate + str(i + 1) + res[i][0] + '\n'
        return candidate'''
# --- seq2seq model ---
def get_model(feed_previous=False):
    """Build the attention seq2seq graph; feed_previous=True for inference."""
    learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32)
    learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9)
    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    for i in range(input_seq_len):
        encoder_inputs.append(tf.compat.v1.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(output_seq_len + 1):
        decoder_inputs.append(tf.compat.v1.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    for i in range(output_seq_len):
        target_weights.append(tf.compat.v1.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
    # Targets are the decoder inputs shifted left by one time step.
    targets = [decoder_inputs[i + 1] for i in range(output_seq_len)]
    cell = tf.contrib.rnn.BasicLSTMCell(size)
    # The final state returned by the decoder is not needed here.
    outputs, _ = seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs[:output_seq_len],
        cell,
        num_encoder_symbols=num_encoder_symbols,
        num_decoder_symbols=num_decoder_symbols,
        embedding_size=size,
        # output_projection would be a (W, B) tuple: a weight matrix W of shape
        # [output_size, num_decoder_symbols] and a bias B of shape [num_decoder_symbols].
        output_projection=None,
        feed_previous=feed_previous,
        dtype=tf.float32)
    # Cross-entropy loss over the output sequence.
    loss = seq2seq.sequence_loss(outputs, targets, target_weights)
    # Plain gradient descent, minimizing the loss.
    opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    update = opt.apply_gradients(opt.compute_gradients(loss))
    # Saver for checkpointing all variables.
    saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
    return (encoder_inputs, decoder_inputs, target_weights, outputs,
            loss, update, saver, learning_rate_decay_op, learning_rate)
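# How the two modes are meant to be used (a sketch; the training loop itself
# is assumed to live in a separate script in this repo):
#   train:   get_model(feed_previous=False)  # decoder consumes the gold tokens
#   predict: get_model(feed_previous=True)   # decoder feeds back its own argmax output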
def get_id_list_from(sentence):
    """Segment a sentence with jieba and map each word to its vocabulary id."""
    sentence_id_list = []
    for word in jieba.cut(sentence):
        token_id = wordToken.word2id(word)
        if token_id:
            sentence_id_list.append(token_id)
    return sentence_id_list
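# Illustrative example (actual ids depend on the loaded vocabulary):
#   get_id_list_from('老是睡不着')  might yield  [17, 254]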
def get_samples(train_set, batch_num):
    """
    Build training batches from the processed question-answer set.
    batch_num: how many QA pairs from train_set take part in training.
    """
    raw_encoder_input = []
    raw_decoder_input = []
    if batch_num >= len(train_set):
        batch_train_set = train_set
    else:
        random_start = random.randint(0, len(train_set) - batch_num)
        batch_train_set = train_set[random_start:random_start + batch_num]
    # Left-pad the question; prepend GO to the answer and right-pad it.
    for sample in batch_train_set:
        raw_encoder_input.append([PAD_ID] * (input_seq_len - len(sample[0])) + sample[0])
        raw_decoder_input.append([GO_ID] + sample[1] + [PAD_ID] * (output_seq_len - len(sample[1]) - 1))
    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    for length_idx in range(input_seq_len):
        encoder_inputs.append(np.array([encoder_input[length_idx] for encoder_input in raw_encoder_input],
                                       dtype=np.int32))
    for length_idx in range(output_seq_len):
        decoder_inputs.append(
            np.array([decoder_input[length_idx] for decoder_input in raw_decoder_input], dtype=np.int32))
        # Padding positions (and the final step) get zero weight in the loss.
        target_weights.append(np.array([
            0.0 if length_idx == output_seq_len - 1 or decoder_input[length_idx] == PAD_ID else 1.0
            for decoder_input in raw_decoder_input
        ], dtype=np.float32))
    return encoder_inputs, decoder_inputs, target_weights
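# Shape sketch: the returned lists are time-major, as embedding_attention_seq2seq
# expects: input_seq_len arrays of shape [batch] for the encoder, and
# output_seq_len arrays each for the decoder inputs and the target weights.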
def seq_to_encoder(input_seq):
    """
    Turn a space-separated string of token ids into the encoder inputs,
    decoder inputs and target weights needed for prediction (batch size 1).
    """
    input_seq_array = [int(v) for v in input_seq.split()]
    encoder_input = [PAD_ID] * (input_seq_len - len(input_seq_array)) + input_seq_array
    decoder_input = [GO_ID] + [PAD_ID] * (output_seq_len - 1)
    encoder_inputs = [np.array([v], dtype=np.int32) for v in encoder_input]
    decoder_inputs = [np.array([v], dtype=np.int32) for v in decoder_input]
    target_weights = [np.array([1.0], dtype=np.float32)] * output_seq_len
    return encoder_inputs, decoder_inputs, target_weights
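# Illustrative example: seq_to_encoder('17 254') left-pads the ids to
# input_seq_len and builds a decoder sequence [GO_ID, PAD_ID, ...] of length
# output_seq_len, each time step a batch-of-one int32 array.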
def seq_predict(question):
    """
    Prediction: try the BM25 retrieval model first; if no retrieved answer
    scores high enough, generate one with the seq2seq model.
    """
    with tf.compat.v1.Session() as sess:
        (encoder_inputs, decoder_inputs, target_weights, outputs, loss, update,
         saver, learning_rate_decay_op, learning_rate) = get_model(feed_previous=True)
        input_seq = question
        max_score, answer = bm25_fitness_data.get_fitness_answer(input_seq)
        if max_score > 0.08:
            print("AI Doctor assistant: " + str(answer))
            return str(answer)
        saver.restore(sess, 'C:/Code/KnowledgeGraph-QA-master/seq2seq/model/')
        input_seq = input_seq.strip()
        input_id_list = get_id_list_from(input_seq)  # segment into word ids
        if not input_id_list:
            reply = "I don't quite understand what you said"
            print(reply)
            return reply
        sample_encoder_inputs, sample_decoder_inputs, sample_target_weights = seq_to_encoder(
            ' '.join(str(v) for v in input_id_list))
        input_feed = {}
        for l in range(input_seq_len):
            input_feed[encoder_inputs[l].name] = sample_encoder_inputs[l]
        for l in range(output_seq_len):
            input_feed[decoder_inputs[l].name] = sample_decoder_inputs[l]
            input_feed[target_weights[l].name] = sample_target_weights[l]
        # The extra decoder slot only serves as a shift target during training;
        # feed a dummy value here (batch size is 1 at prediction time).
        input_feed[decoder_inputs[output_seq_len].name] = np.zeros([1], dtype=np.int32)
        outputs_seq = sess.run(outputs, input_feed)
        # Each output step is a num_decoder_symbols-dim logit vector; the
        # argmax over it is the predicted token id.
        outputs_seq = [int(np.argmax(logit[0], axis=0)) for logit in outputs_seq]
        # Drop a stray PAD prediction if one appears (remove() would raise otherwise).
        if PAD_ID in outputs_seq:
            outputs_seq.remove(PAD_ID)
        # Stop emitting once the end-of-sequence marker shows up.
        if EOS_ID in outputs_seq:
            outputs_seq = outputs_seq[:outputs_seq.index(EOS_ID)]
        reply = "".join(wordToken.id2word(v) for v in outputs_seq)
        print("AI Doctor assistant:", reply)
        return reply