
Commit 82beea4

committed: update cnn cls
1 parent 5b4b910 commit 82beea4

20 files changed: +81366 / -0 lines changed
@@ -0,0 +1,15 @@
# encoding:utf-8
# Python 2 script: keep only the Chinese characters in each line of neg.txt,
# segment them with jieba, and append the space-separated tokens to neg2.txt.

import jieba
import re

data = open('neg.txt', 'r')
data2 = data.readlines()
print len(data2)
fw = open('neg2.txt', 'a')
for line in data2:
    # Keep only CJK characters (U+4E00..U+9FFF) from the raw line.
    text = ''.join(re.findall(u'[\u4e00-\u9fff]+', line.decode('utf-8', 'ignore')))
    # Segment with jieba and join the tokens with single spaces.
    words = jieba.cut(text)
    data3 = " ".join(words).encode('utf-8')
    fw.write(data3 + '\n')
fw.close()
data.close()
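For reference, a minimal sketch of the same cleaning and segmentation step applied to a single sentence (Python 2; the sample text and the output shown in the comment are illustrative, not part of the commit). Presumably the same script, with the filenames swapped, produces pos2.txt from pos.txt.

# encoding:utf-8
# Minimal sketch: apply the same regex filter and jieba segmentation to one
# sample sentence. The sample string and expected output are illustrative only.
import jieba
import re

sample = u'这个电影真的很好看'
text = ''.join(re.findall(u'[\u4e00-\u9fff]+', sample))
print " ".join(jieba.cut(text)).encode('utf-8')  # e.g. "这个 电影 真的 很 好看"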

Added data files (large diffs are not rendered by default):

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/neg.txt   +18,576 lines
Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/neg2.txt  +18,576 lines
Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/pos.txt   +16,548 lines
Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/pos2.txt  +16,548 lines
Part2_Text_Classify/cnn-text-classification-tf-chinese/data/rt-polaritydata/rt-polarity.neg  +5,331 lines
Part2_Text_Classify/cnn-text-classification-tf-chinese/data/rt-polaritydata/rt-polarity.pos  +5,331 lines
@@ -0,0 +1,116 @@
import numpy as np
import re
import itertools
import codecs
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(codecs.open("./data/chinese/pos.txt", "r", "utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(codecs.open("./data/chinese/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    # Chinese text: split each sentence into individual characters.
    x_text = [list(s) for s in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
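A minimal usage sketch for these helpers (Python 2; "data_helpers" is an assumed module name for the file above, and the batch size and epoch count are illustrative):

# Minimal usage sketch; module name and hyperparameters are assumptions.
import data_helpers

x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
print x.shape, y.shape, len(vocabulary)

# Iterate over shuffled mini-batches of (input, label) pairs for one epoch.
batches = data_helpers.batch_iter(list(zip(x, y)), 64, 1)
for batch in batches:
    x_batch, y_batch = zip(*batch)
    # ... feed x_batch / y_batch to the training step ...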
@@ -0,0 +1,138 @@
# encoding:utf-8
import tensorflow as tf


def linear(input_, output_size, scope=None):
    '''
    Linear map: output[k] = sum_i(Matrix[k, i] * input_[i]) + Bias[k]
    Args:
        input_: a 2D Tensor of shape [batch x n].
        output_size: int, second dimension of the weight matrix.
        scope: VariableScope for the created subgraph; defaults to "SimpleLinear".
    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        input_ * W^T + Bias, where W and Bias are newly created variables.
    Raises:
        ValueError: if the argument has an unspecified or wrong shape.
    '''

    shape = input_.get_shape().as_list()
    if len(shape) != 2:
        raise ValueError("Linear is expecting 2D arguments: %s" % str(shape))
    if not shape[1]:
        raise ValueError("Linear expects shape[1] of arguments: %s" % str(shape))
    input_size = shape[1]

    # Now the computation.
    with tf.variable_scope(scope or "SimpleLinear"):
        matrix = tf.get_variable("Matrix", [output_size, input_size], dtype=input_.dtype)
        bias_term = tf.get_variable("Bias", [output_size], dtype=input_.dtype)

    return tf.matmul(input_, tf.transpose(matrix)) + bias_term


# Highway layer borrowed from https://github.com/carpedm20/lstm-char-cnn-tensorflow
def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is the nonlinearity, t is the transform gate, and (1 - t) is the carry gate.
    """
    output = input_
    for idx in xrange(layer_size):
        output = f(
            linear(output, size, scope='output_lin_%d' % idx))  # update
        # Alternative: add the import below and drop the scope argument when calling linear:
        # from tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl import _linear as linear

        transform_gate = tf.sigmoid(
            linear(input_, size, scope='transform_lin_%d' % idx) + bias)  # update
        carry_gate = 1. - transform_gate

        output = transform_gate * output + carry_gate * input_

    return output


class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by convolutional, max-pooling and softmax layers.
    """
    def __init__(
            self, sequence_length, num_classes, vocab_size,
            embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/gpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for filter_size, num_filter in zip(filter_sizes, num_filters):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, embedding_size, 1, num_filter]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filter]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = sum(num_filters)
        self.h_pool = tf.concat(pooled_outputs, 3)  # argument order changed (TF 1.x: values first, then axis)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add highway
        with tf.name_scope("highway"):
            self.h_highway = highway(self.h_pool_flat, self.h_pool_flat.get_shape()[1], 1, 0)

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_highway, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)  # update
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
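A minimal training sketch for the TextCNN class above (TensorFlow 1.x, Python 2; the hyperparameter values, the optimizer choice, and the "data_helpers" module name are assumptions for illustration, not taken from the commit). Note that num_filters is a list with one entry per filter size, since the constructor zips the two lists together.

# Minimal training sketch; all hyperparameters below are illustrative.
import tensorflow as tf
import data_helpers  # assumed module name for the data helpers above

x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

with tf.Graph().as_default():
    # allow_soft_placement lets the /gpu:0 embedding fall back to CPU if needed.
    session_conf = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=session_conf) as sess:
        cnn = TextCNN(
            sequence_length=x.shape[1],
            num_classes=2,
            vocab_size=len(vocabulary),
            embedding_size=128,
            filter_sizes=[3, 4, 5],
            num_filters=[100, 100, 100],  # one entry per filter size
            l2_reg_lambda=0.0)
        train_op = tf.train.AdamOptimizer(1e-3).minimize(cnn.loss)
        sess.run(tf.global_variables_initializer())

        # Train on shuffled mini-batches.
        batches = data_helpers.batch_iter(list(zip(x, y)), 64, 10)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            _, loss, acc = sess.run(
                [train_op, cnn.loss, cnn.accuracy],
                feed_dict={cnn.input_x: x_batch,
                           cnn.input_y: y_batch,
                           cnn.dropout_keep_prob: 0.5})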
