Commit 82beea4
update cnn cls
1 parent 5b4b910

20 files changed: +81366 additions, -0 deletions
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# encoding:utf-8
# Python 2 script: keep only the Chinese characters in each line of neg.txt,
# segment them with jieba, and append the space-separated tokens to neg2.txt.

import jieba
import re

data = open('neg.txt', 'r')
# data3 = data.read()
data2 = data.readlines()
print len(data2)
fw = open('neg2.txt', 'a')
for line in data2:
    # keep only CJK characters (U+4E00..U+9FFF) from the raw review text
    data = ''.join(re.findall(u'[\u4e00-\u9fff]+', line.decode('utf-8', 'ignore')))
    # jieba word segmentation, then join the tokens with spaces
    seg = jieba.cut(data)
    data3 = " ".join(seg).encode('utf-8')
    fw.write(data3 + '\n')
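The script above runs under Python 2 only (print statement, explicit decode/encode on byte strings). As a point of comparison, a minimal Python 3 sketch of the same preprocessing, assuming the same neg.txt input and neg2.txt output, could look like this:

# Hypothetical Python 3 version of the preprocessing above; not part of the commit.
import re
import jieba

CJK = re.compile(u'[\u4e00-\u9fff]+')  # CJK Unified Ideographs only

with open('neg.txt', encoding='utf-8', errors='ignore') as fin, \
        open('neg2.txt', 'a', encoding='utf-8') as fout:
    for line in fin:
        text = ''.join(CJK.findall(line))    # drop everything that is not a Chinese character
        tokens = jieba.cut(text)             # jieba word segmentation
        fout.write(' '.join(tokens) + '\n')  # one space-separated review per line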

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/neg.txt

Lines changed: 18576 additions & 0 deletions
Large diffs are not rendered by default.

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/neg2.txt

Lines changed: 18576 additions & 0 deletions
Large diffs are not rendered by default.

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/pos.txt

Lines changed: 16548 additions & 0 deletions
Large diffs are not rendered by default.

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/chinese/pos2.txt

Lines changed: 16548 additions & 0 deletions
Large diffs are not rendered by default.

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/rt-polaritydata/rt-polarity.neg

Lines changed: 5331 additions & 0 deletions
Large diffs are not rendered by default.

Part2_Text_Classify/cnn-text-classification-tf-chinese/data/rt-polaritydata/rt-polarity.pos

Lines changed: 5331 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
import numpy as np
import re
import itertools
import codecs
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(codecs.open("./data/chinese/pos.txt", "r", "utf-8").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(codecs.open("./data/chinese/neg.txt", "r", "utf-8").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words (character-level split for the Chinese data)
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    x_text = [list(s) for s in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
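For context, here is a minimal sketch of how these helpers are typically driven from a training script. The module name data_helpers, the 10% dev split, batching over row indices, and the batch_size/num_epochs values are assumptions for illustration, not taken from this commit:

# Illustrative driver for the helpers above. The module name (data_helpers),
# the dev-split size, and the hyperparameters are assumptions, not from this commit.
import numpy as np
import data_helpers

x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

# Shuffle once, then hold out the last 10% of examples as a dev set
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]
dev_size = len(y) // 10
x_train, x_dev = x_shuffled[:-dev_size], x_shuffled[-dev_size:]
y_train, y_dev = y_shuffled[:-dev_size], y_shuffled[-dev_size:]

# batch_iter reshuffles every epoch and yields slices; iterating over row
# indices keeps x and y aligned without copying the padded sentences around.
batches = data_helpers.batch_iter(list(range(len(y_train))), batch_size=64, num_epochs=10)
for batch_indices in batches:
    x_batch = x_train[batch_indices]
    y_batch = y_train[batch_indices]
    # feed x_batch / y_batch to one CNN training step here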
Binary file not shown (306 KB).
Binary file not shown (7.24 MB).

0 commit comments
