import numpy as np
import re
import itertools
import codecs
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Plain "(", ")" and "?" in the replacement strings: escaping them there
    # would leave a literal backslash in the output
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
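
# Example (illustrative): clean_str("Isn't it great (really)?")
# returns "is n't it great ( really ) ?"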


def load_data_and_labels():
    """
    Loads polarity data from files, splits each sentence into characters and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    with codecs.open("./data/chinese/pos.txt", "r", "utf-8") as f:
        positive_examples = [s.strip() for s in f.readlines()]
    with codecs.open("./data/chinese/neg.txt", "r", "utf-8") as f:
        negative_examples = [s.strip() for s in f.readlines()]
    # Split into characters: the data is Chinese, so character-level tokens are
    # used instead of clean_str's word-level tokenization
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    x_text = [list(s) for s in x_text]

    # Generate one-hot labels: [0, 1] for positive, [1, 0] for negative
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
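
# Note (illustrative): a line such as "很好" becomes the character list
# ["很", "好"]; positive lines get label [0, 1], negative lines [1, 0]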


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        padded_sentences.append(sentence + [padding_word] * num_padding)
    return padded_sentences
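
# Example (illustrative): pad_sentences([["a", "b"], ["c"]]) returns
# [["a", "b"], ["c", "<PAD/>"]]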


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
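
# Example (illustrative): build_vocab([["a", "b", "a"]]) returns
# [{"a": 0, "b": 1}, ["a", "b"]] -- indices follow descending frequency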


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]
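
# Example (illustrative): build_input_data([["b", "a"]], [[0, 1]], {"a": 0, "b": 1})
# returns [np.array([[1, 0]]), np.array([[0, 1]])]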


def load_data():
    """
    Loads and preprocesses data for the polarity dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
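
# Shape sketch (hypothetical numbers): with 10,000 sentences whose longest has
# 50 characters, x has shape (10000, 50) and y has shape (10000, 2)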


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: yields a final partial batch, but avoids an empty extra
    # batch when data_size is an exact multiple of batch_size
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
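

if __name__ == "__main__":
    # Minimal smoke test -- a sketch that assumes ./data/chinese/pos.txt and
    # ./data/chinese/neg.txt exist as described above
    x, y, vocabulary, vocabulary_inv = load_data()
    print("x shape:", x.shape)
    print("y shape:", y.shape)
    print("vocabulary size:", len(vocabulary))
    # Pull two batches from the iterator to check batching behaviour
    for i, batch in enumerate(batch_iter(x, batch_size=64, num_epochs=1)):
        print("batch", i, "shape", batch.shape)
        if i >= 1:
            break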