# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rlEs6_Z_XKP2BLdrHAS27MY986dzzEPS
"""

from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow.compat.v1 as tf
from utils import _context_windows, _window, _device_for_node, _batchify, _plot_with_labels

# This module relies on graph-mode constructs (tf.placeholder, tf.Session), so
# TF2 eager behavior must be disabled when running under TensorFlow 2.
tf.disable_v2_behavior()


class NotTrainedError(Exception):
    """Raised when embeddings are requested before the model has been trained."""


class NotFitToCorpusError(Exception):
    """Raised when the model is queried before being fit to a corpus."""


class GloVeModel():
    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for region in corpus:
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                for i, context_word in enumerate(l_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)

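        # Keep the vocab_size most frequent words that occur at least
        # min_occurrences times; pairs involving any other word are dropped
        # from the cooccurrence matrix below.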
        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def __build_graph(self):
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], -1.0, 1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], -1.0, 1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

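            # GloVe weighting f(X_ij) = min(1, (X_ij / x_max) ** alpha), with
            # x_max = cooccurrence_cap and alpha = scaling_factor: very frequent
            # pairs are capped at 1 and rare pairs are down-weighted.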
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

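            # Squared-error term of the GloVe objective for each pair:
            # (w_i . w~_j + b_i + b~_j - log X_ij) ** 2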
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

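            # As in the GloVe paper, the final word vectors are the sum of the
            # focal and context embeddings.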
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            if should_write_summaries:
                print("Writing TensorBoard summaries to {}".format(log_dir))
                summary_writer = tf.summary.FileWriter(log_dir, graph=session.graph)
            tf.global_variables_initializer().run()
            for epoch in range(num_epochs):
                shuffle(batches)
                for batch_index, batch in enumerate(batches):
                    i_s, j_s, counts = batch
                    if len(counts) != self.batch_size:
                        continue
                    feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}
                    session.run([self.__optimizer], feed_dict=feed_dict)
                    if should_write_summaries and (total_steps + 1) % summary_batch_interval == 0:
                        summary_str = session.run(self.__summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, total_steps)
                    total_steps += 1
                if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                    current_embeddings = self.__combined_embeddings.eval()
                    output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                    self.generate_tsne(output_path, embeddings=current_embeddings)
            self.__embeddings = self.__combined_embeddings.eval()
            if should_write_summaries:
                summary_writer.close()

    def embedding_for(self, word_str_or_id):
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        elif isinstance(word_str_or_id, int):
            return self.embeddings[word_str_or_id]

    def __prepare_batches(self):
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        return len(self.__words)

    @property
    def words(self):
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)
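

# Illustrative usage sketch: it assumes the companion `utils` helpers imported
# above are available on the path; the toy corpus and the hyperparameter values
# below are placeholders chosen only for demonstration.
if __name__ == "__main__":
    toy_corpus = [
        ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
        ["the", "lazy", "dog", "sleeps", "while", "the", "quick", "fox", "runs"],
    ]
    # batch_size is kept small so the tiny corpus still yields at least one full
    # batch (train() skips batches smaller than batch_size).
    model = GloVeModel(embedding_size=10, context_size=3, min_occurrences=1, batch_size=4)
    model.fit_to_corpus(toy_corpus)
    model.train(num_epochs=20)
    print(model.embedding_for("fox"))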