# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rlEs6_Z_XKP2BLdrHAS27MY986dzzEPS
"""

from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow.compat.v1 as tf
from utils import _context_windows, _window, _device_for_node, _batchify, _plot_with_labels

# This module relies on graph-mode constructs (tf.placeholder, tf.Session), so
# TF2 eager behavior must be disabled when running under TensorFlow 2.
tf.disable_v2_behavior()


class NotTrainedError(Exception):
    """Raised when embeddings are requested before the model has been trained."""


class NotFitToCorpusError(Exception):
    """Raised when the model is queried before being fit to a corpus."""


class GloVeModel():
    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for region in corpus:
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                for i, context_word in enumerate(l_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)

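        # Keep the vocab_size most frequent words that occur at least
        # min_occurrences times; pairs involving any other word are dropped
        # from the cooccurrence matrix below.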
        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def __build_graph(self):
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], -1.0, 1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], -1.0, 1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

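            # GloVe weighting f(X_ij) = min(1, (X_ij / x_max) ** alpha), with
            # x_max = cooccurrence_cap and alpha = scaling_factor: very frequent
            # pairs are capped at 1 and rare pairs are down-weighted.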
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

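            # Squared-error term of the GloVe objective for each pair:
            # (w_i . w~_j + b_i + b~_j - log X_ij) ** 2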
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

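            # As in the GloVe paper, the final word vectors are the sum of the
            # focal and context embeddings.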
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            if should_write_summaries:
                print("Writing TensorBoard summaries to {}".format(log_dir))
                summary_writer = tf.summary.FileWriter(log_dir, graph=session.graph)
            tf.global_variables_initializer().run()
            for epoch in range(num_epochs):
                shuffle(batches)
                for batch_index, batch in enumerate(batches):
                    i_s, j_s, counts = batch
                    if len(counts) != self.batch_size:
                        continue
                    feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}
                    session.run([self.__optimizer], feed_dict=feed_dict)
                    if should_write_summaries and (total_steps + 1) % summary_batch_interval == 0:
                        summary_str = session.run(self.__summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, total_steps)
                    total_steps += 1
                if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                    current_embeddings = self.__combined_embeddings.eval()
                    output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                    self.generate_tsne(output_path, embeddings=current_embeddings)
            self.__embeddings = self.__combined_embeddings.eval()
            if should_write_summaries:
                summary_writer.close()

    def embedding_for(self, word_str_or_id):
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        elif isinstance(word_str_or_id, int):
            return self.embeddings[word_str_or_id]

    def __prepare_batches(self):
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        return len(self.__words)

    @property
    def words(self):
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)
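

# Illustrative usage sketch: it assumes the companion `utils` helpers imported
# above are available on the path; the toy corpus and the hyperparameter values
# below are placeholders chosen only for demonstration.
if __name__ == "__main__":
    toy_corpus = [
        ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
        ["the", "lazy", "dog", "sleeps", "while", "the", "quick", "fox", "runs"],
    ]
    # batch_size is kept small so the tiny corpus still yields at least one full
    # batch (train() skips batches smaller than batch_size).
    model = GloVeModel(embedding_size=10, context_size=3, min_occurrences=1, batch_size=4)
    model.fit_to_corpus(toy_corpus)
    model.train(num_epochs=20)
    print(model.embedding_for("fox"))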