
Commit 08a0b56

Added Word2Vec in Tensorflow (pclubiitk#11)
1 parent e7e9205 commit 08a0b56

File tree

8 files changed, +440 -0 lines changed


NLP/word2vec/.gitignore

Lines changed: 1 addition & 0 deletions
__pycache__

NLP/word2vec/README.md

Lines changed: 110 additions & 0 deletions
# TensorFlow Implementation of Word2Vec (dataset from Kaggle)

## Usage
### To train
```bash
$ python3 main.py --epochs 100 --optimizer "adam" --batch_size 2000 --dim_embedding 100
```
### To get the similarity between two words, word1 and word2 (`getSimilarity`)
```bash
$ python3 main.py --mode "getSimilarity" --word1 "window" --word2 "house"
```
### To get the ten closest words to a given word (`getTenClosestWords`)
```bash
$ python3 main.py --mode "getTenClosestWords" --word "window"
```
### To solve an analogy, word1_ : word2_ :: word3_ : word4 (`analogy`)
```bash
$ python3 main.py --mode "analogy" --word1_ "window" --word2_ "house" --word3_ "door"
```
### To plot the embeddings in 2D
```bash
$ python3 main.py --mode "plot"
```
## References
* [Stanford CS224n](http://web.stanford.edu/class/cs224n/)
* [Stanford Word2Vec Notes](http://web.stanford.edu/class/cs224n/readings/cs224n-2019-notes01-wordvecs1.pdf)
* [Original Word2Vec paper](http://arxiv.org/pdf/1301.3781.pdf)
* [Google Word2Vec paper which suggested improvements in training using negative sampling and sub-sampling](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)

## Contributed by:
* [Aashish Patel](https://github.com/aashishpiitk/)

# Summary

Word2Vec is a model in which a network is trained to represent each word in the text corpus as an embedding, i.e. a vector of real numbers.
These embeddings can be used to perform a variety of tasks, such as:
```
• finding the similarity between two words
• searching for the ten words most similar to a given word
• finding analogies
```
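For a rough feel of how such an embedding matrix is queried, here is a small stand-alone NumPy sketch. The tiny vocabulary, the `word_to_id` mapping, and all the numbers are invented for illustration; note also that it uses the textbook cosine similarity, whereas the repository's own `getSimilarity` helper in `evaluation.py` normalizes slightly differently.
```python
import numpy as np

# Toy embedding matrix: one row per word.
# The words, ids and values below are invented purely for illustration.
word_to_id = {"window": 0, "house": 1, "door": 2}
emb = np.array([[0.9, 0.1, 0.3],
                [0.2, 0.8, 0.5],
                [0.7, 0.2, 0.4]])

def cosine_similarity(u, v):
    # Textbook cosine similarity between two embedding vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# Similarity between two words.
print(cosine_similarity(emb[word_to_id["window"]], emb[word_to_id["house"]]))

# Rank the whole (tiny) vocabulary by similarity to "window".
ranked = sorted(word_to_id,
                key=lambda w: cosine_similarity(emb[word_to_id[w]], emb[word_to_id["window"]]),
                reverse=True)
print(ranked)
```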

There are two main approaches to training this model and creating the word embeddings:
```
• Skip-gram
• Continuous Bag of Words (CBOW)
```

### Skip-gram
```
Input – a single word
Output – the probability of each word in the corpus appearing in the context of the given input word
```
### Continuous Bag of Words (CBOW)
```
Input – the context words of a word in a sentence/phrase
Output – a single word (in one-hot encoded form, each value coming from a probability distribution)
Loss – categorical cross-entropy
```
In my current implementation I have used the CBOW approach to train the model.

### Preparing data for CBOW
```
1. converting each word in the vocabulary to a one-hot encoded representation
2. forming a list of (context words, target word) pairs, choosing a suitable window size (see the sketch below)
```
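The actual pairing is done in `utils.py` (`constructBagOfWordsInWindowSize` and `contextPairToOneHot`), which is not shown in this commit view. The sketch below is a simplified stand-alone version under assumed choices: a window size of 2 and a context represented as the sum of its one-hot vectors.
```python
import numpy as np

# Simplified CBOW data preparation; the window size, variable names and the
# context-as-sum-of-one-hots choice are assumptions made for this sketch.
tokenized_data = [["room", "was", "clean", "and", "quiet"]]
window_size = 2

vocab = sorted({w for sentence in tokenized_data for w in sentence})
word_to_id = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = np.zeros(len(vocab), dtype=np.float32)
    vec[word_to_id[word]] = 1.0
    return vec

pairs = []  # (context vector, target one-hot) pairs
for sentence in tokenized_data:
    for i, target in enumerate(sentence):
        context = sentence[max(0, i - window_size):i] + sentence[i + 1:i + 1 + window_size]
        context_vec = np.sum([one_hot(w) for w in context], axis=0)
        pairs.append((context_vec, one_hot(target)))

print(len(pairs))   # one training pair per target word
print(pairs[0][1])  # one-hot vector of the first target word
```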
## Architecture of CBOW
```
1. This is a three-layer neural network, the last layer being the output layer (a minimal sketch is given below)
2. The weights of the first layer are the actual embeddings, which are used in the further tasks
3. The output has the size of the vocabulary, with the entries being the softmax output
4. Categorical cross-entropy loss is used
```
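The repository builds this network from custom layers in `model.py`. As a point of comparison, here is a minimal sketch of the same shape using stock Keras `Dense` layers; `vocab_size` and `embedding_dim` are placeholder values, not numbers taken from the trained model.
```python
from tensorflow import keras

# Minimal CBOW-style network: one-hot/bag-of-words input -> embedding -> softmax
# over the vocabulary. vocab_size and embedding_dim are placeholders.
vocab_size = 5000
embedding_dim = 100

model = keras.Sequential([
    # The weights of this layer are the word embeddings.
    keras.layers.Dense(embedding_dim, use_bias=False, input_shape=(vocab_size,)),
    # Scores over the vocabulary.
    keras.layers.Dense(vocab_size, use_bias=False, activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()
```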
## Instructions for using a custom dataset to train the model
```
1. In the `dataloader.py` file, on the `30th` line (inside `performTokenization()`), change the file name and path (the file must be a `.csv`)
2. On the `31st` line, change the column name to the name of the `column` in your `.csv` file that contains the text
3. On the next line, choose how many example sentences to keep from the `.csv` file. This option is useful when there is not enough RAM on your machine to load all the lines (see the snippet below)
```
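For reference, these are the three lines inside `performTokenization()` as they might look after editing for a hypothetical custom dataset; `my_reviews.csv` and `review_text` are placeholder names, not files that ship with this repository (`pandas` is already imported at the top of `dataloader.py`).
```python
# Inside performTokenization() in dataloader.py (lines 30-32), edited for a
# hypothetical custom dataset; the file and column names are placeholders.
hotel_data = pd.read_csv('./my_reviews.csv')       # line 30: path to your .csv file
hotel_data = hotel_data['review_text'].tolist()    # line 31: the text column of your .csv
hotel_data = hotel_data[0:100]                     # line 32: number of sentences to keep (RAM permitting)
```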
## Instructions for using the Kaggle dataset
```
1. kaggle datasets download -d harmanpreet93/hotelreviews
2. unzip the dataset and keep it in a folder named hotelreviews
3. if you want to change the folder name, follow the guidelines in the section above
```
## Examples
```
getSimilarity("window","door")
result 0.067170754
getSimilarity("window","house")
result 0.029237064
getSimilarity("vegas","girls")
result 0.10303633
getSimilarity("vegas","money")
result 0.22041301
getSimilarity("vegas","gold")
result 0.072522774
getSimilarity("good","bad")
result 0.23856965

getTenClosestWords("water")
result [['guess', 0.30290997], ['disappointed', 0.29180372], ['understand', 0.29148042], ['also', 0.27842966], ['earth', 0.2709463], ['water', 0.26725885], ['one', 0.2629741], ['power', 0.25627777], ['unbelievably', 0.25437462], ['spouse', 0.2514236]]

getTenClosestWords("money")
result [['need', 0.3438718], ['chose', 0.34114993], ['money', 0.332256], ['nearby', 0.3176784], ['think', 0.31027701], ['heading', 0.3087694], ['although', 0.30681318], ['understands', 0.3010662], ['lodging', 0.29836887], ['must', 0.29601774]]
```

NLP/word2vec/dataloader.py

Lines changed: 36 additions & 0 deletions
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Downloading the dataset
# !kaggle datasets download -d harmanpreet93/hotelreviews
# unzip the dataset and keep it in a folder named hotelreviews




def tokenizeData(indv_lines):
    review_data_list = list()
    for line in indv_lines:
        tokenizer = RegexpTokenizer(r'\w+')  # raw string so '\w' is not treated as an escape
        tokens = tokenizer.tokenize(line)

        words = [word.lower() for word in tokens]

        stop_word_list = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_word_list]

        review_data_list.append(words)

    return review_data_list

def performTokenization():
    hotel_data = pd.read_csv('./hotelreviews/hotel-reviews.csv')
    hotel_data = hotel_data['Description'].tolist()
    hotel_data = hotel_data[0:100]  # you can increase the upper limit depending on your RAM size

    indv_lines = hotel_data

    return tokenizeData(indv_lines)

NLP/word2vec/embedding.npy

175 KB
Binary file not shown.

NLP/word2vec/evaluation.py

Lines changed: 49 additions & 0 deletions
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn import preprocessing

def getSimilarity(word1, word2, data, emb):
    # Similarity score between two words: the dot product of their embeddings,
    # scaled by the product of the squared vector norms. (Standard cosine
    # similarity would divide by the norms themselves, via np.linalg.norm.)
    word_to_id = data["word_to_id"]
    word1_emb = emb[word_to_id[word1], :]
    word2_emb = emb[word_to_id[word2], :]

    similarity = np.dot(word1_emb, word2_emb.T) / (np.abs(np.dot(word1_emb, word1_emb.T)) * np.abs(np.dot(word2_emb, word2_emb.T)))
    return similarity

def getSimilarityByEmbedding(emb1, emb2):
    # Same scoring as getSimilarity, but taking the embedding vectors directly.
    similarity = np.dot(emb1, emb2.T) / (np.abs(np.dot(emb1, emb1.T)) * np.abs(np.dot(emb2, emb2.T)))
    return similarity

def getTenClosestWords(search, vocab, data, emb):
    # Score every word in the vocabulary against the search word and keep the top ten.
    topTen = list()
    for word in vocab:
        topTen.append([word, getSimilarity(search, word, data, emb)])
    topTen.sort(key=lambda x: x[1], reverse=True)
    return topTen[:10]

def analogy(word1, word2, word3, data, vocab, emb):
    # word1 : word2 :: word3 : ? -- build the query vector
    # emb(word1) - emb(word2) + emb(word3) and return the ten words whose
    # embeddings score highest against it.
    word_to_id = data["word_to_id"]
    word4_emb = emb[word_to_id[word1], :] - emb[word_to_id[word2], :] + emb[word_to_id[word3], :]

    topTen = list()
    for word in vocab:
        topTen.append([word, getSimilarityByEmbedding(word4_emb, emb[word_to_id[word]])])
    topTen.sort(key=lambda x: x[1], reverse=True)
    return topTen[:10]

def plotEmbeddingsIn2D(emb, data):
    # Project the embeddings to 2D with t-SNE, normalize them, and plot the
    # first 100 vocabulary words with their labels.
    word_to_id = data["word_to_id"]
    vocab = list(data["vocab"])[:100]
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    vectors = model.fit_transform(emb)
    normalizer = preprocessing.Normalizer(norm='l2')
    vectors = normalizer.fit_transform(vectors)
    fig, ax = plt.subplots(figsize=(10, 20))
    for word in vocab:
        print(word, vectors[word_to_id[word]][1])
        # plot the point as well, so the axes autoscale to the data
        ax.scatter(vectors[word_to_id[word]][0], vectors[word_to_id[word]][1], s=5)
        ax.annotate(word, (vectors[word_to_id[word]][0], vectors[word_to_id[word]][1]))
    plt.show()

NLP/word2vec/main.py

Lines changed: 142 additions & 0 deletions
from model import Word2Vec, ScoringLayer, EmbeddingLayer
from utils import constructBagOfWordsInWindowSize, contextPairToOneHot, OneHotOfAllInVocab
from keras.callbacks import TensorBoard
from dataloader import tokenizeData, performTokenization
import argparse
import datetime
from numpy import save, load
from evaluation import getSimilarity, getSimilarityByEmbedding, getTenClosestWords, analogy, plotEmbeddingsIn2D
from collections import OrderedDict

def parse_args():
    parser = argparse.ArgumentParser()

    # optimizer config
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=2000)
    parser.add_argument('--optimizer', type=str, default="adam")
    # model config
    parser.add_argument('--dim_embedding', type=int, default=100)
    # evaluation
    parser.add_argument('--mode', default="train", type=str)
    # getSimilarity
    parser.add_argument('--word1', type=str, default="window")
    parser.add_argument('--word2', type=str, default="house")
    # getTenClosestWords
    parser.add_argument('--word', type=str, default="window")
    # analogy
    parser.add_argument('--word1_', type=str, default="window")
    parser.add_argument('--word2_', type=str, default="house")
    parser.add_argument('--word3_', type=str, default="door")
    # wordIsInVocab
    parser.add_argument('--word_', type=str)

    args = parser.parse_args()

    optim_config = OrderedDict([
        ('epochs', args.epochs),
        ('batch_size', args.batch_size),
        ('optimizer', args.optimizer)
    ])

    model_config = OrderedDict([
        ('dim_embedding', args.dim_embedding)
    ])

    evaluation_config = OrderedDict([
        ('word1', args.word1),
        ('word2', args.word2),
        ('word', args.word),
        ('word1_', args.word1_),
        ('word2_', args.word2_),
        ('word3_', args.word3_),
        ('word_', args.word_),
    ])

    config = OrderedDict([
        ('optim_config', optim_config),
        ('evaluation_config', evaluation_config),
        ('model_config', model_config),
        ('mode', args.mode),
    ])

    return config

config = parse_args()

model_config = config['model_config']
optim_config = config['optim_config']
evaluation_config = config['evaluation_config']
mode = config['mode']
# log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)


tokenized_data = performTokenization()
context_tuple_list = constructBagOfWordsInWindowSize(tokenized_data)
oneHotNumpy, data = contextPairToOneHot(context_tuple_list, tokenized_data)
print("The total number of words in the vocabulary is: ", data["vocabSize"])


if(mode == "train"):
    def train():

        dimensionality_of_embeddings = model_config['dim_embedding']
        optimizer = optim_config['optimizer']
        epochs = optim_config['epochs']
        batch_size = optim_config['batch_size']

        model = Word2Vec(input_dim=data['vocabSize'], units=int(dimensionality_of_embeddings))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        model.fit(oneHotNumpy[:, 0, :], oneHotNumpy[:, 1, :],
                  epochs=epochs,
                  batch_size=batch_size)

        # the weights of the first layer are the word embeddings; save them so
        # the evaluation modes below can load the same file
        emb = model.get_weights()[0]
        save("embedding.npy", emb)

    train()

elif(mode == "help"):
    print("$ python3 main.py --epochs 100 --optimizer \"adam\" --batch_size 2000 --dim_embedding 100\n")
    print("$ python3 main.py --mode \"getSimilarity\" --word1 \"window\" --word2 \"house\"\n")
    print("$ python3 main.py --mode \"getTenClosestWords\" --word \"window\"\n")
    print("$ python3 main.py --mode \"analogy\" --word1_ \"window\" --word2_ \"house\" --word3_ \"door\"\n")
    print("$ python3 main.py --mode \"plot\"")
    print("$ python3 main.py --mode \"help\"")
    print("$ python3 main.py --mode \"wordIsInVocab\" --word_ \"window\"")

else:
    # load the saved embeddings (the pretrained embedding.npy ships with the repo)
    emb = load("embedding.npy")

    if(mode == "getSimilarity"):
        word1 = evaluation_config['word1']
        word2 = evaluation_config['word2']

        print(getSimilarity(word1, word2, data, emb))

    if(mode == "getTenClosestWords"):
        word = evaluation_config['word']

        print(getTenClosestWords(word, data['vocab'], data, emb))

    if(mode == "analogy"):
        word1_ = evaluation_config['word1_']
        word2_ = evaluation_config['word2_']
        word3_ = evaluation_config['word3_']

        print(analogy(word1_, word2_, word3_, data, data['vocab'], emb))

    if(mode == "wordIsInVocab"):
        word_ = evaluation_config['word_']
        vocabList = list(data['vocab'])

        if word_ in vocabList:
            print("YES")
        else:
            print("NO")

    if(mode == "plot"):
        plotEmbeddingsIn2D(emb, data)

NLP/word2vec/model.py

Lines changed: 49 additions & 0 deletions
import tensorflow as tf
from keras import layers
import keras


class EmbeddingLayer(layers.Layer):
    # Maps a one-hot (or bag-of-words) input of size input_dim to an embedding
    # of size units; the weight matrix w holds the word embeddings.

    def __init__(self, units, input_dim):
        super(EmbeddingLayer, self).__init__()

        self.input_dim = input_dim

        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value=w_init(shape=(input_dim, units),
                                                  dtype='float32'),
                             trainable=True,
                             name="emb")

    def call(self, inputs):
        embedding = tf.matmul(inputs, self.w)
        return embedding

class ScoringLayer(layers.Layer):
    # Maps an embedding of size input_dim to a softmax distribution over units outputs.

    def __init__(self, units, input_dim):
        super(ScoringLayer, self).__init__()

        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value=w_init(shape=(input_dim, units),
                                                  dtype='float32'),
                             trainable=True)

    def call(self, inputs):
        output = tf.matmul(inputs, self.w)
        softmax = tf.nn.softmax(output, axis=-1)
        return softmax

class Word2Vec(keras.Model):
    # CBOW Word2Vec: one-hot context -> embedding -> softmax scores over the vocabulary.

    def __init__(self, units, input_dim):
        super(Word2Vec, self).__init__()

        self.embedding = EmbeddingLayer(units, input_dim)
        # note the swapped arguments: the scoring layer takes an embedding of
        # size `units` and produces scores over the `input_dim`-sized vocabulary
        self.scoring = ScoringLayer(input_dim, units)

    def call(self, inputs):
        embedding = self.embedding(inputs)
        output = self.scoring(embedding)
        return output

0 commit comments
