
Commit 882a1c7

bansal-vansh and vanshbansal1505 authored
Added GloVe_tf (pclubiitk#32)
* Added GloVe_tf
* GloVe commit rectified
* slight error in usage rectified
* updated dataloader.py removed repetitive part.

Co-authored-by: vanshbansal1505 <[email protected]>
1 parent b20cce9 commit 882a1c7

File tree

6 files changed: +510 -0 lines changed


NLP/GloVe_Tensorflow/dataloader.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""dataloader.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10JjHEWNTDClqlbw5d1vzctN1Zsewo-dX
"""

import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Downloading the dataset:
# !kaggle datasets download -d harmanpreet93/hotelreviews
# Unzip the dataset and keep it in a folder named hotelreviews.

def tokenizeData(indv_lines):
    """Tokenize each review into a list of lowercased word tokens."""
    review_data_list = list()
    for line in indv_lines:
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(line)

        words = [word.lower() for word in tokens]

        # Optional stop-word removal:
        # stop_word_list = set(stopwords.words('english'))
        # words = [w for w in words if w not in stop_word_list]

        review_data_list.append(words)

    return review_data_list

def tokenized_dataLoader():
    """Load the hotel-reviews CSV and return the first reviews as token lists."""
    hotel_data = pd.read_csv('~/hotelreviews/hotel-reviews.csv')
    hotel_data = hotel_data['Description'].tolist()
    hotel_data = hotel_data[0:100]  # you can increase the upper limit depending on your RAM size

    indv_lines = hotel_data

    return tokenizeData(indv_lines)
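
A quick way to sanity-check the loader (a hypothetical usage sketch, not part of this commit; it assumes the Kaggle CSV sits at ~/hotelreviews/hotel-reviews.csv):

# Hypothetical usage sketch, not included in this commit.
from dataloader import tokenized_dataLoader

reviews = tokenized_dataLoader()
print(len(reviews))       # 100 reviews with the default slice
print(reviews[0][:10])    # first ten lowercased tokens of the first review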

NLP/GloVe_Tensorflow/evaluate.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
"""eval.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pFfVgzJrKZNxFqCTFYbz796Xb77Skzsg
"""

import argparse
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def are_Similar(word1, word2, model):
    """Cosine similarity between the embeddings of word1 and word2."""
    word1_embed = model.embedding_for(word1)
    word2_embed = model.embedding_for(word2)

    similar = np.dot(word1_embed, word2_embed.T) / np.sqrt(
        np.abs(np.dot(word1_embed, word1_embed.T)) * np.abs(np.dot(word2_embed, word2_embed.T)))
    return similar

def get_ClosestWords(target, model, num_required_words=10):
    """Return the num_required_words words closest to target by cosine similarity."""
    top_words = list()
    for word in model.words:
        top_words.append([word, are_Similar(target, word, model)])
    top_words.sort(key=lambda x: x[1], reverse=True)
    return top_words[:num_required_words]

def analogy(word1, word2, word3, model, num_required_words=10):
    """Return the words whose embeddings are closest to word1 - word2 + word3."""
    vocab = model.words
    word4_embed = model.embedding_for(word1) - model.embedding_for(word2) + model.embedding_for(word3)

    top_words = list()
    for word in vocab:
        word_embed = model.embedding_for(word)
        similar = np.dot(word_embed, word4_embed.T) / np.sqrt(
            np.abs(np.dot(word_embed, word_embed.T)) * np.abs(np.dot(word4_embed, word4_embed.T)))
        top_words.append([word, similar])
    top_words.sort(key=lambda x: x[1], reverse=True)
    return top_words[:num_required_words]
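
All three helpers expect an already fitted and trained GloVeModel (as built in main.py below). A minimal usage sketch, assuming such a model exists and that the example words are in its vocabulary:

# Hypothetical usage sketch, not part of this commit.
print(are_Similar("hotel", "room", model))        # cosine similarity, roughly in [-1, 1]
print(get_ClosestWords("hotel", model, 5))        # five most similar words and their scores
print(analogy("man", "boy", "woman", model, 5))   # words closest to man - boy + woman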

NLP/GloVe_Tensorflow/main.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
"""main.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eP_TTxxIGyv30xDs6dl8i5k1QIXyGvUa
"""

from model import GloVeModel
from dataloader import tokenized_dataLoader
import argparse
from numpy import save, load
from evaluate import are_Similar, get_ClosestWords, analogy
from collections import OrderedDict

def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--embedding_size', type=int, default=100)
    parser.add_argument('--context_size', type=int, default=10)
    parser.add_argument('--mode', type=str, default="train")

    parser.add_argument('--word1', type=str, default="man")
    parser.add_argument('--word2', type=str, default="boy")
    parser.add_argument('--word', type=str, default="woman")

    args = parser.parse_args()

    evaluation_words = OrderedDict([
        ('word1', args.word1),
        ('word2', args.word2),
        ('word', args.word),
    ])

    arguments = OrderedDict([
        ('evaluation_words', evaluation_words),
        ('context_size', args.context_size),
        ('embedding_size', args.embedding_size),
        ('epochs', args.epochs),
        ('mode', args.mode),
    ])

    return arguments

arguments = parse_args()

context_size = arguments['context_size']
embedding_size = arguments['embedding_size']
evaluation_words = arguments['evaluation_words']
epochs = arguments['epochs']
mode = arguments['mode']

# The model is fit and trained before any mode-specific evaluation.
corpus = tokenized_dataLoader()

model = GloVeModel(embedding_size=embedding_size, context_size=context_size)
model.fit_to_corpus(corpus)
model.train(num_epochs=epochs)

if mode == "plotEmbeddings":
    model.generate_tsne()

if mode == "help":
    print("$ python3 main.py --embedding_size 100 --context_size 10\n")
    print("$ python3 main.py --mode \"are_Similar\" --word1 \"man\" --word2 \"boy\"\n")
    print("$ python3 main.py --mode \"get_ClosestWords\" --word \"man\"\n")
    print("$ python3 main.py --mode \"analogy\" --word1 \"man\" --word2 \"boy\" --word \"woman\"\n")
    print("$ python3 main.py --mode \"help\"")
    print("$ python3 main.py --mode \"wordIsInVocab\" --word \"man\"")

else:
    if mode == "are_Similar":
        word1 = evaluation_words['word1']
        word2 = evaluation_words['word2']

        print(are_Similar(word1, word2, model))

    if mode == "get_ClosestWords":
        word = evaluation_words['word']

        print(get_ClosestWords(word, model))

    if mode == "analogy":
        word1 = evaluation_words['word1']
        word2 = evaluation_words['word2']
        word3 = evaluation_words['word']

        print(analogy(word1, word2, word3, model))
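
main.py imports numpy's save and load but never calls them, so trained embeddings are recomputed on every run. A sketch of how they could be used to cache embeddings (an assumption, not something this commit implements):

# Hypothetical embedding cache; the file name and flow are assumptions.
from numpy import save, load

save('embeddings.npy', model.embeddings)   # persist after training
cached = load('embeddings.npy')            # reload later without retraining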

NLP/GloVe_Tensorflow/model.py

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rlEs6_Z_XKP2BLdrHAS27MY986dzzEPS
"""

from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow.compat.v1 as tf
from utils import _context_windows, _window, _device_for_node, _batchify, _plot_with_labels

# These exception types are referenced below but not defined or imported elsewhere in this diff.
class NotFitToCorpusError(Exception):
    pass

class NotTrainedError(Exception):
    pass

class GloVeModel():
    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for region in corpus:
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                for i, context_word in enumerate(l_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)

        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def __build_graph(self):
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

            # GloVe weighting function: f(X_ij) = min(1, (X_ij / x_max)^alpha)
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

            # Squared error term: (w_i . w_j + b_i + b_j - log X_ij)^2
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            if should_write_summaries:
                print("Writing TensorBoard summaries to {}".format(log_dir))
                summary_writer = tf.summary.FileWriter(log_dir, graph=session.graph)
            tf.global_variables_initializer().run()
            for epoch in range(num_epochs):
                shuffle(batches)
                for batch_index, batch in enumerate(batches):
                    i_s, j_s, counts = batch
                    if len(counts) != self.batch_size:
                        continue
                    feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}
                    session.run([self.__optimizer], feed_dict=feed_dict)
                    if should_write_summaries and (total_steps + 1) % summary_batch_interval == 0:
                        summary_str = session.run(self.__summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, total_steps)
                    total_steps += 1
                if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                    current_embeddings = self.__combined_embeddings.eval()
                    output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                    self.generate_tsne(output_path, embeddings=current_embeddings)
            self.__embeddings = self.__combined_embeddings.eval()
            if should_write_summaries:
                summary_writer.close()

    def embedding_for(self, word_str_or_id):
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        elif isinstance(word_str_or_id, int):
            return self.embeddings[word_str_or_id]

    def __prepare_batches(self):
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        return len(self.__words)

    @property
    def words(self):
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)
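
model.py relies on helpers imported from utils, one of the six files in this commit but not shown in this excerpt. Minimal versions consistent with how _context_windows and _batchify are called above might look like the sketch below; the actual utils.py may differ:

# Hypothetical sketches of the utils helpers used above; the real utils.py may differ.
def _window(region, start_index, end_index):
    # Words in region[start_index:end_index + 1], clipped to the region bounds.
    last_index = len(region) + 1
    return region[max(start_index, 0):min(end_index, last_index) + 1]

def _context_windows(region, left_size, right_size):
    # Yield (left context, focal word, right context) for every position in the region.
    for i, word in enumerate(region):
        left_context = _window(region, i - left_size, i - 1)
        right_context = _window(region, i + 1, i + right_size)
        yield (left_context, word, right_context)

def _batchify(batch_size, *sequences):
    # Slice parallel sequences into fixed-size batches.
    for i in range(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i + batch_size] for sequence in sequences)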
