-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWordEmbeddings.py
98 lines (75 loc) · 2.53 KB
/
WordEmbeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import math
import random
import sys
import gensim
import warnings
import re
from itertools import chain
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import wordnet
from nltk.corpus import stopwords
def main():
    """Run the embedding pipeline: load the SMS data, attach word
    vectors, and print one sample as a sanity check."""
    data = load_file()
    data = get_vectors(data)
    first = data[0]
    print(first.label, first.words, len(first.vectors), len(first.vectors[0]))
def get_vectors(dataset):
    """Attach a word2vec embedding to every word of every sentence.

    Lookup strategy per word:
      1. use the word's own word2vec vector if present in the model;
      2. otherwise try its WordNet synonyms and use the first one that
         word2vec knows;
      3. otherwise (ConceptNet would be the proper next resort) fall back
         to a random vector from the model so every word still gets a
         vector and `words`/`vectors` stay the same length.

    Args:
        dataset: list of SMS_data objects with .words populated.

    Returns:
        The same list; each item's .vectors is filled in place with one
        vector per word.
    """
    # NOTE(review): model.vocab / model.index2entity are the gensim 3.x
    # API; gensim 4 renamed them key_to_index / index_to_key — confirm
    # the installed version matches.
    model = KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True, limit=600000)
    unknown_count = 0  # words missing from both word2vec and WordNet
    word_count = 0     # total words processed
    for sentence in dataset:
        for word in sentence.words:
            word_count += 1
            # Unique lemma names across every synset of this word.
            synonyms = list(set(chain.from_iterable(
                syn.lemma_names() for syn in wordnet.synsets(word))))
            vector = None
            if word in model.vocab:
                # Word exists in word2vec: use its vector directly.
                vector = model[word]
            else:
                # Fall back to the first WordNet synonym known to word2vec.
                for candidate in synonyms:
                    if candidate in model.vocab:
                        vector = model[candidate]
                        break
            if vector is None:
                # Neither the word nor any synonym is in the model.
                # BUGFIX: the original appended nothing when synonyms
                # existed but none were in the vocab, silently dropping
                # the word from .vectors; always append a fallback.
                unknown_count += 1
                vector = model[random.choice(model.index2entity)]
            sentence.vectors.append(vector)
    # print('\nun-identified words:', unknown_count, '\ntotal words:', word_count)
    return dataset
def load_file(limit=2274):
    """Load the SMS spam dataset into a list of SMS_data objects.

    Each non-empty line of the file is "label<whitespace>text"; the label
    ('spam'/'ham') and the processed token list are stored on an SMS_data.

    Args:
        limit: maximum number of records to return (default 2274,
            matching the original hard-coded truncation).

    Returns:
        list[SMS_data]: at most `limit` records, in file order.
    """
    file_names = ['datasets/smsspamcollection/SMSSpamCollection']
    file_index = 0
    dataset = []
    # Explicit encoding: the dataset is plain text; relying on the
    # platform default encoding is fragile.
    with open(file_names[file_index], encoding='utf-8') as f:
        for line in f:
            words = line.split()
            if not words:
                continue  # skip blank lines instead of crashing on words[0]
            record = SMS_data()
            record.label = words[0]  # label of the text (spam or ham)
            record.words = text_processing(' '.join(words[1:]))  # token list
            dataset.append(record)
    print(len(dataset))
    return dataset[:limit]
# Lazily-built cache of English stop words; stopwords.words() re-reads
# the corpus file from disk on every call, and text_processing runs once
# per dataset line.
_STOPWORDS = None


def text_processing(text):
    """Normalize raw SMS text into a list of content tokens.

    Replaces every run of non-alphanumeric characters with a space,
    lowercases, splits on whitespace, and drops English stop words.

    Args:
        text: raw message text.

    Returns:
        list[str]: lowercase tokens with stop words removed.
    """
    global _STOPWORDS
    if _STOPWORDS is None:
        _STOPWORDS = set(stopwords.words('english'))
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', text)  # removes special characters
    return [tok for tok in cleaned.lower().split() if tok not in _STOPWORDS]
class SMS_data:
    """Container for one SMS message and its derived features."""

    def __init__(self):
        # 'spam' or 'ham' — first token of the dataset line.
        self.label = None
        # List of processed tokens produced by text_processing().
        self.words = None
        # One embedding per word, appended later by get_vectors().
        self.vectors = []

    def __repr__(self):
        # Summarize vectors by count; the raw arrays are too big to print.
        return 'SMS_data(label=%r, words=%r, vectors=%d)' % (
            self.label, self.words, len(self.vectors))
# Entry-point guard: run the pipeline only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()