# similarity.py
import csv
import io
import math

import nltk
import requests
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from pdfminer.high_level import extract_text
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Download the NLTK resources the script relies on
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
n = 0
# Example IPFS links to PDF files:
# ipfs_link1 = "https://gateway.lighthouse.storage/ipfs/QmXVPN7KubJF7YNLB3SDZG3NhpWbFstegmtAzZztqWbJwC"
# ipfs_link2 = "https://gateway.lighthouse.storage/ipfs/QmNxQDPZmthPKxbKwGcx4xXt2b2jMG1JzmDMFd8FzdA6yB"
# ipfs_link3 = "https://gateway.lighthouse.storage/ipfs/QmP3SS5JwVLL36uSZSCVWm8N81RGPFVAG88ZVoKcPKDig4"
# Read the IPFS links from the CSV file
with open('ipfs_links.csv', 'r') as file:
    # Create a CSV reader object
    reader = csv.reader(file)
    # Collect the IPFS links and count the documents
    links = []
    for row in reader:
        # The link is in the third column of each row
        links.append(row[2])
        n += 1
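
# Assumed layout of ipfs_links.csv (inferred from the indexing in this script,
# not documented in the original): column 0 holds the serial number, column 1
# the document ID, and column 2 the IPFS gateway link, with no header row.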
text = []
# Fetch each PDF from IPFS into memory and extract its text
for link in links:
    print(link)
    response = requests.get(link)
    memory_file = io.BytesIO(response.content)
    # Extract text from the PDF
    text.append(extract_text(memory_file))
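
# A possible hardening, not in the original script: fail fast on bad gateway
# responses before handing the bytes to pdfminer, e.g.
#     response = requests.get(link, timeout=60)
#     response.raise_for_status()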
# Remove stopwords (gensim's built-in STOPWORDS list) from each document
for i in range(len(text)):
    if i == 0:
        print(text[i].split())  # debug: inspect the tokens of the first document
    text[i] = remove_stopwords(text[i])
lemmatizer = WordNetLemmatizer()
# Map an NLTK (Penn Treebank) POS tag to the corresponding WordNet POS tag
def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
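
# For example, the Penn Treebank tag 'VBD' starts with 'V' and maps to
# wordnet.VERB, while a tag like 'DT' matches no branch and returns None,
# in which case the word is left unlemmatized below.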
lemmatized = []
# Lemmatize each document in the text array
for sentence in text:
    # Tokenize the document into words
    words = nltk.word_tokenize(sentence)
    # Get the NLTK POS tags for the words
    pos_tags = nltk.pos_tag(words)
    # Map the NLTK POS tags to WordNet POS tags
    wordnet_tags = [(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # Lemmatize each word using its WordNet POS tag (if available)
    lemmatized_words = [lemmatizer.lemmatize(word, tag) if tag else word
                        for word, tag in wordnet_tags]
    # Join the lemmatized words back into a single string
    lemmatized.append(' '.join(lemmatized_words))
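
# The POS tag matters here: lemmatizer.lemmatize('running', wordnet.VERB)
# yields 'run', whereas the default noun lookup would leave 'running' as-is.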
# the texts to analyze
texts = list(lemmatized)
# number of topics to extract from each document
num_topics = 70
# number of words to keep from each topic
num_words = 70
lda_models = []
for doc in texts:
    # tokenize the document and remove stopwords
    tokens = [token for token in simple_preprocess(doc) if token not in STOPWORDS]
    # create a dictionary from the tokens
    dictionary = Dictionary([tokens])
    # convert the tokens to a bag-of-words representation
    bow_corpus = [dictionary.doc2bow(tokens)]
    # train an LDA model on this single-document corpus
    lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary)
    lda_models.append(lda_model)
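
# Design note: the script trains a separate LDA model per document on a
# one-document corpus, so topics capture each document's own vocabulary rather
# than corpus-wide themes. The more common setup would be a single model over
# a shared Dictionary of all documents; comparing per-document word-weight
# vectors, as done below, sidesteps aligning topics across separate models.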
# Aggregate each document's topic words into a word -> weight dictionary
topics_dict = {}
for i in range(n):
    topics_dict[i+1] = {}
    # print_topics returns strings like '0.015*"word" + 0.012*"other" + ...'
    for idx, topic in lda_models[i].print_topics(-1, num_words=num_words):
        words_weights = topic.split("+")
        # loop over each word and its weight
        for word_weight in words_weights:
            word = word_weight.split("*")[1].replace('"', '').strip()
            weight = float(word_weight.split("*")[0])
            # accumulate the word's weight across all topics of this document
            if word in topics_dict[i+1]:
                topics_dict[i+1][word] += weight
            else:
                topics_dict[i+1][word] = weight
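
# For instance, a topic string '0.015*"ledger" + 0.009*"token"' (illustrative
# words) contributes {'ledger': 0.015, 'token': 0.009}; a word that appears in
# several topics has its weights summed.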
def cosine_similarity(dict1, dict2):
    # keys that appear in both dictionaries
    keys = set(dict1.keys()) & set(dict2.keys())
    # dot product over the shared keys
    dot_product = sum(dict1[key] * dict2[key] for key in keys)
    # magnitudes of the two weight vectors
    magnitude1 = math.sqrt(sum(value**2 for value in dict1.values()))
    magnitude2 = math.sqrt(sum(value**2 for value in dict2.values()))
    # cosine similarity, guarding against zero-length vectors
    if magnitude1 == 0 or magnitude2 == 0:
        return 0
    return dot_product / (magnitude1 * magnitude2)
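
# Worked example: for dict1 = {'a': 0.6, 'b': 0.8} and dict2 = {'a': 0.6},
# the only shared key is 'a', so the dot product is 0.36; the magnitudes are
# 1.0 and 0.6, giving a similarity of 0.36 / (1.0 * 0.6) = 0.6.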
# Compute the pairwise similarity between every pair of documents
cos_sim = {}
for i in range(1, n+1):
    cos_sim[i] = {}
    for j in range(1, n+1):
        if i == j:
            continue
        cos_sim[i][j] = cosine_similarity(topics_dict[i], topics_dict[j])
    # sort document i's neighbours by similarity, highest first
    cos_sim[i] = dict(sorted(cos_sim[i].items(), key=lambda x: x[1], reverse=True))
print(cos_sim)
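
# cos_sim now maps each serial number to its neighbours sorted by similarity,
# e.g. {1: {3: 0.82, 2: 0.41}, ...} (hypothetical values).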
id_mapping = {}
# Re-read the CSV to map each serial number to its document ID
with open('ipfs_links.csv', 'r') as file:
    # csv.reader yields positional rows, so row[0] and row[1] index the
    # serial-number and document-ID columns directly
    reader = csv.reader(file)
    for row in reader:
        id_mapping[int(row[0])] = int(row[1])
# Write the top four most similar documents for each document
with open('similarities.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Doc ID', 'D1', 'Sim1', 'D2', 'Sim2', 'D3', 'Sim3', 'D4', 'Sim4'])
    # Write each document's four nearest neighbours as a new row
    for s_no, doc_dict in cos_sim.items():
        row = [id_mapping[s_no]]
        for j, (key, value) in enumerate(doc_dict.items()):
            if j >= 4:
                break
            # map the neighbour's serial number to its document ID as well
            row += [id_mapping[key], value]
        writer.writerow(row)
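
# Usage sketch (assumptions, not stated in the original): run the script from
# a directory containing a headerless ipfs_links.csv with rows of the form
#     <serial number>,<doc ID>,<ipfs link>
# It downloads each PDF, builds a per-document topic-word weight vector, and
# writes similarities.csv listing each document's four most similar documents.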