tfidf_impl.py
"""Tfidf Implementation."""
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter


class CustomTfidfVectorizer:
    """Custom Tfidf Vectorizer."""

    def __init__(self):
        # Store the idf of every distinct word in the corpus
        self.idf_dict = {}
        # Store the column index of every distinct word, in sorted order
        self.word_idx = {}
    def _term_freq(self, document: str):
        """Compute the term frequency of each word in a document."""
        # Split on spaces
        doc = document.split(" ")
        doc_len = len(doc)
        # Raw counts, normalized by document length
        tf = dict(Counter(doc))
        for word in tf:
            tf[word] = tf[word] / doc_len
        return tf
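
    # Worked example (illustrative note, not in the original file):
    # _term_freq("a b a") returns {"a": 2/3, "b": 1/3}. Dividing by
    # doc_len rescales each document by a constant factor, which the l2
    # normalization in transform() cancels out, so it does not change
    # the final tfidf matrix.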

    def _inv_doc_freq(self, corpus: list):
        """Compute the inverse document frequency of every word in the corpus."""
        idf = {}
        corpus_len = len(corpus)
        # Count the number of documents in which each word occurs
        for document in corpus:
            doc = set(document.split(" "))
            for word in doc:
                if word not in idf:
                    idf[word] = 1
                else:
                    idf[word] += 1
        # Use smoothed idf, matching sklearn's default (smooth_idf=True):
        # plain idf[word] = log(N/n), where N is the total number of
        # documents and n is the number of documents containing the word;
        # smoothed_idf[word] = 1 + log((1 + N) / (1 + n)) avoids
        # division by zero and log(0).
        for word in idf:
            idf[word] = 1 + np.log((1 + corpus_len) / (1 + idf[word]))
        sorted_words = sorted(idf)
        self.idf_dict = {word: idf[word] for word in sorted_words}
        self.word_idx = {word: idx for idx, word in enumerate(sorted_words)}
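
    # Quick sanity check of the formula (illustrative, not in the
    # original file): with N = 4 documents, a word present in all 4 gets
    # idf = 1 + log(5/5) = 1.0, while a word present in only one gets
    # idf = 1 + log(5/2) ≈ 1.92.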

    def fit(self, corpus: list):
        """Fit the vectorizer: learn the idf of every word in the corpus."""
        self._inv_doc_freq(corpus=corpus)

    def transform(self, corpus: list):
        """Transform the corpus into a normalized tfidf matrix."""
        corpus_len = len(corpus)
        # Row index of each nonzero entry (one row per document)
        rows = []
        # Column index of each nonzero entry (one column per word)
        columns = []
        # The tfidf value of each nonzero entry
        tfidf_values = []
        for row_idx, doc in enumerate(corpus):
            # Compute term frequency
            tf = self._term_freq(doc)
            for word, freq in tf.items():
                if word in self.idf_dict:
                    # Compute tfidf
                    tfidf = freq * self.idf_dict[word]
                    if tfidf != 0:
                        col_idx = self.word_idx[word]
                        rows.append(row_idx)
                        columns.append(col_idx)
                        tfidf_values.append(tfidf)
                else:
                    print(f"{word} not present in vocabulary")
        # Create the sparse matrix from (data, (row, col)) triplets
        sparse_mat = csr_matrix((tfidf_values, (rows, columns)),
                                shape=(corpus_len, len(self.idf_dict)))
        # l2-normalize each row so every document becomes a unit vector
        norm_sparse_mat = normalize(sparse_mat, norm='l2', axis=1, copy=True)
        return norm_sparse_mat
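
    # Construction sketch (illustrative): with rows = [0, 0],
    # columns = [2, 5] and tfidf_values = [0.4, 0.1], csr_matrix places
    # 0.4 at (0, 2) and 0.1 at (0, 5); every entry not listed in the
    # triplets stays an implicit zero.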

    def fit_transform(self, corpus: list):
        """Fit and transform the corpus."""
        self.fit(corpus=corpus)
        return self.transform(corpus=corpus)

    def get_feature_names_out(self):
        """Get all the unique words in the corpus, in sorted order."""
        return np.array(list(self.idf_dict.keys()))


corpus = [
    'this is the first document here',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

# Custom Tfidf
cf = CustomTfidfVectorizer()
cf_mat = cf.fit_transform(corpus=corpus)
cf_feat = cf.get_feature_names_out()

# Sklearn tfidf
sf = TfidfVectorizer()
sf_mat = sf.fit_transform(corpus)
sf_feat = sf.get_feature_names_out()

print(cf_feat)
print(sf_feat)
try:
    np.testing.assert_array_equal(cf_feat, sf_feat)
    print("Features matched.")
except AssertionError:
    print("Features did not match.")
print(cf_mat)
print('--------------------')
print(sf_mat)
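
# A further check (a sketch, not part of the original file): on this corpus
# both vectorizers build the same sorted vocabulary, and the per-document
# length scaling in _term_freq cancels under l2 normalization, so the two
# matrices should also agree numerically.
try:
    np.testing.assert_allclose(cf_mat.toarray(), sf_mat.toarray(), atol=1e-8)
    print("Tfidf matrices matched.")
except AssertionError:
    print("Tfidf matrices did not match.")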