1
+ import sqlite3
2
+ import json
3
+ import pandas as pd
4
+ import sklearn
5
+ from sklearn .feature_extraction .text import TfidfVectorizer
6
+
7
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering.

    It works by building a reverse index that maps every word to the ids
    of the indexed questions containing it. To answer a user question we
    look up each known word, intersect the id lists, rank the surviving
    questions with TF-IDF, and return the answer stored alongside the
    best-ranked question.
    """

    def __init__(self, db_path="virtualassistant.sqlite3"):
        """
        Returns - None
        Input - str: `db_path`, the sqlite3 database file to open
                (defaults to the file name used historically)
        ----------
        - Open the sqlite3 database in autocommit mode
        - Create the tables and the reverse-index row if they are missing
        - Keep the connection on the instance as `self.conn`
        """
        self.conn = sqlite3.connect(db_path, autocommit=True)
        self._ensure_schema()

    def _ensure_schema(self):
        """
        Returns - None
        Input - None
        ----------
        Create whatever part of the schema is missing. Each piece is checked
        on its own so that a half-initialised database (for example one that
        has IdToQuesAns but no 'index' row) is repaired instead of failing
        later when the reverse index is read.
        """
        cur = self.conn.cursor()
        cur.execute(
            "CREATE TABLE IF NOT EXISTS "
            "IdToQuesAns(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)"
        )
        cur.execute("CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)")
        has_index = cur.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if has_index is None:
            # The whole reverse index lives in one row as a JSON object.
            cur.execute("INSERT INTO WordToId VALUES (?, ?)", ("index", "{}"))

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        Close the underlying database connection.
        """
        self.conn.close()

    @staticmethod
    def _tokenize(text):
        """
        Returns - list[str]: the lower-cased, whitespace-separated words
        Input - str: any text (a question or a user query)
        ----------
        Indexing and querying both go through this helper so that the words
        stored in the reverse index and the words looked up in it are
        normalised the same way. `split()` without an argument also
        discards the empty tokens that repeated spaces would produce.
        """
        # NOTE(review): index entries written before words were lower-cased
        # keep their original casing and will not match until re-indexed.
        return text.lower().split()

    def _load_reverse_index(self):
        """
        Returns - dict[str, list[int]]: word -> ids of questions with it
        Input - None
        ----------
        Read and decode the JSON reverse index stored in WordToId.
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def index_question_answer(self, question, answer):
        """
        Returns - str: the literal "index successful"
        Input - str: `question`, str: `answer`
        ----------
        Indexes the question and answer in two steps:
            - store the lower-cased question and answer in IdToQuesAns and
              retrieve the id of the inserted row
            - add that id to the reverse-index entry of every word of the
              question (creating the entry when the word is new)
        """
        # NOTE(review): the answer is stored lower-cased, which loses its
        # original casing — kept as-is; confirm this is intended.
        row_id = self._add_to_IdToQuesAns(question.lower(), answer.lower())

        reverse_idx = self._load_reverse_index()
        for word in self._tokenize(question):
            ids = reverse_idx.setdefault(word, [])
            if row_id not in ids:
                ids.append(row_id)

        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted row
        Input - str: `question`, str: `answer`
        ---------
        - insert the pair into IdToQuesAns using the instance connection
        - return the row id sqlite assigned to it
        """
        cur = self.conn.cursor()
        cur.execute(
            "INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)",
            (question, answer),
        )
        return cur.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list[tuple]: (id, question, answer) rows of the indexed
                  questions that contain every *known* word of the input;
                  an empty list when nothing matches
        Input - str: `user_input`, expected to be a question
        ---------
        - retrieve the reverse index
        - collect the id list of every input word present in the index
          (words that were never indexed are ignored)
        - intersect those id lists
        - fetch and return the rows for the surviving ids
        """
        reverse_idx = self._load_reverse_index()
        postings = [
            reverse_idx[term]
            for term in self._tokenize(user_input)
            if term in reverse_idx
        ]
        if not postings:  # none of the words has ever been indexed
            return []

        common_ids = set(postings[0]).intersection(*postings[1:])
        if not common_ids:  # no single question contains all known words
            return []

        return self._find_questions_with_idx(common_ids)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list[tuple]: (id, question, answer) rows, ordered by id
        Input - iterable of row ids
        ---------
        - build a parameterised `IN (...)` query with one placeholder per id
        - return the matching rows
        """
        idxs = list(idxs)
        if not idxs:
            return []
        placeholders = ",".join("?" * len(idxs))
        sql = (
            "SELECT id, question, answer FROM IdToQuesAns "
            "WHERE id in ({seq}) ORDER BY id"
        ).format(seq=placeholders)
        return self.conn.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, question] for the best-scoring question
        Input - str: `user_input`, list[str]: `corpus` of candidate questions
        Raises - ValueError when `corpus` is empty
        ---------
        - fit a TF-IDF model on the corpus
        - score each question by summing the TF-IDF weights of the user's
          words (a word repeated in the input counts each time)
        - return the entry with the highest score; ties go to the question
          that appears first in `corpus`
        """
        if not corpus:
            raise ValueError("corpus must contain at least one question")

        ranked = [[0, question] for question in corpus]

        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(corpus)
        except ValueError:
            # scikit-learn raises ValueError when no usable token is found
            # in the corpus (e.g. only one-letter words); nothing can be
            # scored, so fall back to the first candidate.
            return ranked[0]

        weights = tfidf_matrix.toarray()
        vocabulary = vectorizer.vocabulary_  # term -> column in `weights`

        # The vectorizer lower-cases its vocabulary by default, so the user
        # terms are lower-cased too before being looked up.
        for term in self._tokenize(user_input):
            column = vocabulary.get(term)
            if column is None:
                continue
            for row, entry in enumerate(ranked):
                entry[0] += weights[row, column]

        return max(ranked, key=lambda entry: entry[0])

    def provide_answer(self, user_input):
        """
        Returns - str: the answer to the user_input, or None when no indexed
                  question matches
        Input - str: user_input
        ---------
        - use the user_input to get the list of matching questions
        - build the corpus of matching questions and a map from each
          question to its answer
        - rank the corpus against the user_input
        - return the answer mapped to the best-ranked question
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            return None

        corpus = [question for _, question, _ in matching_questions]
        question_map = {
            question: answer for _, question, answer in matching_questions
        }
        _, best_question = self.find_most_matched_question(user_input, corpus)
        return question_map[best_question]
172
+
173
+
174
if __name__ == "__main__":
    # Demo run: index four sample question/answer pairs, then ask two
    # free-form questions and print whatever answer is selected.
    assistant = QuestionAnswerVirtualAssistant()

    assistant.index_question_answer(
        "What are the different types of competitions available on Kaggle",
        "Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
    )

    # Only this second indexing call has its status string printed.
    status = assistant.index_question_answer(
        "How to form, manage, and disband teams in a competition",
        "Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
    )
    print(status)

    assistant.index_question_answer(
        "What is Data Leakage",
        "Data Leakage is the presence of unexpected additional information in the training data",
    )
    assistant.index_question_answer(
        "How does Kaggle handle cheating",
        "Cheating is not taken lightly on Kaggle. We monitor our compliance account",
    )

    for query in (
        "state Kaggle cheating policy",
        "Tell me what is data leakage",
    ):
        print(assistant.provide_answer(query))
0 commit comments