Skip to content

Commit 61e4a20

Browse files
Merge pull request #2381 from Xceptions/ques-ans-virtual-assistant
Ques ans virtual assistant
2 parents eed473b + 83b0a7f commit 61e4a20

File tree

3 files changed

+238
-0
lines changed

3 files changed

+238
-0
lines changed
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
import sqlite3
2+
import json
3+
import pandas as pd
4+
import sklearn
5+
from sklearn.feature_extraction.text import TfidfVectorizer
6+
7+
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering

    It works by building a reverse index store that maps every word
    to the ids of the indexed questions that contain it. To answer a
    user question we take the intersection of the ids of the words of
    the user question that are indexed, rank the candidate questions
    with their tfidf scores to pick the best fit, then select the
    answer that maps to that question.

    All text is lower-cased before it is stored, indexed or searched,
    so matching is case-insensitive.
    """

    def __init__(self, db_path="virtualassistant.sqlite3"):
        """
        Returns - None
        Input - str: optional path of the sqlite3 database file
                (":memory:" gives a throw-away database)
        ----------
        - Initialize database. we use sqlite3
        - Check if the tables exist, if not create them
        - make sure the single row holding the reverse index exists
        - maintain a class level access to the database
          connection object
        """
        # isolation_level=None keeps sqlite in autocommit mode (every
        # statement is committed immediately) and, unlike the
        # `autocommit=True` keyword, is accepted before Python 3.12
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS "
            "IdToQuesAns(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        index_row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        if index_row is None:
            # the whole reverse index lives in this one row as a JSON object
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        - close the class-level database connection
        """
        self.conn.close()

    def _load_reverse_index(self):
        """
        Returns - dict: maps a word to the list of question ids containing it
        Input - None
        ----------
        - read the JSON text stored in the 'index' row of WordToId
        - decode and return it
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def index_question_answer(self, question, answer):
        """
        Returns - str: the text "index successful"
        Input - str: a string of words called question
              - str: the answer that belongs to the question
        ----------
        Indexes the question and answer. It does this by performing two
        operations - add the question and answer to the IdToQuesAns, then
        adds the words in the question to WordToId
        - lower-cases the question and answer
        - passes the question and answer to a method to add them
          to IdToQuesAns
        - retrieves the id of the inserted ques-answer
        - adds the id to the reverse index entry of every word of the
          question, unless the id is already listed for that word
        """
        # The index keys must be built from the same lower-cased text that
        # is stored, otherwise lookups only succeed when the caller happens
        # to use the original capitalisation.
        # NOTE: index entries written before this normalisation kept the
        # original case of the question and are not rewritten here.
        question = question.lower()
        row_id = self._add_to_IdToQuesAns(question, answer.lower())

        reverse_idx = self._load_reverse_index()
        for word in question.split():
            ids_for_word = reverse_idx.setdefault(word, [])
            if row_id not in ids_for_word:  # a word may repeat in a question
                ids_for_word.append(row_id)

        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted question-answer row
        Input - str: the question
              - str: the answer
        ---------
        - use the class-level connection object to insert the question
          and answer into the db
        - retrieve and return the row id of the inserted row
        """
        cur = self.conn.cursor()
        res = cur.execute(
            "INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)",
            (question, answer),
        )
        return res.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list[tuple]: (id, question, answer) rows of the matching
                  questions, an empty list when nothing matches
        Input - str: a string of words called `user_input`, expected to be a question
        ---------
        - retrieve the reverse index
        - use the words contained in the user input to find all the idxs
          that contain the word; words that were never indexed are ignored
        - keep only the idxs shared by all of those words
        - use idxs to call the _find_questions_with_idx method
        - return the result of the called method
        """
        reverse_idx = self._load_reverse_index()
        # split() without an argument drops the empty tokens that repeated
        # or surrounding spaces would otherwise produce
        ids_per_term = [
            reverse_idx[term]
            for term in user_input.lower().split()
            if term in reverse_idx
        ]
        if not ids_per_term:  # none of the words has been indexed
            return []

        common_idx_of_docs = set(ids_per_term[0]).intersection(*ids_per_term[1:])
        if not common_idx_of_docs:  # no question contains all indexed words
            return []

        return self._find_questions_with_idx(common_idx_of_docs)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list[tuple]: the (id, question, answer) rows with the idxs
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the questions that
          have the idx in the input list of idxs.
        - retrieve and return these rows as a list
        """
        idxs = list(idxs)
        # only the "?" placeholders are formatted into the statement, the
        # ids themselves are passed as bound parameters
        placeholders = ",".join("?" * len(idxs))
        sql = "SELECT id, question, answer FROM IdToQuesAns WHERE id in ({seq})".format(
            seq=placeholders
        )
        return self.conn.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, most_matching_question]
        Input - user_input, and list of matching questions called corpus
        Raises - ValueError when corpus is empty
        ---------
        - use the tfidf score to rank the questions and pick the most matching
          question
        - the score of a question is the sum of the tfidf weights, within
          that question, of the words of the user input
        - on equal scores the question that comes first in corpus wins
        """
        if not corpus:
            raise ValueError("corpus must contain at least one question")

        vectorizer = TfidfVectorizer()
        try:
            tfidf_scores = vectorizer.fit_transform(corpus)
        except ValueError:
            # the vectorizer found no usable vocabulary in the corpus, so
            # the candidates cannot be ranked; fall back to the first one
            return [0, corpus[0]]

        tfidf_array = pd.DataFrame(
            tfidf_scores.toarray(), columns=vectorizer.get_feature_names_out()
        )
        # {word: {position of question in corpus: tfidf weight}}
        tfidf_dict = tfidf_array.to_dict()

        result = [[0, question] for question in corpus]
        # lower-case the terms: the vectorizer lower-cases its vocabulary
        # by default, so mixed-case terms would never be found
        for term in user_input.lower().split():
            weights = tfidf_dict.get(term)
            if weights is None:
                continue
            for position, scored in enumerate(result):
                scored[0] += weights[position]

        # pick the highest score instead of blindly taking the first entry
        return max(result, key=lambda scored: scored[0])

    def provide_answer(self, user_input):
        """
        Returns - str: the answer to the user_input, or None when no
                  indexed question matches
        Input - str: user_input
        ---------
        - use the user_input to get the list of matching questions
        - create a corpus which is a list of all matching questions
        - create a question_map that maps questions to their respective answers
        - use the user_input and corpus to find the most matching question
        - return the answer that matches that question from the question_map
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            # nothing to rank; ranking an empty corpus would raise
            return None

        corpus = [row[1] for row in matching_questions]
        question_map = {row[1]: row[2] for row in matching_questions}
        _score, most_matching_question = self.find_most_matched_question(
            user_input, corpus
        )
        return question_map[most_matching_question]
172+
173+
174+
if __name__ == "__main__":
    # Small demo: index four question/answer pairs, then ask two questions.
    assistant = QuestionAnswerVirtualAssistant()

    assistant.index_question_answer(
        question="What are the different types of competitions available on Kaggle",
        answer="Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
    )

    # only the status of this second indexing call is echoed
    team_status = assistant.index_question_answer(
        question="How to form, manage, and disband teams in a competition",
        answer="Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
    )
    print(team_status)

    assistant.index_question_answer(
        question="What is Data Leakage",
        answer="Data Leakage is the presence of unexpected additional information in the training data",
    )
    assistant.index_question_answer(
        question="How does Kaggle handle cheating",
        answer="Cheating is not taken lightly on Kaggle. We monitor our compliance account",
    )

    for query in (
        "state Kaggle cheating policy",
        "Tell me what is data leakage",
    ):
        print(assistant.provide_answer(query))
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from tkinter import *
2+
from tkinter import messagebox
3+
import backend
4+
5+
6+
def index_question_answer():
    """
    Returns - None
    Input - None (reads the text of the index_question_answer_entry widget)
    ----------
    - read the "question_answer" text typed by the user
    - split it on the first "_" into the question and the answer
    - show an error dialog and stop when the separator, the question or
      the answer is missing
    - index the pair with the backend and print the status it returns
    """
    # for this, we are separating question and answer by "_"
    question_answer = index_question_answer_entry.get()
    # partition splits on the first "_" only, so an answer may itself
    # contain underscores and a missing "_" no longer raises ValueError
    question, separator, answer = question_answer.partition("_")
    question = question.strip()
    answer = answer.strip()
    if not separator or not question or not answer:
        messagebox.showerror(
            "Invalid input",
            'Enter the question and the answer separated by "_"',
        )
        return

    print(question)
    print(answer)
    va = backend.QuestionAnswerVirtualAssistant()
    try:
        print(va.index_question_answer(question, answer))
    finally:
        # a new assistant (and database connection) is opened per click,
        # so release the connection once the click has been handled
        va.conn.close()
14+
15+
def provide_answer():
    """
    Returns - None
    Input - None (reads the text of the provide_answer_entry widget)
    ----------
    - read the question typed by the user
    - ask the backend for the answer
    - print the answer, or show an info dialog when the backend has no
      answer for the question
    """
    term = provide_answer_entry.get()
    va = backend.QuestionAnswerVirtualAssistant()
    try:
        answer = va.provide_answer(term)
    except ValueError:
        # presumably raised by the backend's tfidf ranking when no indexed
        # question matches and there is nothing to rank - treat as no answer
        answer = None
    finally:
        # a new assistant (and database connection) is opened per click,
        # so release the connection once the click has been handled
        va.conn.close()

    if answer is None:
        messagebox.showinfo(
            "No answer", "No indexed question matches the input"
        )
        return
    print(answer)
19+
20+
if __name__ == "__main__":
    # Main window of the knowledge base GUI.
    root = Tk()
    root.title("Knowledge base")
    root.geometry('300x300')

    # Widgets for adding a "question_answer" pair. The entry is read by
    # index_question_answer() through its module-level name.
    index_question_answer_label = Label(master=root, text="Add question:")
    index_question_answer_entry = Entry(master=root)
    index_question_answer_button = Button(
        master=root, text="add", command=index_question_answer
    )

    # Widgets for asking a question. The entry is read by provide_answer()
    # through its module-level name.
    provide_answer_label = Label(master=root, text="User Input:")
    provide_answer_entry = Entry(master=root)
    search_term_button = Button(master=root, text="ask", command=provide_answer)

    # Pack order decides the top-to-bottom layout of the window.
    index_question_answer_label.pack()
    index_question_answer_entry.pack()
    index_question_answer_button.pack()
    provide_answer_label.pack()
    provide_answer_entry.pack()
    search_term_button.pack()

    root.mainloop()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pandas
2+
scikit-learn

0 commit comments

Comments
 (0)