-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathauxfunctions.py
102 lines (73 loc) · 2.86 KB
/
auxfunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pathlib
from typing import List, Union, AnyStr
import pandas as pd
from typing.io import IO
def parse_input(s) -> List[tuple]:
    """
    Parses the search input query and creates a data structure:
    ( column_to_search_in, string_to_search, is_quoted_query )

    The input is split on ' AND '; inside every part each ``column:term``
    operator (e.g. ``title:brexit``) produces one tuple with the quoted flag
    set to False. A term that is followed by another operator in the same
    part keeps its single trailing space, because the separator is part of
    the captured group.

    If no operator is found at all, the whole input is searched in the
    'content' column. In that case an input that starts and ends with a
    double quote is flagged as a quoted query and returned with its quotes
    removed.

    :param s: input string to parse
    :return: the list of the search terms to perform and where
    """
    import re

    # content operators (e.g. "title:"): group 1 is the column, group 2 the term
    operator_pattern = re.compile(r"([a-z]+):([a-zA-Z0-9 _]+( |$))", re.MULTILINE)

    queries_to_perform = []
    # implementation for 'AND'
    for sub_query in s.split(' AND '):
        queries_to_perform.extend(
            (match.group(1), match.group(2), False)
            for match in operator_pattern.finditer(sub_query)
        )
    if queries_to_perform:
        return queries_to_perform

    # assumption: quoted queries are not combined with search operators
    is_quoted = s.startswith('"') and s.endswith('"')
    if is_quoted:
        # str.replace returns a new string; the result has to be kept
        s = s.replace('"', '')
    return [('content', s, is_quoted)]
def ranker(df: pd.DataFrame, models: dict, search_query: List[tuple], nr_results=20) -> pd.DataFrame:
    """
    Ranks the documents of a dataframe against a parsed search query.

    For every (column, term, quoted) tuple the model stored under that column
    is asked for the similarities of the term; the results of all tuples are
    summed into a 'rank' column. Only rows with a rank above zero are kept.
    If the last tuple is flagged as quoted, the rows are additionally
    restricted to those whose 'content' contains the term (quotes removed)
    as a literal substring.

    Neither the dataframe nor the search_query list passed in is modified.

    :param df: the documents; must provide the columns 'id', 'author',
        'publication', 'title' and 'content'
    :param models: maps a column name to an object offering
        ``query_similarity(term)``; presumably it returns one similarity
        value per row of df - TODO confirm against the model class
    :param search_query: list of (column, term, quoted) tuples
    :param nr_results: maximum number of rows to return
    :return: the best ranked rows, highest rank first
    :raises KeyError: if a queried column has no entry in models
    """
    cols_to_return = ['rank', 'id', 'author', 'publication', 'title', 'content']

    sim_results = []
    for col, term, _ in search_query:
        print(f"Searching for '{term}' in '{col}'...")
        sim_results.append(models[col].query_similarity(term))

    # simplest combination of results is the sum of queries;
    # assign() works on a copy, so the caller's dataframe gets no 'rank' column
    ranked = df.assign(rank=sum(sim_results))
    # filter for results with similarities
    ranked = ranked[ranked['rank'] > 0]

    # assumption: quoted queries are not combined with search operators,
    # so only the last entry is inspected (read, not popped, to keep the
    # caller's list intact; an empty query simply yields no results)
    if search_query:
        _, term, quoted = search_query[-1]
        if quoted:
            # then we filter for contents that have exactly that string
            term = term.replace('"', '')
            mask = ranked['content'].str.contains(term, regex=False, na=False)
            ranked = ranked[mask]

    print(f'Found {len(ranked)} documents')  # as requested
    return ranked[cols_to_return].sort_values(by=['rank'], ascending=False).head(nr_results)
def all_news_filereader(filename: Union[str, pathlib.Path, IO[AnyStr]]) -> pd.DataFrame:
    """
    Loads a csv file and hands it back as a pandas dataframe whose index
    has been reset to the default one (the previous index is dropped).

    :param filename: path to the csv file, or an open file-like object
    :return: the file contents as a pandas dataframe
    """
    print(f'Reading file {filename}...')
    loaded = pd.read_csv(filename)
    return loaded.reset_index(drop=True)
def read_input() -> str:
    """
    Prompts on the CLI with 'search: ' and reads one line of user input.

    :return: the line the user typed, as a str
    """
    prompt = 'search: '
    # prompt is printed without a newline so the answer is typed on the same line
    print(prompt, end="")
    typed = input()
    return typed