-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
92 lines (78 loc) · 3.18 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from nltk.stem.snowball import EnglishStemmer # Assuming we're working with English
import regex as re
def read_stop_words():
    """Load the stop-word table from ``EnglishStopWords.txt``.

    Every line of the file is stripped of surrounding whitespace and used
    as a key. All values are ``None``; the mapping is only meant for fast
    ``word in table`` membership tests.

    Returns:
        dict: stripped line -> ``None`` for each line in the file.
    """
    # The path is relative, so it is resolved against the current working
    # directory. Undecodable bytes are silently dropped.
    with open("EnglishStopWords.txt", 'r', encoding='utf-8', errors='ignore') as handle:
        entries = [line.strip() for line in handle]
    return dict.fromkeys(entries)
# Module-level state, initialised once when this module is imported.
# Stop-word table built by read_stop_words(); because that function opens
# "EnglishStopWords.txt" by relative path, the file must be readable from the
# current working directory at import time.
stop_words = read_stop_words()
# Single English Snowball stemmer instance, reused for every call.
stemmer = EnglishStemmer()
# Patterns are compiled once at import time rather than on every call.
_HTML_TAG_RE = re.compile(r"<.*?>")
# Anything that is neither a word character nor an apostrophe.
_NON_WORD_RE = re.compile(r"[^\w']")
# Anything outside ASCII letters, digits, hyphen, apostrophe and space.
_NON_ALNUM_RE = re.compile("[^a-z-A-Z-0-9' ]")


def preprocess_string(text, stopping=True, stemming=True, lowercasing=True):
    """Normalise raw text into a cleaned, space-separated token string.

    Steps, in order:
      1. Drop characters that cannot be encoded as UTF-8.
      2. Replace HTML-like tags (``<...>``) with a space.
      3. Replace every character that is not a word character or an
         apostrophe with a space.
      4. Delete any remaining character outside ASCII letters, digits,
         hyphen, apostrophe and space.
      5. Optionally remove stop words, lowercase, and stem.

    Args:
        text: Input string; any falsy value (``None``, ``""``) is treated
            as the empty string.
        stopping: When true, drop tokens found in the module-level
            ``stop_words`` table.
        stemming: When true, stem every token with the module-level
            ``stemmer``.
        lowercasing: When true, lowercase the result.

    Returns:
        str: The processed text. Whitespace is collapsed to single spaces
        only when ``stopping`` or ``stemming`` is enabled (both re-join the
        tokens); otherwise runs of spaces left by the substitutions remain.
    """
    if not text:
        text = ""
    # errors="ignore" drops characters UTF-8 cannot represent (e.g. lone
    # surrogates) instead of raising. A misspelled handler name is only
    # looked up when such a character is met, and then raises LookupError.
    text = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")

    # Strip HTML tags, then turn punctuation/newlines into spaces.
    cleaned = _HTML_TAG_RE.sub(" ", text)
    cleaned = _NON_WORD_RE.sub(" ", cleaned)
    # Remove leftover characters that \w accepted but are not ASCII
    # alphanumerics (accented letters, underscores, ...).
    cleaned = _NON_ALNUM_RE.sub("", cleaned)

    if stopping:
        # NOTE(review): tokens are compared as-is, before lowercasing, so a
        # capitalised token only matches a stop word stored with the same
        # capitalisation. Confirm this is intended.
        cleaned = " ".join(
            word for word in cleaned.split() if word not in stop_words
        )
    if lowercasing:
        cleaned = cleaned.lower()
    if stemming:
        cleaned = " ".join(stemmer.stem(word) for word in cleaned.split())
    return cleaned
def preprocess_QA_text(text):
    """Clean text while keeping sentence punctuation.

    HTML-like tags (``<...>``) are replaced with a space, then every
    character other than ASCII letters, digits, space and the punctuation
    ``, ' . ? ! : - ( ) [ ]`` is deleted. Case and whitespace are left
    untouched.

    Args:
        text: Input string; any falsy value (``None``, ``""``) is treated
            as the empty string.

    Returns:
        str: The cleaned text.
    """
    if not text:
        text = ""
    # errors="ignore" drops characters UTF-8 cannot represent (e.g. lone
    # surrogates) instead of raising. A misspelled handler name is only
    # looked up when such a character is met, and then raises LookupError.
    text = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
    # HTML removal.
    clean_text = re.sub(r"<.*?>", " ", text)
    # Remove other special characters except those that help convey the
    # meaning of a sentence.
    clean_text = re.sub(r"[^a-zA-Z0-9,'.?!:\-()\[\] ]", "", clean_text)
    return clean_text
def ensure_good_string(doc, string):
    """Return ``doc[string]`` when the field exists and is truthy, else ``""``.

    Args:
        doc: Container supporting ``in`` and item lookup.
        string: Name of the field to read.
    """
    if string not in doc:
        return ""
    # Falsy values (None, "", 0, empty containers) are replaced by "".
    return doc[string] or ""
def ensure_good_str_list(doc, string):
    """Return the list stored under ``doc[string]`` with ``None`` items replaced by ``""``.

    Args:
        doc: Container supporting ``in`` and item lookup.
        string: Name of the field to read.

    Returns:
        list: A new list with every ``None`` element replaced by ``""``;
        an empty list when the field is missing or falsy.
    """
    if string in doc and doc[string]:
        return [item if item is not None else "" for item in doc[string]]
    return []
def ensure_good_content(content_list):
    """Merge a list of content fragments into one cleaned piece of text.

    ``None`` entries are skipped, the remaining items are converted with
    ``str`` and joined with single spaces, and the result is passed through
    ``preprocess_string`` with stop-word removal, stemming and lowercasing
    all disabled.

    Args:
        content_list: Iterable of fragments (may contain ``None``), or
            ``None`` for no content.

    Returns:
        list: A single-element list holding the cleaned text.
    """
    if content_list is None:
        content_list = []
    # Skip None entries; str(None) would inject the literal word "None"
    # into the text.
    fragments = [str(item) for item in content_list if item is not None]
    # Preprocess and join together the content list.
    cleaned = preprocess_string(
        " ".join(fragments), stopping=False, stemming=False, lowercasing=False
    )
    return [cleaned]
def ensure_good_list(doc, string):
    """Return ``doc[string]`` unchanged when the field exists, else an empty list.

    Unlike the other ``ensure_good_*`` helpers, the stored value is not
    checked: a present but falsy value (e.g. ``None``) is returned as-is.
    """
    return doc[string] if string in doc else []