-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtextTools.py
38 lines (28 loc) · 999 Bytes
/
textTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# --- Character sets -------------------------------------------------------
# Polish diacritics: nine lowercase forms followed by their uppercase forms.
polish_letters = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
# ASCII stand-ins, positionally aligned with one case-run of polish_letters.
ascii_letters = 'acelnoszz'
# Characters that get deleted outright.
punctuation = ",.-?!'"
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
lowercase = 'abcdefghijklmnopqrstuvwxyz'

# --- Translation tables (code point -> code point, or None to delete) -----
# ascii_letters is doubled so that both the lowercase and the uppercase
# Polish letters land on the same lowercase ASCII letter.
polish_map = dict(zip(map(ord, polish_letters), map(ord, ascii_letters * 2)))
# A value of None tells str.translate to drop the character.
punc_map = dict.fromkeys(map(ord, punctuation))
case_map = dict(zip(map(ord, uppercase), map(ord, lowercase)))
# The three tables have disjoint keys, so merge order does not matter.
all_map = {**polish_map, **punc_map, **case_map}
def clean(text):
    """Return a normalized copy of *text*.

    In one pass over the string, ``all_map`` deletes the characters listed
    in ``punctuation``, lower-cases ASCII letters and replaces Polish
    diacritic letters (either case) with lowercase ASCII letters.  Leading
    and trailing whitespace is then stripped.
    """
    # Translate first, strip second: deleting punctuation can expose
    # whitespace at either end (e.g. "hello !" -> "hello "), which a strip
    # performed beforehand would miss.
    return text.translate(all_map).strip()
# LEGACY
def polish(text):
    """Replace lowercase Polish diacritic letters in *text* with ASCII.

    Only the nine lowercase letters listed below are handled; uppercase
    Polish letters and all other characters pass through unchanged.
    """
    diacritics = 'ąćęłńóśźż'
    plain = 'acelnoszz'
    # Positional pairing: diacritics[i] is replaced by plain[i].
    return text.translate(str.maketrans(diacritics, plain))
def clean_legacy(text):
    """Legacy normalizer for *text*.

    Removes every character found in ``punctuation``, strips leading and
    trailing whitespace, lower-cases the result and finally replaces Polish
    letters with ASCII via ``polish``.
    """
    without_punct = ''.join(ch for ch in text if ch not in punctuation)
    # Lower-casing happens before polish(), which only knows lowercase letters.
    return polish(without_punct.strip().lower())