-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_raw_data.py
63 lines (47 loc) · 2.22 KB
/
process_raw_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
import re
from cleantext import clean
def preprocess_text(text, **kwargs):
    """Clean ``text`` with ``cleantext.clean`` using this script's fixed options.

    ``no_emoji=True``, ``no_urls=True`` and ``to_ascii=False`` are always
    passed; every extra keyword argument (callers in this file supply e.g.
    ``lower``, ``no_punct``, ``replace_with_url``) is forwarded to ``clean``
    unchanged. Supplying one of the three fixed options again through
    ``kwargs`` raises ``TypeError`` (duplicate keyword argument).

    Returns the cleaned string.
    """
    # Original author's notes: no_emoji removes emojis, no_urls removes URLs.
    # NOTE(review): what a URL is replaced with is controlled by
    # replace_with_url, which this function does not set itself - confirm
    # against the clean-text documentation.
    cleaned = clean(text, no_emoji=True, no_urls=True, to_ascii=False, **kwargs)
    # NOTE(review): as written this swaps the empty string for the empty
    # string, which leaves the text unchanged. The first argument may once
    # have been an invisible character that got lost - confirm with the author.
    return cleaned.replace('', '')
def remove_lines(text, lines_triggers):
    """Delete every match of each regex in ``lines_triggers`` from ``text``.

    The patterns are applied one after another, in the given order, with
    ``re.MULTILINE`` and ``re.UNICODE`` set, so ``^`` and ``$`` anchor at
    line boundaries. Only the matched characters are removed: a pattern
    such as ``^foo.*$`` empties the line but leaves its newline in place.

    Returns the resulting string.
    """
    flags = re.MULTILINE | re.UNICODE
    result = text
    for pattern in lines_triggers:
        result = re.compile(pattern, flags).sub('', result)
    return result
# Read the whole raw dump, strip every 'ru:' marker from it, and cut the
# result into records on the '>>>>>>>' separator. `b` holds that list of
# record strings.
with open('data/parsed.txt', encoding='utf-8') as f:
    raw_dump = f.read()
a = raw_dump.replace('ru:', '')
b = a.split('>>>>>>>')
def remove_non_alphabetic_lines(text):
    """Drop every line of ``text`` that contains no Latin or Russian letter.

    The text is split with ``str.splitlines``; a line is kept when it holds
    at least one character from ``a-z``, ``A-Z``, ``а-я``, ``А-Я``, ``ё`` or
    ``Ё``. Kept lines are re-joined with ``"\\n"``, so the original line
    terminators are normalised and a trailing newline is not preserved.

    Returns the filtered text (an empty string when no line qualifies).
    """
    # 'ё' and 'Ё' lie outside the contiguous а-я / А-Я code-point ranges, so
    # they must be listed explicitly or lines whose only letters are ё/Ё
    # would be discarded.
    has_letter = re.compile(r'[a-zA-Zа-яА-ЯёЁ]').search
    return "\n".join(line for line in text.splitlines() if has_letter(line))
# Build the dataset: map each cleaned input text to its cleaned output text.
# Keys are unique, so a later record whose cleaned input equals an earlier
# one overwrites it.
io = dict()

# Whole-line patterns stripped from both the input and the output part.
line_patterns = [r'^#event.*$', r'^choyxona.*$', r'^Посмотреть ещё.*$']
# Fragments removed from the output part wherever they occur.
output_boilerplate = [r'Посмотреть ещё мероприятия', 'Мероприятия для стартапов и инвесторов в Центральной Азии',
                      'startupchoyxona', 'К мероприятию', 'Построить маршрут', 'Перейти к регистрации']

for record in b:
    # Skip fragments that lack either marker. A record with 'input:' but
    # no 'output' used to raise IndexError on the split below.
    if 'input:' not in record or 'output' not in record:
        continue

    # Input: text after the first 'input:' up to the next 'output'.
    # Output: text between the first and second occurrence of 'output'
    # (or to the end of the record when 'output' occurs only once).
    raw_inp = record.split('input:')[1].split('output')[0]
    raw_out = record.split('output')[1]

    inp = remove_lines(raw_inp, line_patterns)
    out = remove_lines(raw_out, line_patterns)
    out = remove_lines(out, output_boilerplate)

    # URLs in the output are replaced with the empty string; lines left
    # without any letter are then dropped.
    out = remove_non_alphabetic_lines(preprocess_text(out, no_punct=False,
                                                      lower=False,
                                                      replace_with_url=''))
    io[preprocess_text(inp, no_punct=False, replace_with_punct='', lower=False)] = out

# Write the pairs as a JSON list of {'input': ..., 'output': ...} objects.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open('data/dataset.json', 'w+', encoding='utf-8') as f:
    json.dump([{'input': k, 'output': v} for k, v in io.items()], f, ensure_ascii=False)