forked from hasadna/laws-fixes-extractor-and-exporter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaner.py
49 lines (40 loc) · 1.99 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def clean(data):
    """Return *data* with each ``<br/>`` tag turned into a space and the ends trimmed.

    The cleaned string is also emitted at DEBUG level on the module logger.
    """
    # Splitting on the tag and re-joining with a space swaps every occurrence.
    pieces = data.split('<br/>')
    cleaned = ' '.join(pieces).strip()
    logger.debug(cleaned)
    return cleaned
def _trim_br_tags(text):
    """Remove whole ``<br/>`` tags from the start and end of *text*."""
    tag = '<br/>'
    # str.strip('<br/>') would treat the argument as a set of characters and
    # eat real letters such as a leading 'b' or trailing 'r'; peel off whole
    # tags instead.
    while text.startswith(tag):
        text = text[len(tag):]
    while text.endswith(tag):
        text = text[:-len(tag)]
    return text


def clean_data(results, booklet_type):
    """Yield one flat record per summary line of every booklet in *results*.

    Entries of ``results['Results']`` are processed in reverse order. For each
    entry the ``Data`` mapping is read; its ``DocSummary.DescriptionHtmlString``
    is either split on an "indirect amendment(s)" marker (text before the
    marker becomes the single summary, text after it is appended to the
    description) or, when no marker is present, split on ``<br/>`` into
    several summaries. Blank summaries are dropped.

    Args:
        results: Mapping with a ``'Results'`` sequence; each item carries a
            ``'Data'`` mapping with the keys read below.
        booklet_type: Value copied verbatim into every yielded record.

    Yields:
        dict: Record with the keys ``creation_date``, ``modify_date``,
        ``number_of_pages``, ``published_date``, ``file_name``,
        ``display_name``, ``extension``, ``description``, ``booklet_number``,
        ``foreign_year`` and ``booklet_type``.

    Raises:
        SystemExit: With status 1 if an entry lists more than one document.
    """
    # Markers ("indirect amendments:" / "indirect amendment:") that separate
    # the summary from the amendment list inside the description HTML.
    split_texts = ("תיקונים עקיפים:", "תיקון עקיף:")
    for result_dict in reversed(results['Results']):
        data = result_dict['Data']
        documents = data['Document']
        if len(documents) > 1:
            logger.error('data has more than 1 document %s',
                         documents[0]["DisplayName"])
            # Abort with a failing status; the interactive exit() helper is
            # not guaranteed to exist and would report success.
            raise SystemExit(1)
        document = documents[0]
        display_name = document['DisplayName'].replace('<br/>', '\n')
        description_html = data['DocSummary']['DescriptionHtmlString']
        split_text = next(
            (text for text in split_texts if text in description_html), None)
        if split_text:
            # partition() splits on the first marker only, so a repeated
            # marker no longer breaks the two-way unpacking.
            summary, _, amendments = description_html.partition(split_text)
            doc_summary = [summary.replace('<br/>', '')]
            amendments = _trim_br_tags(amendments).replace('<br/>', '\n')
            description = f"{display_name}\n\n{amendments}"
        else:
            description = display_name
            doc_summary = description_html.split('<br/>')
        # Normalise whitespace and discard empty fragments.
        doc_summary = [part.strip() for part in doc_summary if part.strip()]
        for summary in doc_summary:
            yield {
                'creation_date': data['CreationDate'],
                'modify_date': data['ModifyDate'],
                'number_of_pages': data['Pages'],
                'published_date': data['PublishDate'],
                'file_name': document['FileName'],
                'display_name': summary,
                'extension': document['Extension'],
                'description': description,
                'booklet_number': data['BookletNum'],
                'foreign_year': data['ForeignYear'],
                'booklet_type': booklet_type,
            }