forked from AbeHandler/DocumentParserator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
98 lines (85 loc) · 3 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
A few utility functions to support the web app
"""
import json
import logging
import importlib
from documentcloud import DocumentCloud
# Load the contract parser by its dotted path at import time; spanify() in
# this file calls its tokenize() function.
MODULE = importlib.import_module('documentparserator.parserator.contract_parser')
# Route log records of level DEBUG and above to the file "parserator.log"
# (a relative path, so it is resolved against the process's working directory).
logging.basicConfig(level=logging.DEBUG, filename="parserator.log")
# Single module-wide DocumentCloud client, constructed with no arguments;
# get_document_page() in this file fetches documents through it.
# NOTE(review): presumably an unauthenticated client -- confirm against the
# documentcloud package.
CLIENT = DocumentCloud()
def get_colors(
        tag,
        tags_path='/home/abe/research/documentparserator/webapp/static/json/tags.json'):
    """
    Get the colors for a certain tag

    Reads the JSON file at ``tags_path`` (a list of tag entries) and returns
    the entry whose ``'name'`` equals ``tag``. span_wrap() in this file reads
    the ``'red'``, ``'green'`` and ``'blue'`` keys of the returned entry.

    :param tag: name of the tag to look up.
    :param tags_path: location of the tags JSON file. Defaults to the path
        this function has always read, so existing callers are unaffected;
        pass another path to run the app (or tests) from a different checkout.
    :returns: the matching entry. If several entries share the name, the
        last one in the file is returned.
    :raises IndexError: if no entry is named ``tag``.
    """
    with open(tags_path) as data_file:
        data = json.load(data_file)
    matches = [entry for entry in data if entry['name'] == tag]
    if not matches:
        # Same exception type as before, but with a message that says
        # what was looked up and where.
        raise IndexError("no tag named %r in %s" % (tag, tags_path))
    return matches[-1]
def sort_keys(keys):
    """
    Order keys of the form "<page>-<token number>" by page, then by token.

    Both parts are compared as integers. The list is sorted in place and the
    same list object is returned.
    """
    def _numeric_field(position):
        # Key function: the integer value of the dash-separated field.
        return lambda key: int(key.split("-")[position])

    # Sort on the least significant field first; list.sort() is stable, so
    # the following sort on the page number keeps token order within a page.
    for position in (1, 0):
        keys.sort(key=_numeric_field(position))
    return keys
def get_document_page(doc_cloud_id, page):
    """
    Return the text of one page of a DocumentCloud document.

    The document is looked up through the module-level CLIENT by
    ``doc_cloud_id`` and the text of page ``page`` is requested from it.
    The text is then passed through an ASCII decode/encode round trip with
    errors ignored, which drops every character that is not plain ASCII.
    """
    document = CLIENT.documents.get(doc_cloud_id)
    raw_text = document.get_page_text(page)
    # "ignore" silently discards anything outside ASCII in both directions.
    decoded = raw_text.decode("ascii", "ignore")
    return decoded.encode("ascii", "ignore")
# TODO: build this markup with a Jinja template instead of string concatenation
def span_wrap(text, span_id, tag):
    """
    Wrap a token in a span tag

    Produces ``<span id=... class="token" data-tag=...>text</span>``. For any
    tag other than ``"skip"`` the span also gets an inline ``style`` drawing
    a 2px border in the tag's colour, as returned by get_colors().

    :param text: the token text; HTML-escaped before being placed in the span
        so characters such as ``<`` or ``&`` in a document cannot break (or
        inject) markup.
    :param span_id: value for the ``id`` attribute.
    :param tag: value for the ``data-tag`` attribute; ``"skip"`` means the
        token is unlabelled and gets no border.
    :returns: the span element as a string.
    :raises IndexError: propagated from get_colors() when ``tag`` is unknown.
    """
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2 fallback

    attributes = [
        'id="' + escape(span_id, True) + '"',
        'class="token"',
        'data-tag="' + escape(tag, True) + '"',
    ]
    if tag != "skip":
        colors = get_colors(tag)
        attributes.append(
            'style="border: 2px solid rgb({0}, {1}, {2});"'.format(
                colors['red'], colors['green'], colors['blue']))
    # Joining with a space guarantees whitespace between every attribute;
    # the previous concatenation emitted data-tag="..."style="..." with no
    # separator, which is not valid HTML.
    return "<span " + " ".join(attributes) + ">" + escape(text, False) + "</span>"
def spanify(page_text, page_no, labels=None):
    """
    Take a page of text and wrap each of its tokens in a span tag.

    The page is split with ``MODULE.tokenize(page_text, True)``; each token
    it yields is indexed as ``(start, end)`` offsets into ``page_text``.
    Every token becomes a span (see span_wrap()) whose id is
    ``"<page_no>-<token number>"``. The text lying between two consecutive
    tokens is copied through unchanged, between their spans. Text before the
    first token and after the last token is not included in the output.

    :param page_text: the text of the page.
    :param page_no: page number used as the first half of each span id.
    :param labels: optional list of dicts with ``'id'`` and ``'label'`` keys
        (the labels from parserator). A token whose id has no entry, or any
        token when ``labels`` is empty/None, is tagged ``"skip"``.
    :returns: the HTML for the page, with newlines turned into ``<br />``.
    """
    # Index the labels by id once instead of rescanning the whole list for
    # every token. Later entries overwrite earlier ones, matching the old
    # behaviour of taking the last label with a given id.
    label_by_id = {}
    if labels:
        for entry in labels:
            label_by_id[entry['id']] = entry

    pieces = []
    previous_end = None
    # The counter starts at 1 and is bumped before use, so the first token
    # is numbered 2. Kept as-is: the ids are compared against the ids in
    # `labels`, which are produced elsewhere.
    token_no = 1
    for token in MODULE.tokenize(page_text, True):
        start, end = token[0], token[1]
        token_no += 1
        if previous_end is not None:
            # Emit the gap *before* the token it precedes. Previously the
            # gap was appended after the token, which glued the first two
            # tokens together and shifted every gap by one token.
            pieces.append(page_text[previous_end:start])
        previous_end = end

        spanid = str(page_no) + "-" + str(token_no)
        tag = "skip"
        if labels:
            entry = label_by_id.get(spanid)
            if entry is None:
                logging.debug("Skipping. Could not find label for %s", spanid)
            else:
                tag = entry['label']
        pieces.append(span_wrap(str(page_text[start:end]), spanid, tag))

    return "".join(pieces).replace("\n", "<br />")