forked from mderdun/indicator-leighhunt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVizprep.py
145 lines (133 loc) · 6.13 KB
/
Vizprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import xml.etree.ElementTree as tree
import json
from os import listdir # install the "listdir" package (pip install dirlist)
from os.path import isfile, join
files =[]
dir = "OutputNER"
for file in listdir(dir):
if isfile(join(dir, file)):
files.append(dir + "/" + file)
for f in files:
if ".xml" not in f:
files.remove(f)
files.sort()
#print(files)
result = {}
for file in files:
result[file] = {}
root = tree.parse(file)
string = ""
result[file]["title"] = []
result[file]["bibl"] = {}
result[file]["bibl"]["title"] = []
result[file]["persons"] = {}
result[file]["persons"]["real"] = {}
result[file]["persons"]["fictional"] = {}
result[file]["places"] = {}
result[file]["org"] = {}
result[file]["dates"] = {}
no = ""
bibtitle = ""
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}title"):
obj={}
altstring = ""
title_type = ""
key = el.get("key")
level= el.get("level")
expan = el.find("{http://www.tei-c.org/ns/1.0}choice/{http://www.tei-c.org/ns/1.0}expan")
if expan is not None:
altstring = expan.text
if key is not None:
string = key
elif key is None:
string = el.text
if level is not None:
title_type = level
if altstring is not None and altstring != "":
obj = {"title": altstring, "level": title_type}
if string is not None and string != "":
obj = {"title": string, "level": title_type}
if obj not in result[file]["title"]:
result[file]["title"].append(obj)
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}persName[@type = 'real']"):
key = el.get('key')
string = el.get('ref')
if key is not None and string is not None:
result[file]["persons"]["real"][key] = string
elif key is not None and string is None:
result[file]["persons"]["real"][key] = key
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}persName[@type = 'fictional']"):
key = el.get('key')
string = el.get('ref')
if key is not None and string is not None:
result[file]["persons"]["fictional"][key] = string
elif key is not None and string is None:
result[file]["persons"]["fictional"][key] = key
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}bibl"):
bibl = {}
altstring = ""
head = el.find("{http://www.tei-c.org/ns/1.0}title")
t_type = ""
if head is not None:
expan = head.find("{http://www.tei-c.org/ns/1.0}choice/{http://www.tei-c.org/ns/1.0}expan")
bibtitle = head.text
if expan is not None:
altstring = expan.text
t_type = head.get("level")
if bibtitle is not None and bibtitle != "" and string not in result[file]["bibl"]["title"]:
bibl["title"] = bibtitle
#result[file]["bibl"]["title"][bibtitle] = {}
if altstring is not None and altstring != "" and string not in result[file]["bibl"]["title"]:
bibtitle = altstring
bibl["title"] = bibtitle
#result[file]["bibl"]["title"][bibtitle] = {}
author = el.find("{http://www.tei-c.org/ns/1.0}author")
if author is not None:
string = author.text
if string is not None:
bibl["author"] = string
#result[file]["bibl"]["title"][bibtitle]["author"] = string
editor = el.find("{http://www.tei-c.org/ns/1.0}editor")
if editor is not None:
string = editor.text
if string is not None:
bibl["editor"] = string
#result[file]["bibl"]["title"][bibtitle]["editor"] = string
ref = el.find("{http://www.tei-c.org/ns/1.0}ref")
if ref is not None:
target = ref.get('target')
string = target
if string is not None:
if bibtitle != "":
bibl["ref"] = string
#result[file]["bibl"]["title"][bibtitle]["ref"] = string
if t_type is not None and t_type != "":
bibl["level"] = t_type
#result[file]["bibl"]["title"][bibtitle]["level"] = t_type
if bibl is not result[file]["bibl"]["title"] and len(bibl.keys()) > 1:
result[file]["bibl"]["title"].append(bibl)
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}placeName[@key]"):
string = el.get('key')
ref = el.get('ref')
type = el.get('type')
country = el.get('corresp')
result[file]["places"][string] = {}
result[file]["places"][string]["type"] = type
result[file]["places"][string]["country"] = country
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}orgName"):
key = el.get('key')
string = el.get('ref')
if key is not None and string is not None:
result[file]["org"][key] = string
else:
string = el.text
result[file]["org"][string] = string
for el in root.findall(".//{http://www.tei-c.org/ns/1.0}div2//{http://www.tei-c.org/ns/1.0}date"):
when = el.get('when')
string = el.text
if when is not None:
result[file]["dates"][when] = string
json_obj = json.dumps(result, indent=7, ensure_ascii = False)
with open("Hunt_dataNew.json", "w") as outfile:
outfile.write(json_obj)
print("Done!")