-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparser.py
72 lines (62 loc) · 2.9 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import ijson
import time
import codecs
from datetime import datetime
from time import strftime, localtime
# Stream the JSON array stored at INPUT_PATH and write two pipe-delimited
# reports about which sourceResource fields each record carries:
#   WIDE_PATH - one row per record, one count column per field
#   MELT_PATH - one row per (record, field) pair with a presence flag
#               and a value count
INPUT_PATH = "/media/Windows7_OS/dpla/new/dpla"
WIDE_PATH = '/media/Windows7_OS/dpla/new/dpla.csv'
MELT_PATH = '/media/Windows7_OS/dpla/new/dpla.melt.csv'

# The format hard-codes a "+0000" offset, so timestamps must be taken in UTC
# (see _timestamp) for the printed offset to be truthful.
TIME_FORMAT = "%a, %d %b %Y %H:%M:%S +0000"

# Number of records between two progress lines on stdout.
PROGRESS_EVERY = 10000

# sourceResource fields that are counted, in output column order.
SOURCE_FIELDS = (
    "collection", "contributor", "creator", "date", "description",
    "extent", "format", "identifier", "isPartOf", "language", "publisher",
    "relation", "rights", "spatial", "specType", "stateLocatedIn",
    "subject", "temporal", "title", "type",
)

try:
    _STRING_TYPES = basestring  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = str  # interpreters where basestring is not defined


def _timestamp():
    """Return the current UTC time rendered with TIME_FORMAT."""
    return time.strftime(TIME_FORMAT, time.gmtime())


start = _timestamp()
print("Start: " + start)

# The with-statement closes (and therefore flushes) all three files even if
# a malformed record raises part-way through the run.
with open(INPUT_PATH) as f, \
        codecs.open(WIDE_PATH, 'w', encoding='utf-8') as out, \
        codecs.open(MELT_PATH, 'w', encoding='utf-8') as melt:
    melt.write("identifier|provider|subprov|field|binary|count\n")
    # The header is derived from SOURCE_FIELDS so it cannot drift away from
    # the columns that the loop below actually writes.
    header = "|".join(
        ["identifier"] + list(SOURCE_FIELDS)
        + ["provider", "subprov", "thumbnail"]
    )
    out.write(header + "\n")

    counter = 0
    for item in ijson.items(f, "item"):
        counter += 1
        if counter % PROGRESS_EVERY == 0:
            print(str(counter) + ": " + _timestamp())

        ident = item['id']
        prov = item['provider']['name'] if "provider" in item else "Null"

        if "dataProvider" not in item:
            subprov = "Null"
        elif isinstance(item['dataProvider'], _STRING_TYPES):
            subprov = item['dataProvider']
        else:
            # Non-string dataProvider values are joined into a single cell.
            subprov = ", ".join(item['dataProvider'])

        # Every melt row of this record starts with the same three cells.
        melt_prefix = ident + "|" + prov + "|" + subprov + "|"

        # A record without a sourceResource object is reported as having
        # none of the fields instead of aborting the run with KeyError.
        resource = item.get('sourceResource') or {}

        row = [ident]
        for field in SOURCE_FIELDS:
            value = resource.get(field)
            if value is None:
                # Missing keys and explicit nulls both count as absent.
                binary, count = "0", "0"
            elif isinstance(value, list):
                binary, count = "1", str(len(value))
            else:
                binary, count = "1", "1"
            row.append(count)
            melt.write(melt_prefix + field + "|" + binary + "|" + count + "\n")

        # The thumbnail column records only whether the 'object' key exists.
        thumb = "1" if 'object' in item else "0"
        melt.write(melt_prefix + "thumb|" + thumb + "|" + thumb + "\n")

        row.extend([prov, subprov, thumb])
        out.write("|".join(row) + "\n")

end = _timestamp()
print("Start: " + start)
print("End: " + end)