csv2omeka.py
"""Uploads a CSV file to an Omeka server"""
from omekaclient import OmekaClient
from omekautils import get_omeka_config
from omekautils import create_stream_logger
from sys import stdout, stdin
import argparse
from csv2repo import CSVData
import json
import httplib2
import urlparse
import os
def download_and_upload_files(item):
    """Handle any downloads, cache as files locally, then upload all files"""
    http = httplib2.Http()
    files = []
    for url_field in item.URLs:
        download_this = True  # reset per URL so one cached file doesn't skip the rest
        url = url_field.value
        filename = urlparse.urlsplit(url).path.split("/")[-1]
        new_path = os.path.join(data_dir, str(item.id))
        if not os.path.exists(new_path):
            os.makedirs(new_path)
        file_path = os.path.join(new_path, filename)
        logger.info("Local filename: %s", file_path)
        # Check whether we already have a cached copy of the same size
        if os.path.exists(file_path):
            response, content = http.request(url, "HEAD")
            download_size = int(response['content-length']) if 'content-length' in response else -1
            file_size = os.path.getsize(file_path)
            if download_size == file_size:
                logger.info("Already have a download of the same size: %d", file_size)
                download_this = False
        if download_this:
            try:
                response, content = http.request(url, "GET")
                with open(file_path, 'wb') as fh:
                    fh.write(content)
                logger.info(response)
            except Exception:
                logger.warning("Some kind of download error happened fetching %s - pressing on" % url)
        files.append(file_path)
    for f in item.files:
        files.append(os.path.join(data_dir, f.value))

    for fyle in files:
        logger.info("Uploading %s", fyle)
        try:
            omeka_client.post_file_from_filename(fyle, item.omeka_id)
            logger.info("Uploaded %s", fyle)
        except Exception:
            logger.warning("Some kind of error happened uploading %s - pressing on" % fyle)
logger = create_stream_logger('csv2omeka', stdout)
def omekaize(item):  # TODO make this a kind of repository item
    dc_set_id = omeka_client.getSetId("Dublin Core",
                                      create=args['create_item_types'])
    item_type_id = omeka_client.getItemTypeId(item.type, create=args['create_item_types'])
    item.omeka_data = {"public": args["public"],
                       "item_type": {"id": item_type_id}}
    if item.in_collection != None:
        # NOTE: relies on the global `title` set in the main loop below
        collection_id = omeka_client.get_collection_id_by_dc_identifier(item.in_collection,
                                                                        name=title,
                                                                        create=args['create_collections'],
                                                                        public=args["public"])
        if collection_id != None:
            item.omeka_data["collection"] = {"id": collection_id}
    # Deal with DC fields first; worry about other namespaces later...
    element_texts = []
    for f in item.text_fields:
        if f.namespace.prefix == "dcterms":
            element_id = omeka_client.getElementId(dc_set_id, f.field_name, create=args['create_item_types'])
            element_text = {"html": False, "text": unicode(f.value)}
            element_text["element"] = {"id": element_id}
            element_texts.append(element_text)
    item.omeka_data["element_texts"] = element_texts
# Define and parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('inputfile', type=argparse.FileType('rb'), default=stdin, help='Name of input CSV file')
parser.add_argument('-k', '--key', default=None, help='Omeka API Key')
parser.add_argument('-u', '--api_url', default=None, help='Omeka API Endpoint URL (hint, ends in /api)')
parser.add_argument('-i', '--identifier', default="Identifier", help='Name of an Identifier column in the input spreadsheet')
parser.add_argument('-d', '--download_cache', default="./data", help='Path to a directory in which to cache downloads (defaults to ./data)')
parser.add_argument('-p', '--public', action='store_true', help='Make items public')
parser.add_argument('-c', '--create_collections', action='store_true', help='Auto-create missing collections')
parser.add_argument('-e', '--create_elements', action='store_true', help='Auto-create missing element types')
parser.add_argument('-y', '--create_item_types', action='store_true', help='Auto-create missing Item Types')
parser.add_argument('-q', '--quietly', action='store_true', help='Only log errors and warnings, not the constant stream of info')
args = vars(parser.parse_args())
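# Example invocation (illustrative only; the endpoint URL and key below are placeholders):
#   python csv2omeka.py items.csv -u http://omeka.example.org/api -k YOUR_API_KEY -p -c -y
# When -u/-k are omitted, the endpoint and key are taken from get_omeka_config() below.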
config = get_omeka_config()
# Command-line values override the config file
endpoint = args['api_url'] if args['api_url'] != None else config['api_url']
apikey = args['key'] if args['key'] != None else config['key']
omeka_client = OmekaClient(endpoint.encode("utf-8"), logger, apikey)
inputfile = args['inputfile']
identifier_column = args['identifier']
data_dir = args['download_cache']
if args["quietly"]:
    logger.setLevel(30)  # logging.WARNING
csv_data = CSVData(inputfile)
csv_data.get_items()

for collection in csv_data.collections:
    id = collection.id
    title = collection.title
    print id
    if id != None:
        collection_id = omeka_client.get_collection_id_by_dc_identifier(id, name=title, create=args['create_collections'], public=args["public"])
        print "Collection ID", collection_id
uploaded_item_ids = []
for item in csv_data.items:
    print item.type
    id = item.id
    if id != None:
        title = item.title
        type = item.type
        previous_id = omeka_client.get_item_id_by_dc_identifier(id)
        omekaize(item)
        jsonstr = json.dumps(item.omeka_data)
        # Upload it
        if previous_id != None:
            logger.info("Re-uploading %s", previous_id)
            response, content = omeka_client.put("items", previous_id, jsonstr)
            new_item = json.loads(content)
            if 'id' not in new_item:
                print new_item
        else:
            logger.info("Uploading new item")
            response, content = omeka_client.post("items", jsonstr)
        # Looks like the ID wasn't actually there, so get it to mint a new one
        if response['status'] == '404':
            logger.info("retrying")
            response, content = omeka_client.post("items", jsonstr)
        # Have new (or old) item now
        new_item = json.loads(content)
        item.omeka_id = new_item['id']
        uploaded_item_ids.append(item.omeka_id)
        # Relate to other items
        for r in item.relations:
            property_id = omeka_client.getRelationPropertyIdByLocalPart(r.namespace.prefix, r.field_name)
            object_id = omeka_client.get_item_id_by_dc_identifier(r.value)
            if object_id != None:
                logger.info("Relating this item %s to another. Property %s, target %s", item.omeka_id, property_id, object_id)
                omeka_client.addItemRelation(item.omeka_id, property_id, object_id)
        download_and_upload_files(item)