forked from lcnetdev/lds-processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbf2m-bfdb-idlist.py
executable file
·120 lines (93 loc) · 3.07 KB
/
bf2m-bfdb-idlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# Works on the atom feed URL of loaded instances (instance IDs in bfdb);
# works on a curl of an instance ID, e.g. c0213880820001:
# https://preprod-8230.id.loc.gov/resources/instances/c0213880820001.marc-pkg.xml
#
import glob
import sys
from lxml import etree as ET
from lxml.builder import ElementMaker
import os
import shutil
import multiprocessing
import subprocess
import urllib
import argparse
import feedparser
import yaml
from datetime import date, timedelta
from modules.helpers import get_config
from modules.config_parser import args
# For each entry, curl the MARC package to our dir, then get the next feed URL (entry rel=next/@href).
class FileResolver(ET.Resolver):
    """lxml resolver that treats every referenced URL as a filename to load."""

    def resolve(self, url, pubid, context):
        """Resolve *url* through ``resolve_filename``; *pubid* is not used."""
        resolved = self.resolve_filename(url, context)
        return resolved
####################
# Main Program
# Job setup: read the YAML job configuration, prepare the XML parser, empty
# the input directory of old .rdf files and compile the BF-to-MARC stylesheet.
print("*** BF to MARC List of Instance IDs ***")
print()
####################
# NOTE(review): the value returned by get_config() is replaced right away by
# the YAML load below; the call is kept in case it has side effects -- confirm
# which of the two is the intended source of the configuration.
config = get_config(args)
# Load through a context manager so the config file handle is always closed.
with open(args.config) as config_file:
    config = yaml.safe_load(config_file)
job = args.job
jobconfig = config[job]
indir = jobconfig["source_directory"]
outdir = jobconfig["target_directory"]
filename = jobconfig["infile"]
# Paths are built by plain concatenation, so the configured directories are
# expected to end with a path separator. The ID list is read from outdir.
infile = outdir + filename
idtype = "bib"
curl = jobconfig["curl"]
# Parser used for both the stylesheet and the RDF input; FileResolver lets
# references inside those documents be resolved as filenames.
parser = ET.XMLParser()
parser.resolvers.add(FileResolver())
efilename = outdir + '/error.txt'
# clean out the input dir: (rdf only)
files = glob.glob(indir + '*.rdf')
for stale_rdf in files:
    os.remove(stale_rdf)
# NOTE: str.replace swaps every 'txt' occurrence in the name, not only the
# extension; a name without 'txt' makes outfile identical to infile.
outfile = outdir + filename.replace('txt', 'xml')
print()
print("-----------------------------")
print("Job config:")
print(jobconfig)
print("In dir is ", indir)
print("Out dir is ", outdir)
print('results in :', outfile)
print("-----------------------------")
# Compile the BIBFRAME-to-MARC XSLT once; it is applied to every input file.
bfstylesheet = jobconfig["bfstylesheet"]
bf2marc = ET.parse(bfstylesheet, parser)
bf2marcxsl = ET.XSLT(bf2marc)
# Fetch the MARC package (RDF/XML) of every listed instance ID into the input
# directory, one <id>.rdf file per ID.
with open(infile, 'r') as biblist:
    # Ignore blank lines so an empty ID never turns into a bogus request.
    bibids = [line.strip() for line in biblist if line.strip()]
marc_pkg_url = "https://preprod-8230.id.loc.gov/resources/instances/%BIBID%.marc-pkg.xml"
for bibid in bibids:
    # print ("curling from metaproxy dev: "+ bibid)
    # Write into indir (same concatenation convention as the '*.rdf' globs in
    # this script) instead of a hard-coded "in/", so the conversion step finds
    # the files that were just downloaded.
    rdf_path = indir + bibid + ".rdf"
    # Argument list + no shell: the ID is never interpreted by a shell.
    # --fail makes curl return non-zero on HTTP errors instead of saving the
    # server's error page as if it were RDF.
    curlcmd = [
        "curl", "-L", "--fail",
        "-o", rdf_path,
        marc_pkg_url.replace('%BIBID%', bibid),
    ]
    returned_value = subprocess.run(curlcmd).returncode
    if returned_value != 0:
        print("curl failed for", bibid, "- exit code", returned_value)
        # Drop any partial download so it is not fed to the converter.
        if os.path.exists(rdf_path):
            os.remove(rdf_path)
#============================
# Convert every downloaded .rdf file to a MARC record with the compiled
# stylesheet and write all records as one marc:collection document.
bibfiles = glob.glob(indir + '*.rdf')
total = len(bibfiles)
counter = 0
failed = 0
# create output marcxml:collection:
M = ElementMaker(namespace="http://www.loc.gov/MARC21/slim",
                 nsmap={"marc": "http://www.loc.gov/MARC21/slim"})
coll = M.collection()
for counter, rdf_path in enumerate(bibfiles, start=1):
    if counter % 100 == 0:
        print(counter, '/', total)
    print("converting to marc: " + rdf_path)
    try:
        bfroot = ET.parse(rdf_path, parser).getroot()
        # result has marc
        result = bf2marcxsl(bfroot)
        record = ET.XML(bytes(result))
    except (ET.LxmlError, OSError) as err:
        # Skip this file: continuing past the failure would either raise
        # NameError or re-add the previous file's record a second time.
        failed += 1
        print("Unexpected error:", type(err), err)
        continue
    coll.append(record)
# Open the output only once the collection is complete, so an aborted run
# does not leave a truncated file behind; 'with' closes it.
with open(outfile, 'wb') as out:
    out.write(ET.tostring(coll))
if failed:
    print(failed, "of", total, "files could not be converted")
print("Done with ", job, " job : check: ", outfile)
#print(glob.glob("out/*xml"))