@@ -12,7 +12,7 @@
 import xml.etree.ElementTree as ET

 SCRAPE_API = 'https://archive.org/services/search/v1/scrape'
-ADVANCED_SEARCH = 'https://archive.org/advancedsearch.php?'
+ADVANCED_SEARCH = 'https://archive.org/advancedsearch.php'
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
 ARCHIVE = 'https://archive.org'
@@ -22,6 +22,9 @@
 bookreader = "http://%s/BookReader/BookReaderImages.php"
 URI_PRIFIX = "https://iiif.archive.org/iiif"

+MAX_SCRAPE_LIMIT = 10_000
+MAX_API_LIMIT = 1_000
+
 class MaxLimitException(Exception):
     pass

@@ -35,7 +38,7 @@ def purify_domain(domain):
     domain = re.sub('^http:\/\/', "https://", domain)
     return domain if domain.endswith('/iiif/') else domain + 'iiif/'

-def getids(q, limit=1000, cursor='', sorts='', fields=''):
+def getids(q, cursor='', sorts='', fields='', limit=MAX_API_LIMIT):
     query = "(mediatype:(texts) OR mediatype:(image))" + \
         ((" AND %s" % q) if q else "")
     # 'all:1' also works
@@ -55,10 +58,9 @@ def scrape(query, fields="", sorts="", count=100, cursor="", security=True):
     if not query:
         raise ValueError("GET 'query' parameters required")

-    if int(count) > 1000 and security:
+    if int(count) > MAX_API_LIMIT and security:
         raise MaxLimitException("Limit may not exceed 1000.")

-    #sorts = sorts or 'date+asc,createdate'
     fields = fields or 'identifier,title'

     params = {
@@ -83,14 +85,13 @@ def search(query, page=1, limit=100, security=True, sort=None, fields=None):
     if int(limit) > 1000 and security:
         raise MaxLimitException("Limit may not exceed 1000.")

-    sort = sort or 'sort%5B%5D=date+asc&sort%5B%5D=createdate'
-    fields = fields or 'identifier,title'
     return requests.get(
-        ADVANCED_SEARCH + sort,
+        ADVANCED_SEARCH,
         params={'q': query,
+                'sort[]': sort or ['date asc', 'createdate'],
                 'rows': limit,
                 'page': page,
-                'fl[]': fields,
+                'fl[]': fields or 'identifier,title',
                 'output': 'json',
                 }).json()

@@ -172,12 +173,12 @@ def create_collection3(identifier, domain, page=1, rows=1000):

     addMetadata(collection, identifier, metadata['metadata'], collection=True)

-    asURL = f'https://archive.org/advancedsearch.php?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes'
+    asURL = f'{ADVANCED_SEARCH}?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes'
     itemsSearch = requests.get(asURL).json()
     total = itemsSearch['response']['numFound']
     # There is a max of 10,000 items that can be retrieved from the advanced search
-    if total > 10000:
-        total = 10000
+    if total > MAX_SCRAPE_LIMIT:
+        total = MAX_SCRAPE_LIMIT

     if len(itemsSearch['response']['docs']) == 0:
         return None
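
A minimal sketch (not part of the diff) of the assumption behind the sort[] change in search(): requests form-encodes a list value as repeated query parameters, so passing the list should reproduce the hand-built 'sort%5B%5D=...' string that was removed. urllib.parse.urlencode with doseq=True mirrors that encoding:

# Hypothetical check, not from the PR: encode the new list-style sort[] value
from urllib.parse import urlencode

params = {'sort[]': ['date asc', 'createdate']}
print(urlencode(params, doseq=True))
# -> sort%5B%5D=date+asc&sort%5B%5D=createdate
# i.e. the same repeated sort[] parameters as the removed literal
# 'sort%5B%5D=date+asc&sort%5B%5D=createdate' appended to ADVANCED_SEARCH.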