Skip to content

Commit

Permalink
use constants, tidy
Browse files Browse the repository at this point in the history
  • Loading branch information
mekarpeles authored Sep 27, 2024
1 parent 7685d02 commit a3c7e1f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 12 deletions.
2 changes: 1 addition & 1 deletion iiify/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ def mainentry():
def index():
    """List all available book and image items on Archive.org.

    Reads optional query-string parameters:
      q      -- free-text query appended to the mediatype filter
      fields -- comma-separated metadata fields to return
      sorts  -- comma-separated sort specifications
      cursor -- scrape-API pagination cursor from a previous response

    Returns a JSON response produced by getids().
    """
    q = request.args.get('q', '')
    fields = request.args.get('fields', '')
    sorts = request.args.get('sorts', '')
    # NOTE(review): the scraped diff showed `cursor` assigned twice (removed
    # and added lines both present); a single assignment is the intended code.
    cursor = request.args.get('cursor', '')
    return jsonify(getids(q, cursor=cursor, fields=fields, sorts=sorts))


Expand Down
23 changes: 12 additions & 11 deletions iiify/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import xml.etree.ElementTree as ET

SCRAPE_API = 'https://archive.org/services/search/v1/scrape'
ADVANCED_SEARCH = 'https://archive.org/advancedsearch.php?'
ADVANCED_SEARCH = 'https://archive.org/advancedsearch.php'
IMG_CTX = 'http://iiif.io/api/image/2/context.json'
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
ARCHIVE = 'https://archive.org'
Expand All @@ -22,6 +22,9 @@
bookreader = "http://%s/BookReader/BookReaderImages.php"
URI_PRIFIX = "https://iiif.archive.org/iiif"

MAX_SCRAPE_LIMIT = 10_000
MAX_API_LIMIT = 1_000

class MaxLimitException(Exception):
    # Raised when a caller requests more rows than the API permits
    # (see the `security` limit checks in scrape()/search()).
    pass

Expand All @@ -35,7 +38,7 @@ def purify_domain(domain):
domain = re.sub('^http:\/\/', "https://", domain)
return domain if domain.endswith('/iiif/') else domain + 'iiif/'

def getids(q, limit=1000, cursor='', sorts='', fields=''):
def getids(q, cursor='', sorts='', fields='', limit=MAX_API_LIMIT):
query = "(mediatype:(texts) OR mediatype:(image))" + \
((" AND %s" % q) if q else "")
# 'all:1' also works
Expand All @@ -55,10 +58,9 @@ def scrape(query, fields="", sorts="", count=100, cursor="", security=True):
if not query:
raise ValueError("GET 'query' parameters required")

if int(count) > 1000 and security:
if int(count) > MAX_API_LIMIT and security:
raise MaxLimitException("Limit may not exceed 1000.")

#sorts = sorts or 'date+asc,createdate'
fields = fields or 'identifier,title'

params = {
Expand All @@ -83,14 +85,13 @@ def search(query, page=1, limit=100, security=True, sort=None, fields=None):
if int(limit) > 1000 and security:
raise MaxLimitException("Limit may not exceed 1000.")

sort = sort or 'sort%5B%5D=date+asc&sort%5B%5D=createdate'
fields = fields or 'identifier,title'
return requests.get(
ADVANCED_SEARCH + sort,
ADVANCED_SEARCH,
params={'q': query,
'sort[]': sort or ['date asc', 'createdate'],
'rows': limit,
'page': page,
'fl[]': fields,
'fl[]': fields or 'identifier,title',
'output': 'json',
}).json()

Expand Down Expand Up @@ -172,12 +173,12 @@ def create_collection3(identifier, domain, page=1, rows=1000):

addMetadata(collection, identifier, metadata['metadata'], collection=True)

asURL = f'https://archive.org/advancedsearch.php?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes'
asURL = f'{ADVANCED_SEARCH}?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes'
itemsSearch = requests.get(asURL).json()
total = itemsSearch['response']['numFound']
# There is a max of 10,000 items that can be retrieved from the advanced search
if total > 10000:
total = 10000
if total > MAX_SCRAPE_LIMIT:
total = MAX_SCRAPE_LIMIT

if len(itemsSearch['response']['docs']) == 0:
return None
Expand Down

0 comments on commit a3c7e1f

Please sign in to comment.