This repository has been archived by the owner on Jul 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathia-wishlist-offerings.py
145 lines (131 loc) · 4.76 KB
/
ia-wishlist-offerings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" Bot to scrape a list of book offerings from ISBN """
#
# (C) Federico Leva, 2018
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.1.2'
import requests
try:
from isbnlib import is_isbn10, is_isbn13, isbn_from_words
except ImportError:
print("WARNING: No isbnlib, cannot query ISBNS")
from lxml import html
import re
from time import sleep
try:
import unicodecsv as csv
except ImportError:
import csv
# Two separate HTTP sessions so cookies/connection pools don't mix between
# shops: `s` is used for Libraccio, `t` for Abebooks and Alibris.
s = requests.Session()
t = requests.Session()
# Optional extra request headers (user-agent, cookie) for the scrapers.
# NOTE(review): getAbebooks() references `headers`; leaving this commented
# out makes that reference a NameError at runtime — confirm intent.
#headers = {'user-agent': '', 'cookie': ''}
def getAbebooks(isbn, fulltitle=None, keywords=None, headers=None):
    """Scrape the cheapest Abebooks.it offering for a book.

    Exactly one search criterion is used, in priority order:
    isbn, then fulltitle (a dict with 'authors', 'publisher', 'title',
    'year' keys as produced by parseItalianCitation), then free-text
    keywords.

    :param headers: optional dict of extra HTTP headers; defaults to the
        module-level ``headers`` dict if defined, else no extra headers.
    :return: [isbn, seller, price, title, year, notes], or None when no
        search criterion was given.
    :raises IndexError: when the results page has no offering (callers
        catch this).
    """
    notes = ''
    year = ''
    if isbn:
        url = 'https://www.abebooks.it/servlet/SearchResults?isbn={}&sortby=2'.format(isbn)
    elif fulltitle:
        url = 'https://www.abebooks.it/servlet/SearchResults?an={}&pn={}&tn={}&yrl={}&sortby=2'.format(
            fulltitle['authors'], fulltitle['publisher'], fulltitle['title'], fulltitle['year'])
        year = fulltitle['year']
    elif keywords:
        url = 'https://www.abebooks.it/servlet/SearchResults?kn={}&sortby=2'.format(keywords)
    else:
        return None
    # BUG FIX: the module-level `headers` dict is commented out, so the
    # original unconditional reference raised NameError. Fall back to it
    # only if it exists, else send no extra headers.
    if headers is None:
        headers = globals().get('headers', {})
    r = t.get(url, timeout=10, headers=headers).text
    listing = html.fromstring( r )
    # "sortby=2" sorts by total price, so result #1 is the cheapest offer.
    price = listing.xpath('//div[@id="srp-item-price-1"]/text()')[0]
    seller = listing.xpath('//a[text()="Informazioni sul venditore"]/@href')[0] or ''
    title = listing.xpath('//li[@id="book-1"]//h2[@itemprop="offers"]/a[@itemprop="url"]/span/text()')[0] or ''
    # Description line containing a plausible year (" 19xx" / " 20xx").
    description = listing.xpath('//li[@id="book-1"]//p[contains(@class, "p-md-t")]/span[contains(.," 19") or contains(.," 20")]/text()') or ''
    if not isbn:
        # Searched by title/keywords: recover the ISBN from the result page.
        isbn = listing.xpath('//li[@id="book-1"]//p[contains(@class, "isbn")]//a/@title')[-1]
    if description:
        year = re.findall(r'\b(?:19|20)[0-9]{2}\b', description[0])[0]
    if 'Anybook' in seller:
        # Anybook encodes its stock code in the description; keep it as a note.
        notes = listing.xpath('//li[@id="book-1"]//p[contains(@class, "p-md-t")]/text()')[0].strip().replace('Codice articolo ', '')
    return [isbn, seller, price, title, year, notes]
def getAlibris(isbn):
    """Return the cheapest Alibris offering for *isbn*, or None if absent.

    Only the price column is scraped; the remaining fields are left blank
    so the row shape matches the other scrapers.

    :raises IndexError: when the results page has no price cell.
    """
    # Be polite: pause before every query so we don't hammer Alibris.
    sleep(4)
    page = t.get('https://www.alibris.com/search/books/isbn/%s' % isbn , timeout=10).text
    # A fruitless ISBN search bounces back to the blank search form.
    if 'Enter Your Search Terms' in page:
        return None
    tree = html.fromstring(page)
    cheapest = tree.xpath('//td[@class="price"]/p/text()')[0]
    return [isbn, '', cheapest.strip(), '', '', '']
def getLibraccio(isbn):
    """Scrape the current selling price for *isbn* from libraccio.it.

    :return: a 6-element row matching the CSV header written by main():
        [ISBN, Bookshop, Price, Title, Year, Notes].
    :raises IndexError: when the page has no price element.
    """
    # Add the same 10s timeout the other scrapers use.
    listing = html.fromstring( s.get('http://libraccio.it/libro/%s/' % isbn, timeout=10).text )
    price = listing.xpath('//span[@class="currentprice" and @id="C_C_ProductDetail_lSellPriceU"]/text()')[0]
    # BUG FIX: pad to six columns so rows align with the CSV header and the
    # rows produced by getAbebooks/getAlibris (the original returned four).
    return [isbn, u'Libraccio', price, u'', u'', u'']
def parseItalianCitation(wikitext):
    """Extract a few fields from an Italian book-citation template.

    Simplistic field scraper, not a real template parser
    (TODO: https://github.com/dissemin/wikiciteparser/issues/2).

    :return: dict with 'title', 'authors', 'publisher', 'year' keys when
        enough data was found; False when the combined field text is too
        short to be useful; None on a parse failure.
    """
    def _grab(pattern):
        # All case-insensitive matches of a "param = value" field.
        return re.findall(pattern, wikitext, flags=re.I)

    try:
        citation = {
            'title': ' '.join(_grab('titolo *= *([^|}]+)')),
            'authors': ' '.join(_grab('(?:nome|cognome|autore|curatore) *= *([^|}]+)')),
            'publisher': ' '.join(_grab('editore *= *([^|}]+)')),
            'year': ''.join(_grab('anno *= *([^|}]+)')).strip(),
        }
    except IndexError:
        return None
    # Require a minimum amount of combined text before trusting the parse.
    return citation if len(''.join(citation.values())) > 10 else False
def main():
    """Read codes from wishlist.txt (one ISBN or wiki citation per line)
    and append any offering found on Abebooks to wishlist-offerings.csv
    as tab-separated rows.
    """
    # BUG FIX: use context managers so both files are closed even when an
    # unexpected exception escapes the loop (the original leaked them).
    with open('wishlist.txt', 'r') as wishlist, \
            open('wishlist-offerings.csv', 'a') as offerings:
        writer = csv.writer(offerings,
                            delimiter='\t',
                            lineterminator='\n',
                            quoting=csv.QUOTE_MINIMAL,
                            )
        writer.writerow([u'ISBN', u'Bookshop', u'Price', u'Title', u'Year', 'Notes'])
        for code in wishlist:
            code = code.strip()
            keywords = None
            if is_isbn10(code) or is_isbn13(code):
                isbn = code
                fulltitle = None
            else:
                # Not an ISBN: try to parse it as an Italian citation
                # template, stripping wiki-link brackets first.
                isbn = None
                cleaned = re.sub(r'[\[\]]', '', code)
                fulltitle = parseItalianCitation(cleaned)
                print(fulltitle)
                # Fallback keyword search: every "|param = value" value.
                keywords = ' '.join(re.findall(r'\| *(?:\w+ *= *)?([^|}]+)', cleaned))
                if not fulltitle:
                    print(keywords)
            try:
                offer = getAbebooks(isbn, fulltitle, keywords)
                if keywords and not offer:
                    # Structured search failed; retry with bare keywords.
                    offer = getAbebooks(None, None, keywords)
                if offer:
                    writer.writerow(offer)
                    print("INFO: Found {}".format(offer[0]))
                else:
                    print("NOTICE: Not found: {}".format(isbn))
                sleep(0.1)
            except IndexError:
                # Scrapers raise IndexError when the page has no offering.
                print("NOTICE: Not found: {}".format(isbn))
            except requests.exceptions.ConnectionError:
                print("WARNING: Connection error. Sleeping.")
                sleep(5)
            except requests.exceptions.ReadTimeout:
                print("WARNING: Connection timeout. Sleeping.")
                sleep(15)
            except Exception as e:
                # Best-effort batch job: log, back off, and keep going.
                print("ERROR: Unexpected exception")
                print(e)
                sleep(30)
if __name__ == "__main__":
main()