Commit b284a39

Add images/scrap to recursively download images off a website
1 parent 8fc91c7 commit b284a39

2 files changed: +152 -0 lines changed

images/scrap (+151)
@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Scraper script intended to batch download images from a website.
Requires the 'requests' and 'bs4' packages.
"""
import os
import re
import sys
import argparse
from collections import deque
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup as Soup


# follow links into these extensions only
PAGE_FORMATS = {'', '.html', '.htm', '.php'}

# forbidden characters in a filename
FORBIDDEN = '<>:"/\\|?*'
FORBIDDEN_FIX = '_'


def download(url, minsize=1, overwrite=False):
    """
    Downloads the file located at URL if its Content-Length is at least
    minsize (in bytes). If overwrite is False, a unique file name will
    be found while the file already exists; otherwise the existing file
    is overwritten.

    Returns True if the file was downloaded, and False otherwise.
    """
    if '%0A' in url:  # normal filenames won't have a newline
        return False

    r = requests.get(url, stream=True)
    if int(r.headers['Content-length']) < minsize:
        return False

    name = os.path.basename(url)
    for c in FORBIDDEN:
        name = name.replace(c, FORBIDDEN_FIX)

    if not overwrite:
        oriname, ext = os.path.splitext(name)
        name = oriname
        n = 1
        while os.path.isfile(name + ext):
            name = '{} ({})'.format(oriname, n)
            n += 1
        name = name + ext

    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)

    return True


def scrap(url, netloc, minsize, formats, overwrite, pathre):
    """
    Scrapes the given URL, downloading every linked file whose extension
    is in formats and whose size is at least minsize.

    If the netloc string is present, the scraper will recurse into all
    domains that match it; otherwise, only URL is scraped.

    The overwrite parameter describes whether items with the same name
    should be overwritten, or alternative names should be found.

    Finally, pathre works in conjunction with netloc: it must be a regex
    that matches the path (including the slash) of the URLs that the
    scraper should recurse into.
    """
    visited = {urlparse(url).path}  # visited paths
    checked = set()  # files that were attempted to download

    # keep track of how many files were downloaded from each tag
    imgtag = 0
    atag = 0

    urls = deque()
    urls.append(url)
    while urls:
        r = requests.get(urls.popleft())
        soup = Soup(r.text, 'html.parser')

        # if we have downloaded 0 images from this tag while we have images on
        # the other tag, we probably won't find any using this tag so don't.
        if imgtag != 0 or atag == 0:
            for img in soup.find_all('img'):
                if img['src'] not in checked:
                    # TODO Could this be smart enough to detect what
                    # files that have been downloaded look like and just
                    # ignore the next ones (like thumbnails)?
                    imgtag += int(download(img['src'], minsize=minsize))
                    checked.add(img['src'])

        for a in soup.find_all('a'):
            url = urlparse(a.get('href'))
            if not url.path:
                continue

            _, ext = os.path.splitext(os.path.basename(url.path))
            ext = ext.lower()
            if ext in formats:
                if a['href'] not in checked:
                    atag += int(download(a['href'], minsize=minsize))
                    checked.add(a['href'])

            elif (url.netloc == netloc and
                    ext in PAGE_FORMATS and
                    pathre.match(url.path) and
                    url.path not in visited):
                visited.add(url.path)
                urls.append(a.get('href'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='website scraper to download images.')
    parser.add_argument('url', help='the url from which to recursively download images.')
    parser.add_argument('-m', '--minsize', type=int, default=30 * 1024, metavar='N',
                        help='minimum image size before downloading, default 30KB.')

    parser.add_argument('-f', '--formats', default='jpg,png,gif,jpeg', metavar='F',
                        help='''comma separated list of extensions considered to be images.
                        adding the dot or not is optional.''')

    parser.add_argument('-o', '--overwrite', action='store_true',
                        help='''overwrite existing files instead of figuring out unique names.
                        if this option is not specified and a file already exists,
                        the new name will get a " (n)" suffix.''')

    parser.add_argument('-r', '--recursive', nargs='?', default=False, metavar='REGEX',
                        help='''recursively scrape the website, without leaving the domain.
                        if the regex is provided, it must be a python-compliant regex
                        that will match the *path* (including the slash) of the
                        links to recurse into.''')

    args = parser.parse_args()
    try:
        scrap(args.url,
              netloc=urlparse(args.url).netloc if args.recursive is not False else None,
              minsize=max(args.minsize, 1),
              formats={'.' + f.strip('.').lower() for f in args.formats.split(',')},
              overwrite=args.overwrite,
              pathre=re.compile(args.recursive or ''))
    except KeyboardInterrupt:
        sys.stderr.write('operation interrupted by user.\n')
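
As a side note (not part of the commit), the unique-name behaviour documented in download() can be seen in isolation with a small sketch; the helper name and the example file names below are made up for illustration:

import os

def unique_name(name):
    # mirrors download()'s overwrite=False branch: keep appending ' (n)'
    # until no file with that name exists in the working directory
    oriname, ext = os.path.splitext(name)
    candidate, n = oriname, 1
    while os.path.isfile(candidate + ext):
        candidate = '{} ({})'.format(oriname, n)
        n += 1
    return candidate + ext

# with 'photo.jpg' and 'photo (1).jpg' already on disk,
# unique_name('photo.jpg') would return 'photo (2).jpg'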
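The -r/--recursive flag is the least obvious option: the optional REGEX is compiled and matched against the path component of each same-domain link before the scraper queues it. A minimal sketch of that check, using made-up URLs:

import re
from urllib.parse import urlparse

pathre = re.compile(r'^/gallery/')  # hypothetical value passed to -r

for href in ('http://example.com/gallery/page2.html',
             'http://example.com/blog/post.html'):
    path = urlparse(href).path
    # scrap() only queues links whose path matches the regex
    # (the domain and extension checks apply as well)
    print(path, bool(pathre.match(path)))

# prints:
# /gallery/page2.html True
# /blog/post.html False

So an invocation along the lines of ./scrap http://example.com/gallery/ -r '^/gallery/' would stay inside the gallery section of that (placeholder) domain; when -r is given without a regex, args.recursive is None and re.compile('') matches every path, so the whole domain is crawled.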

set-links.py (+1)
@@ -5,6 +5,7 @@
 
 
 utilities = (
+    'images/scrap',
     'mineutils/mc',
     'misc/gitmail',
     'misc/pipu',

0 commit comments
