#!/usr/bin/env python3
"""
Scraper script to batch-download images from a website.
Requires the 'requests' and 'beautifulsoup4' packages.
"""
import os
import re
import sys
import argparse
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup as Soup


# follow links into these extensions only
PAGE_FORMATS = {'', '.html', '.htm', '.php'}

# characters that are not allowed in a filename, replaced by FORBIDDEN_FIX
FORBIDDEN = '<>:"/\\|?*'
FORBIDDEN_FIX = '_'


def download(url, minsize=1, overwrite=False):
    """
    Downloads the file located at url if its Content-Length is at least
    minsize (in bytes). If overwrite is False and a file with the same
    name already exists, a unique name is chosen; otherwise the existing
    file is overwritten.

    Returns True if the file was downloaded, and False otherwise.
    """
    if '%0A' in url:  # normal filenames won't have a newline
        return False

    r = requests.get(url, stream=True)
    if not r.ok:
        return False

    # skip files that are too small (or whose size cannot be determined)
    if int(r.headers.get('Content-Length', 0)) < minsize:
        return False

    # sanitize the filename for the local filesystem
    name = os.path.basename(url)
    for c in FORBIDDEN:
        name = name.replace(c, FORBIDDEN_FIX)

    if not overwrite:
        # find a unique name by appending ' (n)' before the extension
        oriname, ext = os.path.splitext(name)
        name = oriname
        n = 1
        while os.path.isfile(name + ext):
            name = '{} ({})'.format(oriname, n)
            n += 1
        name = name + ext

    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)

    return True


def scrap(url, netloc, minsize, formats, overwrite, pathre):
    """
    Scrapes the given url, downloading every linked file whose extension
    is in formats and whose size is at least minsize bytes.

    If netloc is not None, the scraper recurses into every link whose
    domain matches it; otherwise only the given url is scraped.

    The overwrite parameter decides whether files with the same name are
    overwritten or saved under an alternative, unique name.

    Finally, pathre works in conjunction with netloc: it must be a regex
    that matches the path (including the leading slash) of the URLs the
    scraper should recurse into.
    """
    visited = {urlparse(url).path}  # paths already visited
    checked = set()  # files that we already attempted to download

    # keep track of how many files were downloaded from each tag
    imgtag = 0
    atag = 0

    urls = deque()
    urls.append(url)
    while urls:
        page = urls.popleft()
        r = requests.get(page)
        soup = Soup(r.text, 'html.parser')

        # if we downloaded 0 images from this tag while the other tag did
        # yield images, we probably won't find any here, so skip it.
        if imgtag != 0 or atag == 0:
            for img in soup.find_all('img'):
                src = img.get('src')
                if not src:
                    continue

                src = urljoin(page, src)  # resolve relative URLs
                if src not in checked:
                    # TODO Could this be smart enough to detect what the
                    # downloaded files look like and skip the next ones
                    # (e.g. thumbnails)?
                    imgtag += int(download(src, minsize=minsize,
                                           overwrite=overwrite))
                    checked.add(src)

        for a in soup.find_all('a'):
            href = a.get('href')
            if not href:
                continue

            link = urljoin(page, href)  # resolve relative URLs
            parts = urlparse(link)
            if not parts.path:
                continue

            _, ext = os.path.splitext(os.path.basename(parts.path))
            ext = ext.lower()
            if ext in formats:
                if link not in checked:
                    atag += int(download(link, minsize=minsize,
                                         overwrite=overwrite))
                    checked.add(link)

            elif (parts.netloc == netloc and
                    ext in PAGE_FORMATS and
                    pathre.match(parts.path) and
                    parts.path not in visited):
                visited.add(parts.path)
                urls.append(link)


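# Programmatic use (illustrative; the URL, domain and regex are hypothetical):
#   scrap('https://example.com/gallery', netloc='example.com',
#         minsize=30 * 1024, formats={'.jpg', '.png'},
#         overwrite=False, pathre=re.compile(r'^/gallery/'))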
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='website scraper to download images.')
    parser.add_argument('url', help='the url from which to recursively download images.')
    parser.add_argument('-m', '--minsize', type=int, default=30 * 1024, metavar='N',
                        help='minimum image size in bytes before downloading, default 30 KiB.')

    parser.add_argument('-f', '--formats', default='jpg,png,gif,jpeg', metavar='F',
                        help='''comma-separated list of extensions considered to be images.
                        the leading dot is optional.''')

    parser.add_argument('-o', '--overwrite', action='store_true',
                        help='''overwrite existing files instead of finding unique names.
                        if this option is not given and a file already exists, the new
                        name gets a " (n)" suffix.''')

    parser.add_argument('-r', '--recursive', nargs='?', default=False, metavar='REGEX',
                        help='''recursively scrapes the website without leaving the domain.
                        if a regex is provided, it must be a python-compliant regex that
                        matches the *path* (including the leading slash) of the links to
                        recurse into.''')

    args = parser.parse_args()
    try:
        scrap(args.url,
              netloc=urlparse(args.url).netloc if args.recursive is not False else None,
              minsize=max(args.minsize, 1),
              formats={'.' + f.strip('.').lower() for f in args.formats.split(',')},
              overwrite=args.overwrite,
              pathre=re.compile(args.recursive or ''))
    except KeyboardInterrupt:
        sys.stderr.write('operation interrupted by user.\n')