Skip to content

Commit ec0f854

Browse files
committed
added download images from javascript-driven websites script
1 parent 2aabf0f commit ec0f854

File tree

3 files changed

+101
-0
lines changed

3 files changed

+101
-0
lines changed

Diff for: web-scraping/download-images/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ To run this:
2424
python download_images.py https://www.thepythoncode.com/topic/web-scraping
2525
```
2626
A new folder `www.thepythoncode.com` will be created automatically that contains all the images of that web page.
27+
- If you want to download images from JavaScript-driven websites, use the `download_images_js.py` script instead (it accepts the same parameters).

Diff for: web-scraping/download-images/download_images_js.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
from requests_html import HTMLSession
2+
import requests
3+
from tqdm import tqdm
4+
from bs4 import BeautifulSoup as bs
5+
from urllib.parse import urljoin, urlparse
6+
7+
import os
8+
9+
10+
def is_valid(url):
    """
    Checks whether `url` is a valid URL.

    A URL is considered valid when it has both a scheme (e.g. `https`)
    and a network location (e.g. `www.example.com`).

    Returns `True` if both parts are present, `False` otherwise. Strings
    that `urlparse` rejects with `ValueError` are reported as invalid
    instead of propagating the exception.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed network locations such as
        # an unbalanced IPv6 bracket ("http://["); that is just an invalid URL
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)
16+
17+
18+
def get_all_images(url):
    """
    Returns all image URLs on a single `url`.

    The page is fetched with an `HTMLSession` and its JavaScript is executed
    before parsing, so `<img>` tags added by scripts are taken into account.
    For every `<img>` tag the `src` attribute is used, falling back to
    `data-src` when `src` is missing or empty.

    Returns a list of absolute image URLs with everything from the first `?`
    onwards removed, in page order and without duplicates.
    """
    # the `with` block closes the session once the rendered HTML is parsed,
    # so the resources held by the session are released even on errors
    with HTMLSession() as session:
        # make the HTTP request and retrieve response
        response = session.get(url)
        # execute Javascript
        response.html.render()
        # construct the soup parser
        soup = bs(response.html.html, "html.parser")

    urls = []
    seen = set()
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src") or img.attrs.get("data-src")
        if not img_url:
            # if img does not contain a usable src/data-src attribute, just skip
            continue
        try:
            # make the URL absolute by joining domain with the URL that is just extracted
            img_url = urljoin(url, img_url)
        except ValueError:
            # a single malformed attribute must not abort the whole extraction
            continue
        # remove the query string from URLs like '/hsts-pixel.gif?c=3.2.5'
        img_url = img_url.partition("?")[0]
        if img_url in seen:
            # the same image referenced again; downloading it once is enough
            continue
        # finally, if the url is valid
        if is_valid(img_url):
            seen.add(img_url)
            urls.append(img_url)
    return urls
48+
49+
50+
def download(url, pathname):
    """
    Downloads a file given an URL and puts it in the folder `pathname`.

    The folder is created if it does not exist yet. The file is named after
    the last path segment of `url` and is written chunk by chunk while a
    progress bar reports the number of bytes received.
    """
    # create the target folder if needed; exist_ok avoids the race between
    # checking for the directory and creating it
    os.makedirs(pathname, exist_ok=True)

    # get the file name; trailing slashes are dropped first because a URL
    # ending in "/" would otherwise produce an empty file name
    filename = os.path.join(pathname, url.rstrip("/").split("/")[-1])

    # download the body of response by chunk, not immediately;
    # the `with` block releases the connection when the download is over
    with requests.get(url, stream=True) as response:
        # get the total file size
        file_size = int(response.headers.get("Content-Length", 0))

        # progress bar, changing the unit to bytes instead of iteration (default by tqdm).
        # The bar does not wrap the chunk iterator: wrapping it would add 1 per
        # chunk on top of the manual byte count below and inflate the progress
        progress = tqdm(total=file_size, desc=f"Downloading {filename}", unit="B", unit_scale=True, unit_divisor=1024)
        with progress, open(filename, "wb") as f:
            for data in response.iter_content(1024):
                # write data read to the file
                f.write(data)
                # update the progress bar manually
                progress.update(len(data))
74+
75+
76+
def main(url, path):
    """
    Downloads every image found on the page `url` into the folder `path`.
    """
    # collect the image URLs of the page first, then fetch them one by one
    for image_url in get_all_images(url):
        download(image_url, path)
82+
83+
84+
85+
if __name__ == "__main__":
    import argparse

    # command line interface: a mandatory page URL and an optional target folder
    parser = argparse.ArgumentParser(description="This script downloads all images from a web page")
    parser.add_argument("url", help="The URL of the web page you want to download images")
    parser.add_argument("-p", "--path", help="The Directory you want to store your images, default is the domain of URL passed")
    args = parser.parse_args()

    url = args.url
    # when no folder is given, fall back to the domain name of the URL
    path = args.path or urlparse(url).netloc

    main(url, path)

Diff for: web-scraping/download-images/requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
requests
2+
requests_html
23
bs4
34
tqdm

0 commit comments

Comments
 (0)