from requests_html import HTMLSession
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

import os


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

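# Illustrative behaviour of is_valid() (example values, not part of the original script):
#   is_valid("https://example.com/logo.png")  -> True
#   is_valid("logo.png")                      -> False  (no scheme or domain)
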
def get_all_images(url):
    """
    Returns all image URLs found on a single `url`
    """
    # initialize the session
    session = HTMLSession()
    # make the HTTP request and retrieve the response
    response = session.get(url)
    # execute JavaScript so dynamically loaded images are included
    response.html.render()
    # construct the soup parser
    soup = bs(response.html.html, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src") or img.attrs.get("data-src")
        if not img_url:
            # if img does not contain a src (or data-src) attribute, just skip it
            continue
        # make the URL absolute by joining the domain with the extracted URL
        img_url = urljoin(url, img_url)
        # strip query strings from URLs like '/hsts-pixel.gif?c=3.2.5'
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, keep the URL only if it is valid
        if is_valid(img_url):
            urls.append(img_url)
    return urls

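# Illustrative call of get_all_images() (example output, not part of the original script):
#   get_all_images("https://www.example.com") -> a list of absolute image URLs,
#   e.g. ["https://www.example.com/logo.png", ...]
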
def download(url, pathname):
    """
    Downloads a file given a URL and puts it in the folder `pathname`
    """
    # if the path doesn't exist, create that directory
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    # download the body of the response in chunks, not all at once
    response = requests.get(url, stream=True)

    # get the total file size
    file_size = int(response.headers.get("Content-Length", 0))

    # get the file name
    filename = os.path.join(pathname, url.split("/")[-1])

    # progress bar, changing the unit to bytes instead of iterations (tqdm's default)
    progress = tqdm(response.iter_content(1024), f"Downloading {filename}", total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, "wb") as f:
        # iterate over progress.iterable (the raw chunk iterator) so the bar is
        # advanced only by the explicit update() call below, measured in bytes
        for data in progress.iterable:
            # write the chunk read to the file
            f.write(data)
            # update the progress bar manually with the number of bytes written
            progress.update(len(data))

def main(url, path):
    # get all image URLs on the page
    imgs = get_all_images(url)
    for img in imgs:
        # download each image
        download(img, path)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="This script downloads all images from a web page")
    parser.add_argument("url", help="The URL of the web page you want to download images from")
    parser.add_argument("-p", "--path", help="The directory you want to store your images in; defaults to the domain of the URL passed")

    args = parser.parse_args()
    url = args.url
    path = args.path

    if not path:
        # if a path isn't specified, use the domain name of the URL as the folder name
        path = urlparse(url).netloc

    main(url, path)
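
# Example usage (a sketch; the script file name below is assumed, not part of the original):
#   python download_images.py https://www.example.com/gallery -p gallery_images
# If -p/--path is omitted, the images are saved into a folder named after the
# URL's domain, e.g. "www.example.com".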