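"""imagic.py

Scrape every <img> tag from a given URL, download the images into a
timestamped folder, then extract Exif metadata with the external exiv2
command-line tool and write a CSV report containing each image's Exif
data, MD5 sum and SHA-1 hash.

Requires the third-party packages requests and beautifulsoup4, and the
exiv2 tool available on the PATH.
"""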
import csv
import hashlib
import os
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# Create folder to store images
def folder_create(url, images):
    # get domain name from entered url
    domain_name = urlparse(url).netloc
    # get current date and time
    now = str(datetime.now())
    # format date to suit folder name parameters
    date_time = now.replace('-', '').replace(' ', '_').replace(':', '').split('.')
    # generate save location folder name
    folder_name = f"{domain_name}_{date_time[0]}"
    try:
        # create the save location folder
        os.mkdir(folder_name)
    except FileExistsError:
        # timestamped names rarely collide; reuse the folder if one does
        print(f"Folder '{folder_name}' already exists, reusing it.")
    # image download start
    download_images(images, folder_name, url)
    # read exif for all downloaded images
    time.sleep(1)
    getExifData(folder_name)
# Download every image found on the page into the save folder.
def download_images(images, folder_name, url):
    # initial count is zero
    count = 0
    # print total images found at the URL
    print(f"\nAttempting to download and save {len(images)} image(s) to '{folder_name}'\n")
    for image in images:
        # From the img tag, fetch the image source URL, preferring
        # lazy-loading attributes over the plain src:
        #   1. data-srcset  2. data-src  3. data-fallback-src  4. src
        image_link = None
        for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
            if image.get(attr):
                # a srcset value can list several candidates; take the first URL
                image_link = image[attr].split()[0].rstrip(',')
                break
        if image_link is None:
            # no usable source URL found in this tag
            continue
        # After getting the image source URL, try to fetch the image content
        try:
            # resolve relative source paths against the page URL;
            # absolute links pass through urljoin unchanged
            image_link = urljoin(url, image_link)
            # rebuild the link from its parts to drop any query string
            parsed = urlparse(image_link)
            formatted_link = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            r = requests.get(formatted_link).content
            # get image name and extension from the link
            basename = os.path.basename(formatted_link)
            img_name, img_ext = os.path.splitext(basename)
            try:
                # binary image data will fail a utf-8 decode; anything that
                # decodes cleanly is text, not an image, and is skipped
                r = str(r, 'utf-8')
            except UnicodeDecodeError:
                # calculate the md5 hash of the downloaded image and append a
                # short prefix of it to the original filename, so duplicate
                # names don't overwrite each other
                img_hash = hashlib.md5(r).hexdigest()
                image_path = f"{folder_name}/{img_name}_{img_hash[:8]}{img_ext}"
                with open(image_path, "wb+") as f:
                    f.write(r)
                # rudimentary image download progress
                count += 1
                print(f"{count} of {len(images)} image(s) downloaded")
        except (requests.RequestException, OSError):
            # there is a possibility not all images will download; skip failures
            pass
    # if all images were downloaded
    if count == len(images):
        print("\nAll images downloaded!\n")
    # if not all images were downloaded
    else:
        not_downloaded = len(images) - count
        print(f"\n{count} image(s) downloaded. Failed to download {not_downloaded} image(s)\n")
# Get Exif data and hashes for all downloaded images.
def getExifData(folder_name):
    # reuse the same folder created to store the images
    files = Path(folder_name)
    # count files scanned, failed Exif scans, and files with/without Exif
    fileCount = 0
    failedscannedCount = 0
    fileswithnoexif = 0
    fileswithexif = 0
    # give the CSV report the same name as the folder
    formatted_name = folder_name.strip()
    csv_file = open(f"{formatted_name}.csv", 'w', newline='')
    writer = csv.writer(csv_file, delimiter=",")
    # CSV columns
    header = ['Image Name', "Exif Data", "Size", "MD5 Sum Hash", "SHA 1 Hash"]
    writer.writerow(header)
    # loop through all files to get Exif data and hashes
    for file in files.iterdir():
        # housekeeping: skip anything that is not a regular file
        if not file.is_file():
            continue
        # get the full directory path and the file name
        imgFilepath = Path(file).resolve()
        imgFilename = Path(file).name
        fileCount += 1
        try:
            # run the exiv2 command with its Exif-specific print option
            exfildata_command = subprocess.run(
                ["exiv2", "-p", "e", imgFilepath],
                capture_output=True, text=True)
            exfildata = exfildata_command.stdout.strip()
            # check whether the image has Exif data; images without any
            # all get the same placeholder string (and hence the same size)
            if len(exfildata) == 0:
                exfildata = "No Exif Data Detected."
                writer.writerow([imgFilename, exfildata, len(exfildata),
                                 imgMD5(imgFilepath), imgSHA1(imgFilepath)])
                fileswithnoexif += 1
            # image has Exif data
            else:
                print(f"Size is {len(exfildata)}")
                writer.writerow([imgFilename, exfildata, len(exfildata),
                                 imgMD5(imgFilepath), imgSHA1(imgFilepath)])
                fileswithexif += 1
        except Exception:
            print(f"Error encountered with {imgFilepath}, proceeding on")
            failedscannedCount += 1
    print(f"Total files scanned: {fileCount}, files with Exif data: {fileswithexif}, "
          f"files without Exif data: {fileswithnoexif}, "
          f"failed Exif scans: {failedscannedCount}")
    csv_file.close()
# Get MD5 sum, reading the file in 4 KB chunks
def imgMD5(imgPath):
    md5_hash = hashlib.md5()
    with open(imgPath, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            md5_hash.update(byte_block)
    return md5_hash.hexdigest()
# Get SHA-1 hash
def imgSHA1(imgPath):
    # make a hash object
    h = hashlib.sha1()
    # open file for reading in binary mode
    with open(imgPath, 'rb') as file:
        # read 1024 bytes at a time until the end of the file
        for chunk in iter(lambda: file.read(1024), b''):
            h.update(chunk)
    # return the hex representation of the digest
    return h.hexdigest()
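# Note: on Python 3.11+ the two chunked-read helpers above could be
# replaced with the standard-library hashlib.file_digest(), e.g.
# hashlib.file_digest(f, "md5").hexdigest(); the manual loops are kept
# for compatibility with older interpreters.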
# Main function
def main(url):
    # fetch the content of the URL
    r = requests.get(url)
    # parse the HTML code
    soup = BeautifulSoup(r.text, 'html.parser')
    # find all images on the page
    images = soup.find_all('img')
    # call the folder create function
    folder_create(url, images)

if __name__ == "__main__":
    # take the target URL from the user
    url = input("Enter URL:- ")
    # call the main function
    main(url)
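# Example session (a sketch; assumes exiv2 is installed and on the PATH,
# and that the page at the entered URL serves <img> tags):
#   $ python imagic.py
#   Enter URL:- https://example.com
# Images are saved to a folder named <domain>_<timestamp>, and the Exif
# and hash report is written next to it as <domain>_<timestamp>.csv.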