Skip to content

Commit 027a540

Browse files
committed
initializing repo with working copy
0 parents  commit 027a540

11 files changed

+179
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/storage
2+
/.idea

Download.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import urllib.parse
2+
import urllib.request
3+
import os.path
4+
import errno
5+
import SaveFile
6+
7+
8+
class Download():
    """Download every URL listed in a text file and track which ones finished.

    The link file holds one URL per line. After a download run the file is
    rewritten so that it only lists the links that were not completed.
    """

    def __init__(self, file_name, path=''):
        """Load the links stored in *file_name*.

        Args:
            file_name: Path of the text file holding one link per line.
            path: Directory handed to SaveFile as the download target
                ('' means the current working directory).

        Raises:
            FileNotFoundError: If *file_name* does not exist.
        """
        self.links = set()
        self.completed = set()
        self.file_name = file_name
        self.path = path
        self.file_to_set()

    def file_to_set(self):
        """Read the link file into ``self.links`` and return the links sorted.

        Surrounding whitespace (including '\\r\\n' line endings) is stripped
        and blank lines are skipped so that no empty link is ever queued.

        Raises:
            FileNotFoundError: If the link file does not exist.
        """
        # open() itself raises FileNotFoundError (errno ENOENT) for a missing
        # file, so no separate existence check is needed.
        with open(self.file_name, 'rt') as f:
            for line in f:
                link = line.strip()
                if link:
                    self.links.add(link)
        return sorted(self.links)

    def start(self):
        """Download every link, then rewrite the link file with what is left."""
        for link in self.links:
            try:
                img = SaveFile.SaveFile(link, self.path)
                img.save()
            except Exception as e:
                # Best effort: report the failure and keep the link queued.
                # Only Exception is caught so Ctrl-C still stops the run.
                print(str(e))
                continue
            self.completed.add(link)
        self.set_to_file()

    def set_to_file(self):
        """Rewrite the link file so it contains only the unfinished links."""
        remaining = self.links.difference(self.completed)
        with open(self.file_name, 'w') as f:
            # Sorted so the file content is deterministic between runs.
            f.writelines(link + "\n" for link in sorted(remaining))

ImgFinder.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from html.parser import HTMLParser
2+
import urllib.parse
3+
import functions
4+
5+
6+
class ImgFinder(HTMLParser):
    """HTML parser that collects the ``src`` of every ``<img>`` tag it is fed.

    Attributes set by the constructor:
        page_url: The URL the parser was created for.
        base_url: Network location (host) part of ``page_url``.
        folder: Folder name derived from the host via functions.get_folder_name.
        path: Path component of ``page_url``.
        src: Set of absolute image URLs found so far.
    """

    def __init__(self, page_url):
        super().__init__()
        urlres = urllib.parse.urlparse(page_url)
        self.page_url = page_url
        self.base_url = urlres.netloc
        self.folder = functions.get_folder_name(urlres.netloc)
        self.path = urlres.path
        self.src = set()

    def handle_starttag(self, tag, attrs):
        """Record the image URL of each ``<img>`` start tag.

        Called by HTMLParser for every opening tag encountered by feed().
        """
        if tag != 'img':
            return
        for attr, value in attrs:
            # A valueless attribute (<img src>) is reported with value None.
            if attr == 'src' and value:
                # Resolve against the full page URL: the bare host name has no
                # scheme, so joining against it leaves relative paths relative.
                self.src.add(urllib.parse.urljoin(self.page_url, value))

    def getSrc(self):
        """Return the set of image URLs collected so far."""
        return self.src

    def get_base_url(self):
        """Return the host part of the page URL."""
        return self.base_url

    def save_to_file(self):
        """Write the collected URLs, sorted, one per line, to a text file.

        The file is named after the page path inside folder_path().

        Returns:
            The name of the file that was written.
        """
        import os  # local import: only this method needs it

        file_name = self.folder_path() + self.path + '.txt'
        # A nested page path (e.g. /a/b) needs its parent directories.
        parent = os.path.dirname(file_name)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(file_name, 'w') as f:
            f.writelines(line + '\n' for line in sorted(self.src))
        return file_name

    def folder_path(self):
        """Return the storage directory used for this page's site."""
        return "storage/" + self.folder

SaveFile.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import urllib.request
2+
import os.path
3+
4+
5+
class SaveFile:
    """Download a single URL into a local file named after the URL's path."""

    def __init__(self, file_name, path=''):
        """Remember the URL and work out the local file name.

        Args:
            file_name: URL of the resource to download.
            path: Target directory; '' saves into the current directory.
        """
        import urllib.parse  # explicit: do not rely on urllib.request re-exports

        self.file_name = file_name
        # Only the path component is used, so query strings and fragments
        # never end up in the local file name.
        self.base_name = os.path.basename(urllib.parse.urlparse(file_name).path)
        self.path = path

    def save(self):
        """Retrieve the URL and store it under ``path``/``base_name``.

        Errors are printed rather than raised (best effort).

        Returns:
            True when the file was saved, False when the download failed.
        """
        if self.path:
            full_file_path = os.path.join(self.path, self.base_name)
        else:
            full_file_path = self.base_name
        try:
            urllib.request.urlretrieve(self.file_name, full_file_path)
        except Exception as e:
            print(str(e))
            return False
        return True

__pycache__/Download.cpython-36.pyc

1.47 KB
Binary file not shown.

__pycache__/ImgFinder.cpython-36.pyc

1.58 KB
Binary file not shown.

__pycache__/SaveFile.cpython-36.pyc

881 Bytes
Binary file not shown.

__pycache__/functions.cpython-36.pyc

1.25 KB
Binary file not shown.

functions.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import urllib.request
2+
import ImgFinder
3+
import urllib.parse
4+
import os.path
5+
6+
7+
def gather_img_src(page_url):
    """Return the set of image sources found on the page at *page_url*.

    The page is fetched with html_string() and parsed with an ImgFinder.
    Any error raised while doing so is printed and an empty set is returned.
    """
    try:
        markup = html_string(page_url)
        parser = ImgFinder.ImgFinder(page_url)
        parser.feed(markup)
    except Exception as err:
        print(str(err))
        return set()
    else:
        return parser.getSrc()
17+
18+
19+
def create_project_folder(page_url):
    """Create the ``storage/<site>`` directory for *page_url* if it is missing.

    The site folder name is derived from the URL's host by get_folder_name().
    """
    base_url = get_folder_name(urllib.parse.urlparse(page_url).netloc)
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first; an already existing directory is simply left untouched.
    os.makedirs("storage/" + base_url, exist_ok=True)
23+
24+
25+
def html_string(page_url):
    """Fetch *page_url* and return its body as text when it is an HTML page.

    Returns:
        The decoded page, or '' when the response is not ``text/html`` or when
        fetching/decoding fails (the error is printed, not raised).
    """
    html = ''
    try:
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(page_url) as response:
            headers = response.headers
            # get_content_type() ignores parameters such as "; charset=..."
            # and falls back to text/plain when the header is missing.
            if headers.get_content_type() == 'text/html':
                # Honour the charset the server declares; default to UTF-8.
                charset = headers.get_content_charset() or 'utf-8'
                html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Best effort: callers treat '' as "nothing to parse".
        print(str(e))
    return html
36+
37+
38+
def get_folder_name(base_url):
    """Derive a folder name from a host name.

    A three-part host such as ``www.example.com`` yields its middle part
    (``example``); any other host has its dots replaced by dashes.
    """
    parts = base_url.split(".")
    if len(parts) == 3:
        return parts[1]
    # str.join is a method of the separator, not of the list.
    return "-".join(parts)

main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import urllib.request
2+
import urllib.parse
3+
from functions import *
4+
import ImgFinder
5+
import Download
6+
7+
if __name__ == '__main__':
    # Page whose <img> sources are collected and then downloaded.
    PAGE_URL = 'http://www.fdfashionbd.com/gallarey'

    # The site's storage folder must exist before anything is written to it.
    create_project_folder(PAGE_URL)

    # Parse the page and write the discovered image URLs to a link file.
    parser = ImgFinder.ImgFinder(PAGE_URL)
    page_markup = html_string(PAGE_URL)
    parser.feed(page_markup)
    links_file = parser.save_to_file()

    # Download every link from that file into the site's storage folder.
    Download.Download(links_file, parser.folder_path()).start()

test.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import SaveFile
2+
3+
if __name__ == '__main__':
    # Manual smoke test: fetch one known image into the "storage" directory.
    LOGO_URL = 'https://www.google.com/images/branding/googlelogo/2x/googlelogo_color_272x92dp.png'
    TARGET_DIR = "storage"
    SaveFile.SaveFile(LOGO_URL, TARGET_DIR).save()

0 commit comments

Comments
 (0)