Skip to content

Commit

Permalink
fixing errors and improving
Browse files Browse the repository at this point in the history
  • Loading branch information
digitaldreams committed Nov 22, 2017
1 parent 91552a9 commit cadb842
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 7 deletions.
7 changes: 4 additions & 3 deletions ImgFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def __init__(self, page_url):
self.page_url = page_url
self.base_url = urlres.netloc
self.folder = functions.get_folder_name(urlres.netloc)
self.path = urlres.path
self.path = urlres.path.replace("/", "_")
self.scheme = urlres.scheme
self.src = set()
HTMLParser.__init__(self)

Expand All @@ -22,7 +23,7 @@ def handle_starttag(self, tag, attrs):
if tag == 'img':
for (attr, value) in attrs:
if attr == 'src':
fullUrl = urllib.parse.urljoin(self.base_url, value)
fullUrl = urllib.parse.urljoin(self.scheme + "://" + self.base_url, value)
self.src.add(fullUrl)
else:
continue
Expand All @@ -46,7 +47,7 @@ def save_to_file(self) -> object:
Save the image URLs waiting to be downloaded to a queue file, so they are available the next time the program runs.
:rtype: object
"""
file_name = self.folder_path() + self.path + '.txt'
file_name = self.folder_path() + "/" + self.path + '.txt'
with open(file_name, 'w') as f:
for line in sorted(self.src):
f.write(line + '\n')
Expand Down
Binary file modified __pycache__/ImgFinder.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/functions.cpython-36.pyc
Binary file not shown.
9 changes: 7 additions & 2 deletions functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def create_project_folder(page_url: object) -> object:
os.makedirs("storage/" + base_url)


# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
def html_string(page_url: object) -> object:
"""
Fetch html from url and return as Html String
Expand All @@ -40,7 +41,9 @@ def html_string(page_url: object) -> object:
"""
html_string = ''
try:
response = urllib.request.urlopen(page_url)
request = urllib.request.Request(page_url, headers={
"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
response = urllib.request.urlopen(request)
if 'text/html' in response.getheader('Content-Type'):
html_bytes = response.read()
html_string = html_bytes.decode("utf-8")
Expand All @@ -59,5 +62,7 @@ def get_folder_name(base_url) -> object:
parts = base_url.split(".")
if len(parts) == 3:
return parts[1]
elif len(parts) == 2:
return parts[0]
else:
return parts.join("-")
return "-".join(parts)
4 changes: 2 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
import Download

if __name__ == '__main__':
PAGE_URL = 'http://www.fdfashionbd.com/gallarey'
PAGE_URL = 'https://gopostie.com/how-it-works'
# Create the project folder into storage folder
create_project_folder(PAGE_URL)
# Find images source and save it to project folder
finder = ImgFinder.ImgFinder(PAGE_URL)
finder.feed(html_string(PAGE_URL))
file_name = finder.save_to_file()
# start downloading images
#start downloading images
down = Download.Download(file_name, finder.folder_path())
down.start()

0 comments on commit cadb842

Please sign in to comment.