diff --git a/ImgFinder.py b/ImgFinder.py index e7a3f2d..148d7fb 100644 --- a/ImgFinder.py +++ b/ImgFinder.py @@ -9,7 +9,8 @@ def __init__(self, page_url): self.page_url = page_url self.base_url = urlres.netloc self.folder = functions.get_folder_name(urlres.netloc) - self.path = urlres.path + self.path = urlres.path.replace("/", "_") + self.scheme = urlres.scheme self.src = set() HTMLParser.__init__(self) @@ -22,7 +23,7 @@ def handle_starttag(self, tag, attrs): if tag == 'img': for (attr, value) in attrs: if attr == 'src': - fullUrl = urllib.parse.urljoin(self.base_url, value) + fullUrl = urllib.parse.urljoin(self.scheme + "://" + self.base_url, value) self.src.add(fullUrl) else: continue @@ -46,7 +47,7 @@ def save_to_file(self) -> object: Save waiting downloadable image to queue. So next time when program run :rtype: object """ - file_name = self.folder_path() + self.path + '.txt' + file_name = self.folder_path() + "/" + self.path + '.txt' with open(file_name, 'w') as f: for line in sorted(self.src): f.write(line + '\n') diff --git a/__pycache__/ImgFinder.cpython-36.pyc b/__pycache__/ImgFinder.cpython-36.pyc index 58f53fa..5be95e2 100644 Binary files a/__pycache__/ImgFinder.cpython-36.pyc and b/__pycache__/ImgFinder.cpython-36.pyc differ diff --git a/__pycache__/functions.cpython-36.pyc b/__pycache__/functions.cpython-36.pyc index 3c963b4..2759be3 100644 Binary files a/__pycache__/functions.cpython-36.pyc and b/__pycache__/functions.cpython-36.pyc differ diff --git a/functions.py b/functions.py index d54e194..bef45f3 100644 --- a/functions.py +++ b/functions.py @@ -32,6 +32,7 @@ def create_project_folder(page_url: object) -> object: os.makedirs("storage/" + base_url) +# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 def html_string(page_url: object) -> object: """ Fetch html from url and return as Html String @@ -40,7 +41,9 @@ def html_string(page_url: object) -> object: """ html_string = '' try: - response = urllib.request.urlopen(page_url) + request = urllib.request.Request(page_url, headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) + response = urllib.request.urlopen(request) if 'text/html' in response.getheader('Content-Type'): html_bytes = response.read() html_string = html_bytes.decode("utf-8") @@ -59,5 +62,7 @@ def get_folder_name(base_url) -> object: parts = base_url.split(".") if len(parts) == 3: return parts[1] + elif len(parts) == 2: + return parts[0] else: - return parts.join("-") + return "-".join(parts) diff --git a/main.py b/main.py index 59881ed..ebfe7a8 100644 --- a/main.py +++ b/main.py @@ -5,13 +5,13 @@ import Download if __name__ == '__main__': - PAGE_URL = 'http://www.fdfashionbd.com/gallarey' + PAGE_URL = 'https://gopostie.com/how-it-works' # Create the project folder into storage folder create_project_folder(PAGE_URL) # Find images source and save it to project folder finder = ImgFinder.ImgFinder(PAGE_URL) finder.feed(html_string(PAGE_URL)) file_name = finder.save_to_file() - # start downloading images + #start downloading images down = Download.Download(file_name, finder.folder_path()) down.start()