From cadb8420d7b3175f8496e4235b676dde6fca2a1e Mon Sep 17 00:00:00 2001 From: Tuhin Bepari Date: Wed, 22 Nov 2017 19:56:18 +0600 Subject: [PATCH] fixing erros and improving --- ImgFinder.py | 7 ++++--- __pycache__/ImgFinder.cpython-36.pyc | Bin 2242 -> 2312 bytes __pycache__/functions.cpython-36.pyc | Bin 1735 -> 1934 bytes functions.py | 9 +++++++-- main.py | 4 ++-- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ImgFinder.py b/ImgFinder.py index e7a3f2d..148d7fb 100644 --- a/ImgFinder.py +++ b/ImgFinder.py @@ -9,7 +9,8 @@ def __init__(self, page_url): self.page_url = page_url self.base_url = urlres.netloc self.folder = functions.get_folder_name(urlres.netloc) - self.path = urlres.path + self.path = urlres.path.replace("/", "_") + self.scheme = urlres.scheme self.src = set() HTMLParser.__init__(self) @@ -22,7 +23,7 @@ def handle_starttag(self, tag, attrs): if tag == 'img': for (attr, value) in attrs: if attr == 'src': - fullUrl = urllib.parse.urljoin(self.base_url, value) + fullUrl = urllib.parse.urljoin(self.scheme + "://" + self.base_url, value) self.src.add(fullUrl) else: continue @@ -46,7 +47,7 @@ def save_to_file(self) -> object: Save waiting downloadable image to queue. So next time when program run :rtype: object """ - file_name = self.folder_path() + self.path + '.txt' + file_name = self.folder_path() + "/" + self.path + '.txt' with open(file_name, 'w') as f: for line in sorted(self.src): f.write(line + '\n') diff --git a/__pycache__/ImgFinder.cpython-36.pyc b/__pycache__/ImgFinder.cpython-36.pyc index 58f53fa73b96cc01827214eaa6a1609fab7b5fd7..5be95e22beddd88bdd22352f769f725261580972 100644 GIT binary patch delta 816 zcmZ`%O=}ZD7@paW$tKx+RGXwI*7`wVEjEfswTcJrMJ%DVV5A`HHZy6mZnAN9YD?ML zLoR}+Iw$cL6tA8=*nctLFYw^qgZR!CRHU-(^X|U$ykE~dUnbt1%HJrJ3Ut4=et3}o zRxlAl58zFn9QN1P>xh}Y#;JU2B7&YZ4T1wh!v1UtBlt8&_C zPOg9}#T36V`0~#E)L25#7G+>eU7=mYi#V<@NI`AwRJx)%lc^}YC&JNd~dv7 zI1PDr&BbQ3<>femAFfNP9P|>#{UM*wuA&CbRAxY_M|e#xnX}jH+MPi|pH+InkVec; zM8JgzHg#y?0wALMcJLYb!F+c~=kYMFfjJKTlYwk9vBo>+<%;#$(?$QobAjh?1PSw@ z%~`o%-gN* z7G&AA=EaUscw6B@%Ok4TjU&pqKMF*TSAd|bc*KUVznBX~k9D!Pd0#Nk%EnUpZXa|Q z1_P#WS6NYi$o?JGO@dd<7qPFYS0QUsKsFe=E5LZ7=C9mnI>yC-Cfaw?%EYmi66A$WzQ-VQ5Ljdr6@7mnYJ6(#7@Q!Fe}Jj zym+W{_6PVUJb3QSi{vjTc=sT_lY#|52=lz*{hH@_=1u2Q+rQjwHny`P>vwnkuMG+S zSMd}pyFZr}ONf2lm;uH^I!K!(%m@Jis}PZ46q*rvXg-sZtvQ@n=TFr{$>Ia3j$!ot z(H16cp$HLx1cVchgyuS3y0wCdk_hsMsw*!BCA|g@mhaQ4Gi9YCC}u+qDM@JwsZ+8U zox1qPQnz&N{X3!k)<8JOgPA@0>f$^u97bF*jLJo*=8#RAk2B@ll&x1zA_Vrd&1>h( zrH)P4aM;%7x06gvWL)S(X`O6Y@Upz_*86(5fM!SRV5W7(zSff+200PgHXo?n5v*-4XI}5e(JLlE_WwYCG z2kxPPDBm@VEAx`~MonB&vHXe5DqiGDOR=xoeXhn@Myw`X`I}=@nI(^9MK&uo8E5iO zAwiKmld;ZY+j;@F77YY&JLU)f-nX9oTbb%(X<&Ts#!c&*xA=DoMPSU;;}QP=+(G~c}LFCk5k9smFU diff --git a/__pycache__/functions.cpython-36.pyc b/__pycache__/functions.cpython-36.pyc index 3c963b4afc7535adb14cf597208dc2724cd457d9..2759be308112aa16fc1bdc91c3b5c7814a01f8ee 100644 GIT binary patch delta 446 zcmXX?(P|Sx6uozLW_G(I8iESG$Q1gJfNr{*wh0JgsaT=yi%}ANi_X++HYRm;D;j65 zvf!JgxL?w*=vzM^f6#tKC&hSZ{vv}hj)?(Dq!Xj0e@RYmJ;#s&b4N1cjH%=WCS1R?niY^!L&sdY_Ktm+1{5+k z@%2}PoFUBXnU}2?R$ze$eRv1blU^-nKiL4LWB>+x&|LXP<%$0`Rr<;HCpFFHhX=EH zJedqzo58wh48riK*pH{j-$cCC-D-;MqoaxHs}Fl|*4k`$g0LeRd#~Oc>^~NhI8kC( zeN1Leu`|-M&#Dz|1nWWA4x=FIh~DsUsN?@%OR@X8#wyvZJ(UC#P9zqW! ox5HZ3D%Upl$`Sj+ugEg7yk^g8-DP0qHqgfkdgTT`b1JoaH#YZX%m4rY delta 253 zcmW-bu}T9$5Qb-VcQ%*R)8jy}^srG8kw`$&<`GgPrisWF;Q}YD*BE!TIIt99pCtGY zzJvH4DbhG=@eecn{QrC(hu^um$#b)8j^>r#m>2m*08p^TtfwUpivgdnTd1)X9y$!d z3NLDDWrr*62E3~dg7mT`tJudWG*=2*y)y0rAt;1(+4kDZv)duU6z;&1rJ!Eb3>q;5 zj}ti8>xDUq$Lc object: os.makedirs("storage/" + base_url) +# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 def html_string(page_url: object) -> object: """ Fetch html from url and return as Html String @@ -40,7 +41,9 @@ def html_string(page_url: object) -> object: """ html_string = '' try: - response = urllib.request.urlopen(page_url) + request = urllib.request.Request(page_url, headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) + response = urllib.request.urlopen(request) if 'text/html' in response.getheader('Content-Type'): html_bytes = response.read() html_string = html_bytes.decode("utf-8") @@ -59,5 +62,7 @@ def get_folder_name(base_url) -> object: parts = base_url.split(".") if len(parts) == 3: return parts[1] + elif len(parts) == 2: + return parts[0] else: - return parts.join("-") + return "-".join(parts) diff --git a/main.py b/main.py index 59881ed..ebfe7a8 100644 --- a/main.py +++ b/main.py @@ -5,13 +5,13 @@ import Download if __name__ == '__main__': - PAGE_URL = 'http://www.fdfashionbd.com/gallarey' + PAGE_URL = 'https://gopostie.com/how-it-works' # Create the project folder into storage folder create_project_folder(PAGE_URL) # Find images source and save it to project folder finder = ImgFinder.ImgFinder(PAGE_URL) finder.feed(html_string(PAGE_URL)) file_name = finder.save_to_file() - # start downloading images + #start downloading images down = Download.Download(file_name, finder.folder_path()) down.start()