diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py
index aa35430e..f0570b93 100644
--- a/proxypool/crawlers/base.py
+++ b/proxypool/crawlers/base.py
@@ -1,32 +1,41 @@
+import asyncio
+import aiohttp
 from retrying import retry
-import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
 
 
 class BaseCrawler(object):
     urls = []
-    
+
+    def __init__(self):
+        self.loop = asyncio.get_event_loop()
+
     @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
-    def fetch(self, url, **kwargs):
+    async def fetch(self, session, url, **kwargs):
         try:
             kwargs.setdefault('timeout', GET_TIMEOUT)
-            kwargs.setdefault('verify', False)
-            response = requests.get(url, **kwargs)
-            if response.status_code == 200:
-                response.encoding = 'utf-8'
-                return response.text
-        except requests.ConnectionError:
+            async with session.get(url, **kwargs) as response:
+                if response.status == 200:
+                    response.encoding = 'utf-8'
+                    return await response.text()
+        except aiohttp.ClientConnectionError:
             return
-    
+
     @logger.catch
-    def crawl(self):
+    async def crawl(self):
         """
         crawl main method
         """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+        proxies = []
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+            return proxies
+
+    def run(self):
+        return self.loop.run_until_complete(self.crawl())
diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py
index 09a3ee45..b5671bc8 100644
--- a/proxypool/crawlers/public/daili66.py
+++ b/proxypool/crawlers/public/daili66.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = Daili66Crawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
diff --git a/proxypool/crawlers/public/data5u.py b/proxypool/crawlers/public/data5u.py
index e36bf664..59728046 100644
--- a/proxypool/crawlers/public/data5u.py
+++ b/proxypool/crawlers/public/data5u.py
@@ -1,3 +1,5 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
@@ -11,23 +13,23 @@ class Data5UCrawler(BaseCrawler):
     data5u crawler, http://www.data5u.com
     """
     urls = [BASE_URL]
-    
+
     headers = {
         'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
     }
 
     @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
+    async def crawl(self):
+        proxies = []
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+            return proxies
+
     def parse(self, html):
         """
         parse html file to get proxies
@@ -43,5 +45,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = Data5UCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
diff --git a/proxypool/crawlers/public/ip3366.py b/proxypool/crawlers/public/ip3366.py
index 78d29447..fcf3f710 100644
--- a/proxypool/crawlers/public/ip3366.py
+++ b/proxypool/crawlers/public/ip3366.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = IP3366Crawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
diff --git a/proxypool/crawlers/public/iphai.py b/proxypool/crawlers/public/iphai.py
index baa79834..d1786c8c 100644
--- a/proxypool/crawlers/public/iphai.py
+++ b/proxypool/crawlers/public/iphai.py
@@ -5,6 +5,7 @@
 
 BASE_URL = 'http://www.iphai.com/'
 
+
 class IPHaiCrawler(BaseCrawler):
     """
     iphai crawler, http://www.iphai.com/
@@ -28,8 +29,8 @@ def parse(self, html):
             proxy = Proxy(host=address.strip(), port=int(port.strip()))
             yield proxy
 
+
 if __name__ == '__main__':
     crawler = IPHaiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
diff --git a/proxypool/crawlers/public/kuaidaili.py b/proxypool/crawlers/public/kuaidaili.py
index f3fa6437..7557ac66 100644
--- a/proxypool/crawlers/public/kuaidaili.py
+++ b/proxypool/crawlers/public/kuaidaili.py
@@ -29,5 +29,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = KuaidailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
diff --git a/proxypool/crawlers/public/xicidaili.py b/proxypool/crawlers/public/xicidaili.py
index fdd2a317..537422a7 100644
--- a/proxypool/crawlers/public/xicidaili.py
+++ b/proxypool/crawlers/public/xicidaili.py
@@ -1,8 +1,11 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 from loguru import logger
 
+
 BASE_URL = 'https://www.xicidaili.com/'
 
 
@@ -18,17 +21,18 @@ class XicidailiCrawler(BaseCrawler):
     }
 
     @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-    
+    async def crawl(self):
+        proxies = []
+        async with aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    for proxy in self.parse(result):
+                        proxies.append(proxy)
+            return proxies
+
     def parse(self, html):
         """
         parse html file to get proxies
@@ -47,6 +51,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = XicidailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
diff --git a/proxypool/crawlers/public/xiladaili.py b/proxypool/crawlers/public/xiladaili.py
index 70a75ff1..15b3accd 100644
--- a/proxypool/crawlers/public/xiladaili.py
+++ b/proxypool/crawlers/public/xiladaili.py
@@ -28,5 +28,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = XiladailiCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
diff --git a/proxypool/crawlers/public/zhandaye.py b/proxypool/crawlers/public/zhandaye.py
index b6278a28..448c72ee 100755
--- a/proxypool/crawlers/public/zhandaye.py
+++ b/proxypool/crawlers/public/zhandaye.py
@@ -1,13 +1,15 @@
+import asyncio
+import aiohttp
 from pyquery import PyQuery as pq
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
-from loguru import logger
 import re
 
 
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
 MAX_PAGE = 5
 
+
 class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -19,25 +21,27 @@ class ZhandayeCrawler(BaseCrawler):
     urls = []
     ignore = True
 
-    def crawl(self):
-        self.crawl_catalog()
-        yield from super().crawl()
+    async def crawl(self):
+        await self.crawl_catalog()
+        return await super().crawl()
 
-    def crawl_catalog(self):
-        for url in self.urls_catalog:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            self.parse_catalog(html)
+    async def crawl_catalog(self):
+        async with aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(ssl=False)) as session:
+            tasks = [self.fetch(session, url, headers=self.headers) for url in self.urls_catalog]
+            results = await asyncio.gather(*tasks)
+            for result in results:
+                if result:
+                    self.parse_catalog(result)
 
     def parse_catalog(self, html):
         """
-        parse html file to get proxies
+        parse catalog html to get detail urls
         :return:
         """
         doc = pq(html)
         for item in doc('#J_posts_list .thread_item div div p a').items():
             url = 'https://www.zdaye.com' + item.attr('href')
-            logger.info(f'get detail url: {url}')
             self.urls.append(url)
 
     def parse(self, html):
@@ -54,6 +58,5 @@ def parse(self, html):
 
 if __name__ == '__main__':
     crawler = ZhandayeCrawler()
-    for proxy in crawler.crawl():
+    for proxy in crawler.run():
         print(proxy)
-
diff --git a/proxypool/processors/getter.py b/proxypool/processors/getter.py
index 1a1d5261..1a1497ea 100644
--- a/proxypool/processors/getter.py
+++ b/proxypool/processors/getter.py
@@ -8,7 +8,7 @@ class Getter(object):
     """
     getter of proxypool
     """
-    
+
     def __init__(self):
         """
         init db and crawlers
@@ -16,14 +16,14 @@ def __init__(self):
         self.redis = RedisClient()
         self.crawlers_cls = crawlers_cls
         self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
-    
+
     def is_full(self):
         """
         if proxypool if full
         return: bool
         """
         return self.redis.count() >= PROXY_NUMBER_MAX
-    
+
     @logger.catch
     def run(self):
         """
@@ -34,8 +34,13 @@ def run(self):
             return
         for crawler in self.crawlers:
             logger.info(f'crawler {crawler} to get proxy')
-            for proxy in crawler.crawl():
-                self.redis.add(proxy)
+            proxies = crawler.run()
+            if proxies:
+                for proxy in proxies:
+                    self.redis.add(proxy)
+                logger.info(f'crawled {len(proxies)} proxies from {crawler}')
+            else:
+                logger.debug(f'cannot crawl proxies from {crawler}')
 
 
 if __name__ == '__main__':
diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py
index e0812110..5387c5fa 100644
--- a/proxypool/processors/tester.py
+++ b/proxypool/processors/tester.py
@@ -23,49 +23,48 @@ class Tester(object):
     """
     tester for testing proxies in queue
     """
-    
+
     def __init__(self):
         """
         init redis
         """
         self.redis = RedisClient()
         self.loop = asyncio.get_event_loop()
-    
-    async def test(self, proxy: Proxy):
+
+    async def test(self, session, proxy: Proxy):
         """
         test single proxy
         :param proxy: Proxy object
         :return:
         """
-        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
-            try:
-                logger.debug(f'testing {proxy.string()}')
-                # if TEST_ANONYMOUS is True, make sure that
-                # the proxy has the effect of hiding the real IP
-                if TEST_ANONYMOUS:
-                    url = 'https://httpbin.org/ip'
-                    async with session.get(url, timeout=TEST_TIMEOUT) as response:
-                        resp_json = await response.json()
-                        origin_ip = resp_json['origin']
-                    async with session.get(url, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT) as response:
-                        resp_json = await response.json()
-                        anonymous_ip = resp_json['origin']
-                    assert origin_ip != anonymous_ip
-                    assert proxy.host == anonymous_ip
-                async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT,
-                                       allow_redirects=False) as response:
-                    if response.status in TEST_VALID_STATUS:
-                        self.redis.max(proxy)
-                        logger.debug(f'proxy {proxy.string()} is valid, set max score')
-                    else:
-                        self.redis.decrease(proxy)
-                        logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
-            except EXCEPTIONS:
-                self.redis.decrease(proxy)
-                logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
-    
+        try:
+            logger.debug(f'testing {proxy.string()}')
+            # if TEST_ANONYMOUS is True, make sure that
+            # the proxy has the effect of hiding the real IP
+            if TEST_ANONYMOUS:
+                url = 'https://httpbin.org/ip'
+                async with session.get(url, timeout=TEST_TIMEOUT) as response:
+                    resp_json = await response.json()
+                    origin_ip = resp_json['origin']
+                async with session.get(url, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT) as response:
+                    resp_json = await response.json()
+                    anonymous_ip = resp_json['origin']
+                assert origin_ip != anonymous_ip
+                assert proxy.host == anonymous_ip
+            async with session.get(TEST_URL, proxy=f'http://{proxy.string()}', timeout=TEST_TIMEOUT,
+                                   allow_redirects=False) as response:
+                if response.status in TEST_VALID_STATUS:
+                    self.redis.max(proxy)
+                    logger.debug(f'proxy {proxy.string()} is valid, set max score')
+                else:
+                    self.redis.decrease(proxy)
+                    logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
+        except EXCEPTIONS:
+            self.redis.decrease(proxy)
+            logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
+
     @logger.catch
-    def run(self):
+    async def main(self):
         """
         test main method
         :return:
@@ -75,14 +74,18 @@ def run(self):
         count = self.redis.count()
         logger.debug(f'{count} proxies to test')
         cursor = 0
-        while True:
-            logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
-            cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
-            if proxies:
-                tasks = [self.test(proxy) for proxy in proxies]
-                self.loop.run_until_complete(asyncio.wait(tasks))
-            if not cursor:
-                break
+        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
+            while True:
+                logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
+                cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
+                if proxies:
+                    tasks = [self.test(session, proxy) for proxy in proxies]
+                    await asyncio.gather(*tasks)
+                if not cursor:
+                    break
+
+    def run(self):
+        self.loop.run_until_complete(self.main())
 
 
 if __name__ == '__main__':