diff --git a/examples/usage2.py b/examples/usage2.py
index 918c5eb2..95be78f1 100644
--- a/examples/usage2.py
+++ b/examples/usage2.py
@@ -39,14 +39,15 @@ def run(self):
         # Suppress the warning triggered by disabling certificate verification
         urllib3.disable_warnings()
         headers = Headers(headers=True).generate()
-        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
+        # headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
         headers['Pragma'] = 'no-cache'
-        headers['Host'] = 'bb.cf08tp.cn'
-        headers['x-forward-for'] = pure_ip_address
-        headers['Cookie'] = 'PHPSESSID={}'.format(''.join(str(uuid.uuid1()).split('-')))
-        print(headers)
+        # headers['Host'] = 'bb.cf08tp.cn'
+        # headers['x-forward-for'] = pure_ip_address
+        headers['Cookie'] = 'PHPSESSID={}'.format(
+            ''.join(str(uuid.uuid1()).split('-')))
+        # print(headers)
         html = requests.get(headers=headers, url=targetUrl, proxies={
-            "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
+            "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
         # Stop the timer
         end = time.time()
         # Print the result
diff --git a/examples/usage3.py b/examples/usage3.py
new file mode 100644
index 00000000..67b636f7
--- /dev/null
+++ b/examples/usage3.py
@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+
+'''
+Fetch proxy IPs from the local proxy-pool API and use each one to
+request the target URL from a crawler thread.
+'''
+import requests
+import time
+import threading
+import urllib3
+from fake_headers import Headers
+from geolite2 import geolite2
+
+ips = []
+
+
+def getChinaIP(ip='127.0.0.1'):
+    """Return True if the IP geolocates to mainland China (CN)."""
+    reader = geolite2.reader()
+    ip_info = reader.get(ip)
+    geolite2.close()
+    # reader.get() returns None for private or unknown addresses
+    return bool(ip_info) and ip_info.get('country', {}).get('iso_code') == 'CN'
+
+
+# Thread that crawls the target through one proxy
+class CrawlThread(threading.Thread):
+    def __init__(self, proxyip):
+        super(CrawlThread, self).__init__()
+        self.proxyip = proxyip
+
+    def run(self):
+        pure_ip_address = self.proxyip.split(':')[0]
+        # Check where the IP is registered
+        if not getChinaIP(pure_ip_address):
+            pass
+            # raise ValueError('not a China IP')
+        # Start the timer
+        start = time.time()
+        # Suppress the warning triggered by disabling certificate verification
+        urllib3.disable_warnings()
+        headers = Headers(headers=True).generate()
+        headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
+        headers['Pragma'] = 'no-cache'
+        # headers['Host'] = 'ga.314300.cn'
+        # headers['x-forward-for'] = pure_ip_address
+        headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
+        # print(headers)
+        headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3'
+        html = requests.get(headers=headers, url=targetUrl, proxies={
+            "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
+        # Stop the timer
+        end = time.time()
+        # Print the result
+        print(threading.current_thread().name + " via proxy took " + str(end - start)
+              + " s, " + self.proxyip + " returned the following HTML:\n" + html + "\n*************")
+
+
+# Thread that periodically fetches proxy IPs
+class GetIpThread(threading.Thread):
+    def __init__(self, fetchSecond):
+        super(GetIpThread, self).__init__()
+        self.fetchSecond = fetchSecond
+
+    def run(self):
+        global ips
+        while True:
+            # Fetch the IP list
+            res = requests.get(apiUrl).content.decode()
+            # Split the response on newlines
+            ips = res.split('\n')
+            # Use every IP in turn
+            for proxyip in ips:
+                if proxyip.strip():
+                    # Run inline; switch to .start() for one thread per proxy
+                    # CrawlThread(proxyip).start()
+                    try:
+                        CrawlThread(proxyip).run()
+                        time.sleep(1.5)
+                    except Exception as e:
+                        print(e)
+            # Sleep before the next fetch
+            time.sleep(len(ips) / self.fetchSecond)
+
+
+if __name__ == '__main__':
+    # Proxy-pool API endpoint
+    # apiUrl = "http://127.0.0.1:5555/all"
+    apiUrl = "http://127.0.0.1:5555/random"
+    # Target URL to crawl
+    # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp'
+    fetchSecond = 5
+    # Start fetching IPs automatically
+    GetIpThread(fetchSecond).start()
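Note on `getChinaIP` in usage3.py: it opens and closes the GeoLite2 reader on every call, which rereads the database for each lookup. A minimal sketch of a reusable-reader variant, assuming the same `maxminddb`-backed `geolite2` package the script already imports (the name `is_china_ip` is ours, not part of the PR):

```python
from geolite2 import geolite2

# Open the GeoLite2 database once and reuse it across lookups.
_reader = geolite2.reader()


def is_china_ip(ip: str) -> bool:
    """Return True if the IP geolocates to mainland China (CN)."""
    ip_info = _reader.get(ip)  # None for private/unknown addresses
    return bool(ip_info) and ip_info.get('country', {}).get('iso_code') == 'CN'
```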
"http://127.0.0.1:5555/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/proxypool/crawlers/public/fanqieip.py b/proxypool/crawlers/public/fanqieip.py new file mode 100644 index 00000000..01e7b7a9 --- /dev/null +++ b/proxypool/crawlers/public/fanqieip.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from pyquery import PyQuery as pq + +BaseUrl = 'https://www.fanqieip.com/free/{num}' +MAX_PAGE = 5 * 100 + + +class FanqieIPCrawler(BaseCrawler): + """ + FanqieIP crawler, https://www.fanqieip.com + """ + urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.layui-table tbody tr ').items() + for tr in trs: + host = tr.find('td div')[0].text + port = tr.find('td div')[1].text + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = FanqieIPCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/taiyangdaili.py b/proxypool/crawlers/public/taiyangdaili.py index 7a48cb43..bff823d0 100644 --- a/proxypool/crawlers/public/taiyangdaili.py +++ b/proxypool/crawlers/public/taiyangdaili.py @@ -3,7 +3,7 @@ from pyquery import PyQuery as pq BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' -MAX_PAGE = 5 +MAX_PAGE = 5 * 2 class TaiyangdailiCrawler(BaseCrawler): diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index d3edd70d..785bbf4f 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,6 +1,6 @@ -from flask import Flask, g +from flask import Flask, g , request , jsonify from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED +from proxypool.setting import API_HOST, API_PORT, API_THREADED,PROXY_SCORE_MIN, PROXY_SCORE_MAX __all__ = ['app'] @@ -40,11 +40,12 @@ def get_proxy(): @app.route('/all') def get_proxy_all(): """ - get a random proxy - :return: get a random proxy + get proxy by min_score to max_score + :return: proxies list """ + args = request.args conn = get_conn() - proxies = conn.all() + proxies = conn.all(args.get('min_score',PROXY_SCORE_MIN),args.get('max_score',PROXY_SCORE_MAX)) proxies_string = '' for proxy in proxies: proxies_string += str(proxy) + '\n' diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index f002056a..168e2f27 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -6,7 +6,7 @@ from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError from asyncio import TimeoutError - +import requests EXCEPTIONS = ( ClientProxyConnectionError, @@ -43,7 +43,7 @@ async def test(self, proxy: Proxy): # if TEST_ANONYMOUS is True, make sure that # the proxy has the effect of hiding the real IP if TEST_ANONYMOUS: - url = 'https://httpbin.org/ip' + url = 'http://www.nghttp2.org/httpbin/ip' async with session.get(url, timeout=TEST_TIMEOUT) as response: resp_json = await response.json() origin_ip = resp_json['origin'] diff --git a/proxypool/schemas/proxy.py b/proxypool/schemas/proxy.py index 
8be3fb34..84323e81 100644
--- a/proxypool/schemas/proxy.py
+++ b/proxypool/schemas/proxy.py
@@ -8,6 +8,12 @@ class Proxy(object):
     """
     host = attr(type=str, default=None)
     port = attr(type=int, default=None)
+    location = attr(type=str, default=None)
+    isp = attr(type=str, default=None)
+    country = attr(type=str, default=None)
+    anonymous = attr(type=bool, default=None)
+    protocol = attr(type=str, default=None)
+    alive_time = attr(type=int, default=None)
 
     def __str__(self):
         """
diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py
index 0ebbccc2..3570cc1e 100644
--- a/proxypool/storages/redis.py
+++ b/proxypool/storages/redis.py
@@ -103,12 +103,12 @@ def count(self) -> int:
         """
         return self.db.zcard(REDIS_KEY)
 
-    def all(self) -> List[Proxy]:
+    def all(self, min_score=PROXY_SCORE_MIN, max_score=PROXY_SCORE_MAX) -> List[Proxy]:
         """
-        get all proxies
+        get proxies whose scores fall in [min_score, max_score]
         :return: list of proxies
         """
-        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
+        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, min_score, max_score))
 
     def batch(self, cursor, count) -> List[Proxy]:
         """
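The new `Proxy` fields are all optional, so existing call sites that only pass `host` and `port` are unaffected. A small illustration (the field values here are made up):

```python
from proxypool.schemas.proxy import Proxy

# Old-style construction keeps working: every new field defaults to None.
basic = Proxy(host='127.0.0.1', port=8080)

# Crawlers that know extra metadata can now attach it.
enriched = Proxy(host='127.0.0.1', port=8080, country='CN',
                 protocol='http', anonymous=True, alive_time=3600)

print(basic)             # __str__ still renders host:port
print(enriched.country)  # 'CN'
```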