Commit 556b320

Author: jy (committed)
1. Optimize the tester query; 2. Improve the /all endpoint; 3. Add taiyangdaili; 4. Optimize the usage examples
1 parent 1bf6371 commit 556b320

File tree: 10 files changed, +158 -25 lines

Dockerfile (+2 -2)

@@ -1,7 +1,7 @@
 FROM python:3.6
 WORKDIR /app
 COPY . .
-# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
-RUN pip install -r requirements.txt -i
+RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
+# RUN pip install -r requirements.txt -i
 VOLUME ["/app/proxypool/crawlers/private"]
 CMD ["supervisord", "-c", "supervisord.conf"]

docker-compose.yml (+1 -1)

@@ -5,7 +5,7 @@ services:
     container_name: redis4proxypool
     command: redis-server
     ports:
-      - "6379:6379"
+      - "6378:6379"
     # restart: always
   proxypool:
     build: .

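The host-side Redis port is remapped to 6378 while the container keeps listening on 6379, so other services inside the compose network are unaffected. A minimal sanity check from the host, assuming redis-py is installed and the stack is up (the localhost address and empty password are assumptions, not part of this commit):

# Sketch only: verify the remapped host port.
import redis

client = redis.StrictRedis(host='localhost', port=6378)
print(client.ping())  # True if the remapped port is reachable
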
examples/usage2.py (+10 -10)

@@ -17,7 +17,7 @@ def getChinaIP(ip='127.0.0.1'):
     reader = geolite2.reader()
     ip_info = reader.get(ip)
     geolite2.close()
-    print(ip_info)
+    # print(ip_info)
     return True if ip_info['country']['iso_code'] == 'CN' else False


@@ -32,22 +32,22 @@ def run(self):
         pure_ip_address = self.proxyip.split(':')[0]
         # verify where the IP is located
         if not getChinaIP(pure_ip_address):
-            # pass
-            raise ValueError('not a valid IP')
+            pass
+            # raise ValueError('not a valid IP')
         #
         start = time.time()
         # suppress the warning caused by disabling certificate verification
         urllib3.disable_warnings()
         headers = Headers(headers=True).generate()
-        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
+        # headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
         headers['Pragma'] = 'no-cache'
-        headers['Host'] = 'bb.cf08tp.cn'
-        headers['x-forward-for'] = pure_ip_address
+        # headers['Host'] = 'bb.cf08tp.cn'
+        # headers['x-forward-for'] = pure_ip_address
         headers['Cookie'] = 'PHPSESSID={}'.format(
             ''.join(str(uuid.uuid1()).split('-')))
-        print(headers)
+        # print(headers)
         html = requests.get(headers=headers, url=targetUrl, proxies={
-            "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
+            "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
         # stop timing
         end = time.time()
         # print the result
@@ -88,8 +88,8 @@ def run(self):
     # apiUrl = "http://127.0.0.1:5555/all"
     apiUrl = "http://127.0.0.1:5555/random"
     # the target site to crawl
-    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
-    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
+    # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    targetUrl = 'http://www.so.com'
     fetchSecond = 5
     # start fetching IPs automatically
     GetIpThread(fetchSecond).start()

examples/usage3.py (+95, new file)

@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+
+'''
+'''
+import requests
+import time
+import threading
+import urllib3
+from fake_headers import Headers
+import uuid
+from geolite2 import geolite2
+ips = []
+
+# thread class that crawls data
+
+def getChinaIP(ip='127.0.0.1'):
+    reader = geolite2.reader()
+    ip_info = reader.get(ip)
+    geolite2.close()
+    # print(ip_info)
+    return True if ip_info['country']['iso_code'] == 'CN' else False
+
+
+
+class CrawlThread(threading.Thread):
+    def __init__(self, proxyip):
+        super(CrawlThread, self).__init__()
+        self.proxyip = proxyip
+
+    def run(self):
+        # start timing
+        pure_ip_address = self.proxyip.split(':')[0]
+        # verify where the IP is located
+        if not getChinaIP(pure_ip_address):
+            pass
+            # raise ValueError('not a valid IP')
+        #
+        start = time.time()
+        # suppress the warning caused by disabling certificate verification
+        urllib3.disable_warnings()
+        headers = Headers(headers=True).generate()
+        headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
+        headers['Pragma'] = 'no-cache'
+        # headers['Host'] = 'ga.314300.cn'
+        # headers['x-forward-for'] = pure_ip_address
+        headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
+        # print(headers)
+        headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3'
+        html = requests.get(headers=headers, url=targetUrl, proxies={
+            "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode()
+        # stop timing
+        end = time.time()
+        # print the result
+        print(threading.current_thread().getName() + " via proxy IP, took " + str(end - start) +
+              " seconds, " + self.proxyip + " fetched the following HTML:\n" + html + "\n*************")
+
+# thread class that fetches proxy IPs
+
+
+class GetIpThread(threading.Thread):
+    def __init__(self, fetchSecond):
+        super(GetIpThread, self).__init__()
+        self.fetchSecond = fetchSecond
+
+    def run(self):
+        global ips
+        while True:
+            # fetch the IP list
+            res = requests.get(apiUrl).content.decode()
+            # split the response on \n to get individual IPs
+            ips = res.split('\n')
+            # use each IP
+            for proxyip in ips:
+                if proxyip.strip():
+                    # start a thread
+                    # CrawlThread(proxyip).start()
+                    try:
+                        CrawlThread(proxyip).run()
+                        time.sleep(1.5)
+                    except Exception as e:
+                        print(e)
+            # sleep
+            time.sleep(len(ips) / self.fetchSecond)
+
+
+if __name__ == '__main__':
+    # API endpoint that returns proxy IPs
+    # apiUrl = "http://127.0.0.1:5555/all"
+    apiUrl = "http://127.0.0.1:5555/random"
+    # the target site to crawl
+    # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp'
+    fetchSecond = 5
+    # start fetching IPs automatically
+    GetIpThread(fetchSecond).start()

proxypool/crawlers/public/fanqieip.py (+31, new file)

@@ -0,0 +1,31 @@
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+from pyquery import PyQuery as pq
+
+BaseUrl = 'https://www.fanqieip.com/free/{num}'
+MAX_PAGE = 5 * 100
+
+
+class FanqieIPCrawler(BaseCrawler):
+    """
+    FanqieIP crawler, https://www.fanqieip.com
+    """
+    urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE)]
+
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        doc = pq(html)
+        trs = doc('.layui-table tbody tr ').items()
+        for tr in trs:
+            host = tr.find('td div')[0].text
+            port = tr.find('td div')[1].text
+            yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = FanqieIPCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)

proxypool/crawlers/public/taiyangdaili.py (+1 -1)

@@ -3,7 +3,7 @@
 from pyquery import PyQuery as pq

 BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
-MAX_PAGE = 5
+MAX_PAGE = 5 * 2


 class TaiyangdailiCrawler(BaseCrawler):

proxypool/processors/server.py (+6 -5)

@@ -1,6 +1,6 @@
-from flask import Flask, g
+from flask import Flask, g, request, jsonify
 from proxypool.storages.redis import RedisClient
-from proxypool.setting import API_HOST, API_PORT, API_THREADED
+from proxypool.setting import API_HOST, API_PORT, API_THREADED, PROXY_SCORE_MIN, PROXY_SCORE_MAX


 __all__ = ['app']
@@ -40,11 +40,12 @@ def get_proxy():
 @app.route('/all')
 def get_proxy_all():
     """
-    get a random proxy
-    :return: get a random proxy
+    get proxies within the [min_score, max_score] range
+    :return: proxies list
     """
+    args = request.args
     conn = get_conn()
-    proxies = conn.all()
+    proxies = conn.all(args.get('min_score', PROXY_SCORE_MIN), args.get('max_score', PROXY_SCORE_MAX))
     proxies_string = ''
     for proxy in proxies:
         proxies_string += str(proxy) + '\n'

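With this change, /all accepts optional min_score and max_score query parameters and falls back to PROXY_SCORE_MIN / PROXY_SCORE_MAX when they are omitted. A hedged client-side sketch (the 127.0.0.1:5555 address and the 90-100 bounds are illustrative, not part of the commit):

# Sketch: query the updated /all endpoint with explicit score bounds.
import requests

resp = requests.get('http://127.0.0.1:5555/all',
                    params={'min_score': 90, 'max_score': 100})
proxies = [line for line in resp.text.split('\n') if line.strip()]
print(f'{len(proxies)} proxies with score in [90, 100]')
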
proxypool/processors/tester.py (+4 -4)

@@ -6,7 +6,7 @@
 from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS
 from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
 from asyncio import TimeoutError
-
+import requests

 EXCEPTIONS = (
     ClientProxyConnectionError,
@@ -43,7 +43,7 @@ async def test(self, proxy: Proxy):
                 # if TEST_ANONYMOUS is True, make sure that
                 # the proxy has the effect of hiding the real IP
                 if TEST_ANONYMOUS:
-                    url = 'https://httpbin.org/ip'
+                    url = 'http://www.nghttp2.org/httpbin/ip'
                     async with session.get(url, timeout=TEST_TIMEOUT) as response:
                         resp_json = await response.json()
                         origin_ip = resp_json['origin']
@@ -85,8 +85,8 @@ def run(self):
                 break

 def run_tester():
-    host = '96.113.165.182'
-    port = '3128'
+    host = '111.246.42.52'
+    port = '8888'
     tasks = [tester.test(Proxy(host=host, port=port))]
     tester.loop.run_until_complete(asyncio.wait(tasks))


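The anonymity check now points at the nghttp2.org httpbin mirror instead of httpbin.org; the logic is unchanged: fetch the origin IP once directly and once through the proxy, and treat the proxy as anonymous only when the two differ. A synchronous sketch with requests for illustration (the real tester uses aiohttp; the sample proxy is the hard-coded one from run_tester() and may well be dead):

# Sketch of the anonymity check, not the tester's actual aiohttp code.
import requests

url = 'http://www.nghttp2.org/httpbin/ip'
proxy = '111.246.42.52:8888'  # sample proxy from run_tester(), likely stale

origin_ip = requests.get(url, timeout=10).json()['origin']
anonymous_ip = requests.get(url, proxies={'http': 'http://' + proxy},
                            timeout=10).json()['origin']
print('anonymous' if origin_ip != anonymous_ip else 'transparent')
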
proxypool/schemas/proxy.py (+6)

@@ -8,6 +8,12 @@ class Proxy(object):
     """
     host = attr(type=str, default=None)
     port = attr(type=int, default=None)
+    location = attr(type=str, default=None)
+    isp = attr(type=str, default=None)
+    country = attr(type=str, default=None)
+    anonymous = attr(type=bool, default=None)
+    protocol = attr(type=str, default=None)
+    alive_time = attr(type=int, default=None)

     def __str__(self):
         """

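The new attributes all default to None, so existing call sites that construct Proxy(host=..., port=...) keep working and the extra metadata is opt-in. A small sketch (the field values are made up):

# Sketch: the extended schema stays backwards compatible.
from proxypool.schemas.proxy import Proxy

old_style = Proxy(host='1.2.3.4', port=8080)
rich = Proxy(host='1.2.3.4', port=8080, country='CN',
             anonymous=True, protocol='http', alive_time=3600)
print(old_style, rich.protocol, rich.alive_time)
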
proxypool/storages/redis.py (+2 -2)

@@ -103,12 +103,12 @@ def count(self) -> int:
         """
         return self.db.zcard(REDIS_KEY)

-    def all(self) -> List[Proxy]:
+    def all(self, min_score=PROXY_SCORE_MIN, max_score=PROXY_SCORE_MAX) -> List[Proxy]:
         """
         get all proxies
         :return: list of proxies
         """
-        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
+        return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, min_score, max_score))

     def batch(self, cursor, count) -> List[Proxy]:
         """

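all() keeps its old no-argument behaviour through the PROXY_SCORE_MIN / PROXY_SCORE_MAX defaults, and the /all route above simply forwards its query parameters here. A sketch of calling it directly (assumes a reachable Redis configured via proxypool.setting; the 100/100 bounds are illustrative):

# Sketch: fetch only top-scored proxies straight from storage.
from proxypool.storages.redis import RedisClient

conn = RedisClient()
best = conn.all(min_score=100, max_score=100)  # only fully validated proxies
print(len(best))
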