Skip to content

Commit e3bbd55

Browse files
添加IP代理爬取 (#106)
* Create ip89.py — crawler for www.89ip.cn free proxies
* Update ip89.py — rename the crawler class
* Create fatezero_proxylist.py — add http://proxylist.fatezero.org/ proxy source
* Create ihuan.py — add i幻 (ip.ihuan.me) proxy source
1 parent 9912b98 commit e3bbd55

File tree

3 files changed

+99
-0
lines changed

3 files changed

+99
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json

BASE_URL = 'http://proxylist.fatezero.org/proxy.list'


class FatezeroCrawler(BaseCrawler):
    """
    Fatezero crawler, http://proxylist.fatezero.org

    The endpoint returns one JSON object per line, each describing a proxy.
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        Parse the response body into proxies.

        :param html: response text, one JSON document per line
        :return: generator yielding Proxy objects
        """
        for line in html.split('\n'):
            # split('\n') always produces a trailing empty string, and the
            # feed may contain malformed lines — skip both instead of letting
            # json.loads raise and kill the generator.
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            host = record.get('host')
            port = record.get('port')
            # Only yield complete records; partial entries are dropped.
            if host and port:
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = FatezeroCrawler()
    for proxy in crawler.crawl():
        print(proxy)

proxypool/crawlers/public/ihuan.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import time

BASE_URL = 'https://ip.ihuan.me/today/{path}.html'
# Captures a leading "host:port" token ahead of each <br> tag; compiled once
# at module load instead of on every parse() call.
ADDR_PATTERN = re.compile(r'([\d:\.]*).*?<br>')


class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me

    The site publishes an hourly page, so the URL embeds the current
    local Y/m/d/H path at import time.
    """
    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]

    def parse(self, html):
        """
        Parse the hourly HTML page into proxies.

        :param html: response text containing "host:port<br>" fragments
        :return: generator yielding Proxy objects
        """
        for addr in ADDR_PATTERN.findall(html):
            host, sep, port = addr.partition(':')
            # The pattern can match stray fragments; only a clean
            # single "host:port" pair is accepted.
            if sep and host and port and ':' not in port:
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = IhuanCrawler()
    for proxy in crawler.crawl():
        print(proxy)

proxypool/crawlers/public/ip89.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re

MAX_NUM = 9999
BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM)
# Captures the "host:port" token directly preceding each <br> tag; compiled
# once at module load instead of on every parse() call.
ADDR_PATTERN = re.compile(r'([\d:\.]*)<br>')


class Ip89Crawler(BaseCrawler):
    """
    89ip crawler, http://api.89ip.cn

    The API returns a plain page of "host:port<br>" entries; MAX_NUM caps
    how many proxies are requested.
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        Parse the API response into proxies.

        :param html: response text containing "host:port<br>" fragments
        :return: generator yielding Proxy objects
        """
        for addr in ADDR_PATTERN.findall(html):
            host, sep, port = addr.partition(':')
            # Skip fragments that are not a clean single "host:port" pair.
            if sep and host and port and ':' not in port:
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = Ip89Crawler()
    for proxy in crawler.crawl():
        print(proxy)

0 commit comments

Comments
 (0)