Commit 9912b98
Add a new proxy source crawler: Taiyang Proxy (#107)

* Add files via upload: new proxy crawler
* Delete taiyangdaili.py
* add taiyang: crawl free IPs from Taiyang Proxy
Parent commit: 2ee17b6

1 file changed (+31, −0 lines)

@@ -0,0 +1,31 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from pyquery import PyQuery as pq


BASE_URL = 'http://www.taiyanghttp.com/free/page{num}'
MAX_PAGE = 5


class TaiyangdailiCrawler(BaseCrawler):
    """
    taiyangdaili crawler, http://www.taiyanghttp.com/free/
    """
    # one listing page per number, page1 .. page{MAX_PAGE}
    urls = [BASE_URL.format(num=i) for i in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        parse html to extract proxies
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # each row of the free-IP table is a div.tr.ip_tr inside #ip_list
        trs = doc('#ip_list .tr.ip_tr').items()
        for tr in trs:
            # first cell holds the IP, second cell the port
            host = tr.find('div:nth-child(1)').text()
            port = tr.find('div:nth-child(2)').text()
            yield Proxy(host=host, port=int(port))


if __name__ == '__main__':
    crawler = TaiyangdailiCrawler()
    for proxy in crawler.crawl():
        print(proxy)
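
For context, a minimal sketch of the BaseCrawler contract this file relies on. The imported proxypool.crawlers.base.BaseCrawler is not part of this diff; the fetch helper, its use of requests, and the timeout value below are assumptions for illustration, not the project's actual base class.

# Sketch only: the real proxypool.crawlers.base.BaseCrawler is not shown in
# this commit and may differ (retries, headers, logging, etc.).
import requests


class BaseCrawler(object):
    urls = []

    def fetch(self, url, **kwargs):
        # assumption: a plain GET with a short timeout
        kwargs.setdefault('timeout', 10)
        response = requests.get(url, **kwargs)
        if response.status_code == 200:
            return response.text

    def crawl(self):
        # fetch every listing page and hand the HTML to parse(),
        # which yields Proxy objects
        for url in self.urls:
            html = self.fetch(url)
            if html:
                yield from self.parse(html)

As a quick sanity check of the selectors, parse() can be run offline against a fabricated fragment shaped like the page the selectors imply (the markup and the 203.0.113.10 address below are made up, not taken from the site):

sample = '''
<div id="ip_list">
    <div class="tr ip_tr">
        <div>203.0.113.10</div>
        <div>8080</div>
    </div>
</div>
'''
for proxy in TaiyangdailiCrawler().parse(sample):
    print(proxy)  # expected output along the lines of 203.0.113.10:8080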
