File tree Expand file tree Collapse file tree 1 file changed +31
-0
lines changed
proxypool/crawlers/public Expand file tree Collapse file tree 1 file changed +31
-0
lines changed Original file line number Diff line number Diff line change 1+ from proxypool .schemas .proxy import Proxy
2+ from proxypool .crawlers .base import BaseCrawler
3+ from pyquery import PyQuery as pq
4+
# URL template for the free-proxy listing; {num} is the 1-based page number.
BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
# Number of listing pages to crawl (pages 1..MAX_PAGE inclusive).
MAX_PAGE = 5


class TaiyangdailiCrawler(BaseCrawler):
    """
    taiyangdaili crawler, http://www.taiyanghttp.com/free/
    """
    # One listing URL per page. Derived from MAX_PAGE so the page count is
    # configured in a single place rather than duplicated as a literal.
    urls = [BaseUrl.format(num=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        Parse one listing page and yield the proxies found in it.

        :param html: markup of a listing page, as accepted by PyQuery
        :return: generator of Proxy objects, one per row that has both a
            host and a port
        """
        doc = pq(html)
        for row in doc('#ip_list .tr.ip_tr').items():
            host = row.find('div:nth-child(1)').text()
            port = row.find('div:nth-child(2)').text()
            # A row missing either cell (layout change, header/ad row)
            # cannot describe a usable proxy, so skip it instead of
            # yielding a Proxy with an empty host or port.
            if not host or not port:
                continue
            yield Proxy(host=host, port=port)
26+
27+
if __name__ == '__main__':
    # Manual smoke run: crawl the configured pages and print every proxy.
    for found in TaiyangdailiCrawler().crawl():
        print(found)
You can’t perform that action at this time.
0 commit comments