File tree 1 file changed +31
-0
lines changed
proxypool/crawlers/public
1 file changed +31
-0
lines changed Original file line number Diff line number Diff line change
1
+ from proxypool .schemas .proxy import Proxy
2
+ from proxypool .crawlers .base import BaseCrawler
3
+ from pyquery import PyQuery as pq
4
+
5
# URL template for one page of the free-proxy listing; the {num} placeholder
# is filled in with the page number via str.format.
+ BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
6
# NOTE(review): presumably the last listing page to crawl (pages start at 1) — confirm.
+ MAX_PAGE = 5
7
+
8
+
9
class TaiyangdailiCrawler(BaseCrawler):
    """
    taiyangdaili crawler, http://www.taiyanghttp.com/free/

    Builds one listing URL per page and turns every listed row of a page
    into a ``Proxy``.
    """
    # Listing pages 1..MAX_PAGE. Derived from the module constant so the
    # page count is configured in exactly one place.
    urls = [BaseUrl.format(num=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        parse html file to get proxies

        :param html: markup of one listing page; empty or ``None`` input
            produces no proxies
        :return: generator of ``Proxy`` objects, one per ``.tr.ip_tr`` row
            found under ``#ip_list``
        """
        if not html:
            # Nothing to parse for this page: yield no proxies rather than
            # handing empty input to the HTML parser.
            return
        doc = pq(html)
        for row in doc('#ip_list .tr.ip_tr').items():
            # The first and second child <div> of a row carry the host and
            # port text respectively; both are passed on as plain strings.
            host = row.find('div:nth-child(1)').text()
            port = row.find('div:nth-child(2)').text()
            yield Proxy(host=host, port=port)
26
+
27
+
28
if __name__ == '__main__':
    # Manual run: crawl with this crawler and print every proxy it produces.
    for item in TaiyangdailiCrawler().crawl():
        print(item)
You can’t perform that action at this time.
0 commit comments