File tree 3 files changed +99
-0
lines changed
proxypool/crawlers/public
3 files changed +99
-0
lines changed Original file line number Diff line number Diff line change
1
+ from proxypool .schemas .proxy import Proxy
2
+ from proxypool .crawlers .base import BaseCrawler
3
+ import re
4
+ import json
5
BASE_URL = 'http://proxylist.fatezero.org/proxy.list'


class FatezeroCrawler(BaseCrawler):
    """
    Fatezero crawler, http://proxylist.fatezero.org

    The endpoint returns plain text with one JSON object per line, each
    describing a proxy (e.g. {"host": "1.2.3.4", "port": 8080, ...}).
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        Parse the line-delimited JSON response into proxies.

        :param html: response body as text, one JSON record per line
        :return: generator of Proxy objects
        """
        for line in html.split('\n'):
            line = line.strip()
            if not line:
                # skip blank lines (a trailing newline would otherwise
                # make json.loads raise and abort the whole crawl)
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # tolerate an occasional malformed line instead of
                # losing every remaining proxy in the response
                continue
            host = record.get('host')
            port = record.get('port')
            if host and port:
                yield Proxy(host=host, port=port)
27
+
28
+
29
if __name__ == '__main__':
    # Manual smoke test: run the crawler and dump every proxy it yields.
    for found_proxy in FatezeroCrawler().crawl():
        print(found_proxy)
Original file line number Diff line number Diff line change
1
+ from proxypool .schemas .proxy import Proxy
2
+ from proxypool .crawlers .base import BaseCrawler
3
+ import re
4
+ from pyquery import PyQuery as pq
5
+ import time
6
BASE_URL = 'https://ip.ihuan.me/today/{path}.html'


class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me
    """
    # NOTE(review): the path (year/month/day/hour) is frozen at import time,
    # so a long-running process keeps fetching the page for the hour the
    # module was loaded — TODO confirm whether BaseCrawler re-reads `urls`.
    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]

    # Compiled once instead of on every parse() call; raw string so \d and
    # \. are regex escapes rather than (deprecated) string escapes.
    # Captures the leading "host:port" run before each <br>.
    ADDRESS_PATTERN = re.compile(r'([\d:\.]*).*?<br>')

    def parse(self, html):
        """
        Parse the hourly listing page into proxies.

        :param html: HTML body of the listing page
        :return: generator of Proxy objects
        """
        for addr in self.ADDRESS_PATTERN.findall(html):
            parts = addr.split(':')
            # only well-formed "host:port" pairs become proxies
            if len(parts) == 2:
                host, port = parts
                yield Proxy(host=host, port=port)
29
+
30
+
31
if __name__ == '__main__':
    # Manual smoke test: run the crawler and dump every proxy it yields.
    for found_proxy in IhuanCrawler().crawl():
        print(found_proxy)
Original file line number Diff line number Diff line change
1
+ from proxypool .schemas .proxy import Proxy
2
+ from proxypool .crawlers .base import BaseCrawler
3
+ import re
4
+
5
MAX_NUM = 9999
BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM)


class Ip89Crawler(BaseCrawler):
    """
    89ip crawler, http://api.89ip.cn
    """
    urls = [BASE_URL]

    # Compiled once instead of on every parse() call; raw string so \d and
    # \. are regex escapes rather than (deprecated) string escapes.
    # The page lists one "host:port" per <br>-terminated line.
    ADDRESS_PATTERN = re.compile(r'([\d:\.]*)<br>')

    def parse(self, html):
        """
        Parse the API response page into proxies.

        :param html: HTML body returned by the tqdl endpoint
        :return: generator of Proxy objects
        """
        for addr in self.ADDRESS_PATTERN.findall(html):
            parts = addr.split(':')
            # only well-formed "host:port" pairs become proxies
            if len(parts) == 2:
                host, port = parts
                yield Proxy(host=host, port=port)
28
+
29
+
30
if __name__ == '__main__':
    # Manual smoke test: run the crawler and dump every proxy it yields.
    for found_proxy in Ip89Crawler().crawl():
        print(found_proxy)
You can’t perform that action at this time.
0 commit comments