forked from LUCY78765580/Python-web-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBoLiBei.py
71 lines (65 loc) · 2.58 KB
/
BoLiBei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# -*-coding:utf-8 -*-
__author__='WYY'
__date__='2017.03.24'
# Hands-on mini project: crawl the SCU-info "glass cup" incident posts and extract the 100 most popular replies
import requests
import json
import re
import time
class Spider():
#初始化,记录采集时间
def __init__(self):
self.time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print u'\n',u'开始采集数据',u'\n本地时间:',self.time
#获取data
def getData(self,url):
html=requests.get(url).text
requests.adapters.DEFAULT_RETRIES=5
result=json.loads(html)
data=result['data']
return data
#获取最新的评论id
def getNew(self):
data=self.getData(url='http://www.scuinfo.com/api/posts?pageSize=15')
New=data[0]['id']
return New
#提取data中有效数据,写入一个dict,多项写入一个list
def getDetail(self):
New=self.getNew()
container=[]
i=1
for id in range(131599,New+1):
content={}
self.url='http://www.scuinfo.com/api/post?id='+str(id)
data=self.getData(url=self.url)
if not isinstance(data,list):
body=data.values()[7]
likeCount=data.values()[6]
comment=data.values()[0]
#关键词分别为“玻璃”、“杯”、“摔”、“观光”
pattern=re.compile(u'\u73bb\u7483|\u676f|\u6454|\u89c2\u5149',re.S)
items=re.search(pattern,body)
if items:
content['body']=body
content['like']=likeCount
content['comment']=comment
print u'\n', i, u'\n', u'发言:', body, u'\n', u'点赞:', likeCount, u'', u'评论:', comment
time.sleep(0.01)
i += 1
container.append(content)
else:
print 'None'
print u'\n\n', u'至', self.time, u'为止,info上关于玻璃杯事件,共有评论',i-1, u'条'
return container
#获取评论总数
#依据点赞数由大到小将评论排列,获取前100条热门评论
def getSort(self):
container=self.getDetail()
print u'\n',u'将人气最高的前100条打印如下:'
container.sort(key=lambda k:k.get('comment',0))
container.sort(key=lambda k:k.get('like',0),reverse=True)
for index,r in enumerate(container):
print u'\n\n序号:',index+1, u'\n发言:',r['body'],u'\n点赞:' ,r['like'],u'评论',r['comment']
# Script driver: build the crawler, then crawl, rank and print the results.
spider = Spider()
spider.getSort()