-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathFilmScoreSpider.py
61 lines (51 loc) · 2.26 KB
/
FilmScoreSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python
# coding: utf-8
import urllib2 # library which helps in opening URL
import bs4 # extract data from HTML or XML files
import chardet # detect character encoding
class SpiderDouban:
    """Spider that pulls film titles and their ratings out of an HTML page.

    Typical flow: ``openUrl`` downloads a page and returns it as text,
    ``getFilmsDictionary`` turns that text into a ``{title: rating}`` mapping,
    and ``sortFilms`` orders the mapping from highest to lowest rating.

    Relies on the module-level ``urllib2``, ``bs4`` and ``chardet`` imports.
    """

    def __init__(self):
        # Instance attribute (name-mangled, so private to this class): the
        # single (name, value) HTTP header pair sent with every request.
        self.__headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64)')

    def openUrl(self, url):
        """Open a URL and return its HTML document as decoded text.

        :param url: address of the page to open
        :return: the response body decoded to text; bytes that are invalid in
                 the chosen encoding are silently dropped
        """
        opener = urllib2.build_opener()  # create an OpenerDirector object
        opener.addheaders = [self.__headers]  # replace the default headers
        response = opener.open(url)  # file-like object for the response
        try:
            rawContent = response.read()
        finally:
            # Close even when read() raises so the connection is not leaked.
            response.close()

        # chardet reports an encoding of None when it cannot make a guess
        # (for example on an empty body); decoding with None would raise a
        # TypeError, so fall back to UTF-8 in that case.
        encoding = chardet.detect(rawContent)['encoding'] or 'utf-8'
        try:
            return rawContent.decode(encoding, 'ignore')
        except LookupError:
            # chardet named a codec this interpreter does not provide.
            return rawContent.decode('utf-8', 'ignore')

    def getFilmsDictionary(self, content):
        """Collect every rated film found in the page and its score.

        Looks inside the first ``<div class="screening-bd">`` for
        ``<li class="ui-slide-item">`` elements and reads their
        ``data-title`` and ``data-rate`` attributes.

        :param content: HTML document text
        :return: dict mapping film title to its rating as a float; empty when
                 the expected ``div`` is absent. Items without a title, without
                 a rating, or with a non-numeric rating are skipped.
        """
        filmsDictionary = {}
        soup = bs4.BeautifulSoup(content, "html.parser")
        div = soup.find('div', class_='screening-bd')  # first matching "div"
        if div is None:
            # find() returns None when nothing matches; without this guard the
            # lookup below would raise AttributeError on such pages.
            return filmsDictionary

        for film in div.find_all('li', class_="ui-slide-item"):
            title = film.get('data-title')
            rate = film.get('data-rate')
            # Unrated films carry an empty (or missing) data-rate attribute.
            if title is None or not rate:
                continue
            try:
                filmsDictionary[title] = float(rate)
            except ValueError:
                # One malformed rating should not abort the whole scrape.
                continue
        return filmsDictionary

    def sortFilms(self, filmsDictionary):
        """Sort a dictionary by value, highest first.

        :param filmsDictionary: dict mapping film title to rating
        :return: list of ``(title, rating)`` tuples in descending rating
                 order; entries with equal ratings keep the dictionary's
                 iteration order
        """
        return sorted(filmsDictionary.items(), key=lambda item: item[1], reverse=True)