-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCommentByID.py
86 lines (71 loc) · 2.93 KB
/
CommentByID.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# coding:utf-8
__author__ = 'hang'
import warnings
warnings.filterwarnings("ignore")
import jieba # 分词包
import numpy # numpy计算包
import codecs # codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode
import re
import pandas as pd
import matplotlib.pyplot as plt
import sys
from urllib import request
from bs4 import BeautifulSoup as bs
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud # 词云包
# Comment-scraping function: fetch one page of Douban short comments.
def getCommentsById(movieId, pageNum):
    """Fetch one page (20 items) of short comments for a Douban movie.

    Args:
        movieId: Douban subject id, as a string (it is concatenated into the URL).
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A list of comment strings (possibly empty), or False when
        pageNum <= 0 — the falsy sentinel is kept for backward
        compatibility with existing callers.
    """
    eachCommentList = []
    if pageNum <= 0:
        return False
    start = (pageNum - 1) * 20
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection).
    with request.urlopen(requrl) as resp:
        html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    for item in soup.find_all('div', class_='comment'):
        # Call find_all('p') once per item instead of twice.
        paragraphs = item.find_all('p')
        # .string is None for tags with nested markup; skip those.
        if paragraphs and paragraphs[0].string is not None:
            eachCommentList.append(paragraphs[0].string)
    return eachCommentList
def main():
    """Scrape the first 10 comment pages of the movie whose Douban id is
    given in sys.argv[1], compute word frequencies of the Chinese text,
    and save a word-cloud image named '<movie_id>.jpg'.

    Reads: stopwords.txt (tab-separated, one stop word per line).
    Writes: <movie_id>.jpg in the current directory.
    """
    movie_id = sys.argv[1]
    # Collect the first 10 pages of comments for the movie.
    commentList = []
    for i in range(10):
        commentList.append(getCommentsById(movie_id, i + 1))
    # Flatten the list-of-lists into one string via str() of each page's
    # list (as the original did); join avoids quadratic += concatenation.
    # The list brackets/quotes this introduces are stripped by the
    # Chinese-only regex below.
    comments = ''.join(str(page).strip() for page in commentList)
    # Keep only CJK characters — removes punctuation, digits, brackets.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(re.findall(pattern, comments))
    # Chinese word segmentation with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    # Drop stop words (quoting=3 == csv.QUOTE_NONE: take quotes literally).
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # Word frequencies, sorted descending. The original
    # .agg({"count": numpy.size}) dict-renamer form was removed in
    # pandas 1.0 (SpecificationError); value_counts() is equivalent.
    words_stat = words_df['segment'].value_counts().rename_axis('segment').reset_index(name='count')
    # Render the word cloud from the top-1000 {word: frequency} pairs.
    wordcloud = WordCloud(font_path="simhei.ttf", width=800, height=400, background_color="white", max_font_size=120)
    word_frequence = {row[0]: row[1] for row in words_stat.head(1000).values}
    wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud, interpolation="bilinear")
    wordcloud.to_file(movie_id + ".jpg")
    print(movie_id + " OK ")
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()