Skip to content

Commit 6baa997

Browse files
committed
puppeteer爬取简书文章保存到本地
1 parent 51b3a9e commit 6baa997

File tree

3 files changed

+142
-0
lines changed

3 files changed

+142
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

README.md

+5
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,8 @@
3636
* [爬取简书网所有文章数据](./scrapy/jianshu_spider/)
3737
* [爬取房天下所有房的数据,包含新房、二手房](./scrapy/sfw_spider)
3838

39+
40+
41+
## Node.js 爬虫
42+
43+
* [使用 puppeteer 爬取简书文章并保存到本地](./js/jian_shu.js)

js/jian_shu.js

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// Save Jianshu articles to the local disk as PDF files
2+
const puppeteer = require('puppeteer');
3+
4+
const mkdirp = require('mkdirp');
5+
6+
// Site root; article hrefs scraped from the list page are appended to it.
const BASE_URL = 'https://www.jianshu.com';

// Author home page whose article list is crawled.
const HOME_URL = `${BASE_URL}/u/f46becd1ed83`;

// Directory the article PDFs are written to.
const ARTICLE_PATH = './monkey';
12+
13+
/**
 * Crawl every article listed on the author page at HOME_URL and save each one
 * as an A4 PDF inside ARTICLE_PATH.
 *
 * Steps: open the author page, scroll until the whole list is loaded, collect
 * each article's href and title, then open the articles one by one, hide the
 * surrounding page chrome and print the page to PDF.
 *
 * @returns {Promise<void>} Resolves once all articles are saved and the
 *   browser has been closed.
 * @throws Rejects if navigation, scraping or PDF generation fails; the browser
 *   (and the article page being processed) is still closed in that case.
 */
const download_article = async () => {
  // NOTE(review): a 0x0 viewport presumably tells Chromium not to override the
  // window metrics — confirm against the Puppeteer version in use.
  const viewport_size = {
    width: 0,
    height: 0,
  };

  // Elements hidden before printing so the PDF only contains the article.
  // '.author' was left visible by the original code (its hide line was commented out).
  const hiddenSelectors = [
    '.navbar',
    '#note-fixed-ad-container',
    '.note-bottom',
    '.side-tool',
    '.meta-bottom',
    '#web-note-ad-1',
    '#comment-list',
    '.follow-detail',
    '.show-foot',
  ];

  const browser = await puppeteer.launch({
    headless: true,
  });

  try {
    const page = await browser.newPage();
    await page.setViewport(viewport_size);

    // Open the author's home page.
    await page.goto(HOME_URL);
    console.log('显示文章列表,马上开始滑动');

    // Scroll the list so that lazily loaded articles are all rendered.
    // See: https://github.com/GoogleChrome/puppeteer/issues/844
    await autoScroll(page);
    console.log('所有文章加载完成');

    // Runs inside the page: collect href + title of every list entry.
    const articles = await page.$eval('.note-list', (listElement) => {
      return Array.from(listElement.querySelectorAll('li'))
        .map((item) => item.querySelector('.title'))
        // Skip list items without a title link instead of crashing on null.
        .filter((link) => link !== null)
        .map((link) => ({
          href: link.getAttribute('href'),
          // textContent (not innerHTML) so entities such as "&amp;" are decoded.
          title: link.textContent.trim(),
        }));
    });

    console.log(`大佬一共发布了${articles.length}篇文章`);

    // Create the output directory.
    mkdirp.sync(ARTICLE_PATH);

    for (const article of articles) {
      const articlePage = await browser.newPage();
      try {
        await articlePage.setViewport(viewport_size);

        // Navigation must be awaited: evaluating scripts while the page is
        // still navigating fails with "Execution context was destroyed".
        await articlePage.goto(`${BASE_URL}${article.href}`, {
          waitUntil: 'networkidle2',
        });
        await articlePage.waitForSelector('.post');
        console.log('文章详情页面加载完成');

        // Short settle delay kept from the original as a safety margin.
        // NOTE(review): likely removable now that navigation is awaited.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        // Scroll to the bottom so that all lazy-loaded images are fetched.
        await autoScroll(articlePage);

        // Hide everything that is not article content; missing elements are skipped.
        await articlePage.$eval('body', (body, selectors) => {
          for (const selector of selectors) {
            const element = body.querySelector(selector);
            if (element) {
              element.style.display = 'none';
            }
          }
        }, hiddenSelectors);

        // A "/" in the title would be treated as a path separator, so replace it.
        const fileName = `${article.title.replace(/\//g, '、')}.pdf`;
        const fileFullPath = `${ARTICLE_PATH}/${fileName}`;
        console.log(`文章保存的完整路径是:${fileFullPath}`);

        // Print with screen styles; must target the article page being printed.
        await articlePage.emulateMedia('screen');
        await articlePage.pdf({
          path: fileFullPath,
          format: 'A4',
        });
        console.log(`保存成功: ${fileFullPath}`);
      } finally {
        // Always release the tab, even when this article failed.
        await articlePage.close();
      }
    }

    console.log('下载完成!Enjoy~');
  } finally {
    // Without this the Chromium process stays alive and Node never exits.
    await browser.close();
  }
};
108+
109+
/**
 * Scroll a page downwards in fixed steps until its bottom is reached.
 *
 * The callback handed to page.evaluate moves the window down 100px every
 * 100ms and stops once the accumulated distance is at least
 * document.body.scrollHeight (re-read on every tick, so content appended
 * while scrolling extends the run).
 *
 * @param {object} page - Object exposing evaluate(fn), e.g. a Puppeteer page.
 * @returns {Promise<void>} Whatever page.evaluate returns; the inner promise
 *   resolves with no value once scrolling is finished.
 */
function autoScroll(page) {
  return page.evaluate(
    () =>
      new Promise((done) => {
        const step = 100;
        let scrolled = 0;

        const tick = () => {
          console.log('执行间断函数');
          // Read the height before scrolling, as new content may change it.
          const pageHeight = document.body.scrollHeight;
          window.scrollBy(0, step);
          scrolled += step;

          if (scrolled < pageHeight) {
            return;
          }

          console.log('滑动到底');
          clearInterval(handle);
          done();
        };

        const handle = setInterval(tick, 100);
      })
  );
}
129+
130+
131+
module.exports = download_article;

// Start the crawl only when this file is executed directly, not when it is required.
if (require.main === module) {
  download_article().catch((err) => {
    // Report the failure and exit non-zero instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
  });
}
136+
137+

0 commit comments

Comments
 (0)