Skip to content

Commit 96309dc

Browse files
committed
fix: 小红书创作者功能数据获取优化
1 parent 78a9bf9 commit 96309dc

File tree

5 files changed

+134
-140
lines changed

5 files changed

+134
-140
lines changed

README.md

+25-24
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
成为赞助者,展示你的产品在这里,联系作者:[email protected]
1919

2020
## 功能列表
21-
| 平台 | Cookie 登录 | 二维码登录 | 手机号登录 | 关键词搜索 | 指定视频/帖子 ID 爬取 | 登录状态缓存 | 数据保存 | IP 代理池 | 滑块验证码 |
22-
|:---:|:---------:|:-----:|:-----:|:-----:|:-------------:|:------:|:----:|:------:|:-----:|
23-
| 小红书 ||| |||||||
24-
| 抖音 ||| |||||||
25-
| 快手 ||| |||||||
26-
| B 站 ||| |||||||
27-
| 微博 ||| |||||||
21+
| 平台 | Cookie 登录 | 二维码登录 | 指定创作者主页 | 关键词搜索 | 指定视频/帖子 ID 爬取 | 登录状态缓存 | 数据保存 | IP 代理池 | 滑块验证码 |
22+
|:---:|:---------:|:-----:|:-------:|:-----:|:-------------:|:------:|:----:|:------:|:-----:|
23+
| 小红书 ||| |||||||
24+
| 抖音 ||| |||||||
25+
| 快手 ||| |||||||
26+
| B 站 ||| |||||||
27+
| 微博 ||| |||||||
2828

2929

3030
## 使用方法
@@ -93,24 +93,25 @@
9393

9494
PS:如果打赏时请备注捐赠者,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉)
9595

96-
| 捐赠者 | 捐赠金额 | 捐赠日期 |
97-
|-------------|-------|------------|
96+
| 捐赠者 | 捐赠金额 | 捐赠日期 |
97+
|------------|-------|------------|
98+
| *| 20 元 | 2024-03-17 |
9899
| Strem Gamer | 20 元 | 2024-03-16 |
99-
| * | 20 元 | 2024-03-14 |
100-
| Yuzu | 20 元 | 2024-03-07 |
101-
| ** | 100 元 | 2024-03-03 |
102-
| ** | 20 元 | 2024-03-03 |
103-
| Scarlett | 20 元 | 2024-02-16 |
104-
| Asun | 20 元 | 2024-01-30 |
105-
|* | 100 元 | 2024-01-21 |
106-
| allen | 20 元 | 2024-01-10 |
107-
| llllll | 20 元 | 2024-01-07 |
108-
|* | 20 元 | 2023-12-29 |
109-
| 50chen | 50 元 | 2023-12-22 |
110-
| xiongot | 20 元 | 2023-12-17 |
111-
| atom.hu | 20 元 | 2023-12-16 |
112-
| 一呆 | 20 元 | 2023-12-01 |
113-
| 坠落 | 50 元 | 2023-11-08 |
100+
| *| 20 元 | 2024-03-14 |
101+
| Yuzu | 20 元 | 2024-03-07 |
102+
| **| 100 元 | 2024-03-03 |
103+
| **| 20 元 | 2024-03-03 |
104+
| Scarlett | 20 元 | 2024-02-16 |
105+
| Asun | 20 元 | 2024-01-30 |
106+
|* | 100 元 | 2024-01-21 |
107+
| allen | 20 元 | 2024-01-10 |
108+
| llllll | 20 元 | 2024-01-07 |
109+
|*| 20 元 | 2023-12-29 |
110+
| 50chen | 50 元 | 2023-12-22 |
111+
| xiongot | 20 元 | 2023-12-17 |
112+
| atom.hu | 20 元 | 2023-12-16 |
113+
| 一呆 | 20 元 | 2023-12-01 |
114+
| 坠落 | 50 元 | 2023-11-08 |
114115

115116
## 运行报错常见问题Q&A
116117
> 遇到问题先自行搜索解决下,现在AI很火,用ChatGPT大多情况下能解决你的问题 [免费的ChatGPT](https://sider.ai/invited?c=8e03db1a973401fdf114ed9cf9f8c183)

config/base_config.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
55
COOKIES = ""
66
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
7-
CRAWLER_TYPE = "search"
7+
CRAWLER_TYPE = "search"  # 爬取类型,search(关键词搜索) | detail(帖子详情) | creator(创作者主页数据)
88

99
# 是否开启 IP 代理
1010
ENABLE_IP_PROXY = False
@@ -70,8 +70,6 @@
7070

7171
# 指定小红书创作者ID列表
7272
XHS_CREATOR_ID_LIST = [
73-
"59d8cb33de5fb4696bf17217",
74-
"61b87386000000001000b18b",
75-
"5e8558100000000001005bc5",
73+
"63e36c9a000000002703502b",
7674
# ........................
7775
]

media_platform/xhs/client.py

+79-55
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22
import json
33
import re
4-
from typing import Callable, Dict, List, Optional
4+
from typing import Callable, Dict, List, Optional, Union, Any
55
from urllib.parse import urlencode
66

77
import httpx
@@ -28,6 +28,7 @@ def __init__(
2828
self.timeout = timeout
2929
self.headers = headers
3030
self._host = "https://edith.xiaohongshu.com"
31+
self._domain = "https://www.xiaohongshu.com"
3132
self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
3233
self.IP_ERROR_CODE = 300012
3334
self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
@@ -63,7 +64,7 @@ async def _pre_headers(self, url: str, data=None) -> Dict:
6364
self.headers.update(headers)
6465
return self.headers
6566

66-
async def request(self, method, url, **kwargs) -> Dict:
67+
async def request(self, method, url, **kwargs) -> Union[str, Any]:
6768
"""
6869
封装httpx的公共请求方法,对请求响应做一些处理
6970
Args:
@@ -82,10 +83,10 @@ async def request(self, method, url, **kwargs) -> Dict:
8283
method, url, timeout=self.timeout,
8384
**kwargs
8485
)
85-
86+
8687
if return_response:
8788
return response.text
88-
89+
8990
data: Dict = response.json()
9091
if data["success"]:
9192
return data.get("data", data.get("success", {}))
@@ -186,56 +187,6 @@ async def get_note_by_keyword(
186187
}
187188
return await self.post(uri, data)
188189

189-
async def get_creator_info_and_notes(self, creator: str) -> Dict:
190-
"""
191-
获取博主的信息和第一页的笔记
192-
Args:
193-
creator: 博主ID
194-
Returns:
195-
{"creator":{}, "notes":[]}
196-
"""
197-
path = '/user/profile/'+creator
198-
content = await self.request(method="GET", url=f"https://www.xiaohongshu.com{path}", return_response=True)
199-
match = re.search(r'<script>window.__INITIAL_STATE__=(.+)<\/script>', content, re.M)
200-
201-
if match == None:
202-
return {}
203-
204-
info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
205-
if info == None:
206-
return {}
207-
208-
return {
209-
'creator': info.get('user').get('userPageData'),
210-
'notes': info.get('user').get('notes')[0],
211-
'cursor': info.get('user').get('noteQueries')[0].get('cursor'),
212-
'has_more_notes': info.get('user').get('noteQueries')[0].get('hasMore')
213-
}
214-
215-
async def get_notes_by_creator(
216-
self, creator: str,
217-
cursor: str,
218-
page_size: int = 30
219-
) -> Dict:
220-
"""
221-
获取博主的笔记
222-
Args:
223-
creator: 博主ID
224-
cursor: 上一页最后一条笔记的ID
225-
page_size: 分页数据长度
226-
227-
Returns:
228-
229-
"""
230-
uri = "/api/sns/web/v1/user_posted"
231-
data = {
232-
"user_id": creator,
233-
"cursor": cursor,
234-
"num": page_size,
235-
"image_formats": "jpg,webp,avif"
236-
}
237-
return await self.get(uri, data)
238-
239190
async def get_note_by_id(self, note_id: str) -> Dict:
240191
"""
241192
获取笔记详情API
@@ -268,7 +219,7 @@ async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
268219
params = {
269220
"note_id": note_id,
270221
"cursor": cursor,
271-
"top_comment_id":"",
222+
"top_comment_id": "",
272223
"image_formats": "jpg,webp,avif"
273224
}
274225
return await self.get(uri, params)
@@ -323,3 +274,76 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
323274
await asyncio.sleep(crawl_interval)
324275
result.extend(comments)
325276
return result
277+
278+
async def get_creator_info(self, user_id: str) -> Dict:
279+
"""
280+
通过解析网页版的用户主页HTML,获取用户个人简要信息
281+
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
282+
eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
283+
"""
284+
uri = f"/user/profile/{user_id}"
285+
html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
286+
match = re.search(r'<script>window.__INITIAL_STATE__=(.+)<\/script>', html_content, re.M)
287+
288+
if match is None:
289+
return {}
290+
291+
info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
292+
if info is None:
293+
return {}
294+
return info.get('user').get('userPageData')
295+
296+
async def get_notes_by_creator(
297+
self, creator: str,
298+
cursor: str,
299+
page_size: int = 30
300+
) -> Dict:
301+
"""
302+
获取博主的笔记
303+
Args:
304+
creator: 博主ID
305+
cursor: 上一页最后一条笔记的ID
306+
page_size: 分页数据长度
307+
308+
Returns:
309+
310+
"""
311+
uri = "/api/sns/web/v1/user_posted"
312+
data = {
313+
"user_id": creator,
314+
"cursor": cursor,
315+
"num": page_size,
316+
"image_formats": "jpg,webp,avif"
317+
}
318+
return await self.get(uri, data)
319+
320+
async def get_all_notes_by_creator(self, user_id: str, crawl_interval: float = 1.0,
321+
callback: Optional[Callable] = None) -> List[Dict]:
322+
"""
323+
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
324+
Args:
325+
user_id: 用户ID
326+
crawl_interval: 爬取一次的延迟单位(秒)
327+
callback: 一次分页爬取结束后的更新回调函数
328+
329+
Returns:
330+
331+
"""
332+
result = []
333+
notes_has_more = True
334+
notes_cursor = ""
335+
while notes_has_more:
336+
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
337+
notes_has_more = notes_res.get("has_more", False)
338+
notes_cursor = notes_res.get("cursor", "")
339+
if "notes" not in notes_res:
340+
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
341+
break
342+
343+
notes = notes_res["notes"]
344+
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
345+
if callback:
346+
await callback(notes)
347+
await asyncio.sleep(crawl_interval)
348+
result.extend(notes)
349+
return result

media_platform/xhs/core.py

+26-56
Original file line numberDiff line numberDiff line change
@@ -126,65 +126,35 @@ async def search(self) -> None:
126126
async def get_creators_and_notes(self) -> None:
127127
"""Get creator's notes and retrieve their comment information."""
128128
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
129-
xhs_limit_count = 30
130-
for creator in config.XHS_CREATOR_ID_LIST:
131-
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Current creator: {creator}")
132-
page = 0
133-
cursor = ''
134-
has_more_notes = False
135-
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
136-
note_id_list: List[str] = []
137-
138-
if page == 0:
139-
# get creator info and notes
140-
creator_and_notes_info = await self.xhs_client.get_creator_info_and_notes(creator)
141-
142-
if creator_and_notes_info == None or not creator_and_notes_info:
143-
utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator notes error")
144-
continue
145-
146-
notes_res = creator_and_notes_info.get('notes')
147-
# utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator and notes:{notes_res}")
148-
149-
cursor = creator_and_notes_info.get('cursor')
150-
has_more_notes = creator_and_notes_info.get('has_more_notes')
151-
152-
# save creator info
153-
await xhs_store.save_creator(creator, creator_and_notes_info.get('creator'))
154-
utils.logger.info(
155-
f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
156-
else:
157-
# get notes
158-
notes = await self.xhs_client.get_notes_by_creator(creator, cursor)
159-
# utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get notes res:{notes_res}")
160-
161-
if notes == None or not notes:
162-
utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes error")
163-
continue
129+
for user_id in config.XHS_CREATOR_ID_LIST:
130+
# get creator detail info from web html content
131+
creator_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
132+
if creator_info:
133+
await xhs_store.save_creator(user_id, creator=creator_info)
134+
135+
# Get all note information of the creator
136+
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
137+
user_id=user_id,
138+
crawl_interval=random.random(),
139+
callback=self.fetch_creator_notes_detail
140+
)
164141

165-
cursor = notes.get('cursor')
166-
has_more_notes = notes.get('has_more_notes')
167-
notes_res = notes.get('notes')
168-
utils.logger.info(
169-
f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")
142+
note_ids = [note_item.get("note_id") for note_item in all_notes_list]
143+
await self.batch_get_note_comments(note_ids)
170144

171-
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
172-
task_list = [
173-
self.get_note_detail(post_item.get('id'), semaphore)
174-
for post_item in notes_res
175-
]
176-
note_details = await asyncio.gather(*task_list)
177-
for note_detail in note_details:
178-
if note_detail is not None:
179-
await xhs_store.update_xhs_note(note_detail)
180-
note_id_list.append(note_detail.get('note_id'))
181-
page += 1
182-
183-
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Note details: {note_details}")
184-
await self.batch_get_note_comments(note_id_list)
145+
async def fetch_creator_notes_detail(self, note_list: List[Dict]):
146+
"""
147+
Concurrently obtain the specified post list and save the data
148+
"""
149+
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
150+
task_list = [
151+
self.get_note_detail(post_item.get("note_id"), semaphore) for post_item in note_list
152+
]
185153

186-
if not has_more_notes:
187-
break
154+
note_details = await asyncio.gather(*task_list)
155+
for note_detail in note_details:
156+
if note_detail is not None:
157+
await xhs_store.update_xhs_note(note_detail)
188158

189159
async def get_specified_notes(self):
190160
"""Get the information and comments of the specified post"""

store/xhs/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def create_store() -> AbstractStore:
2626
return store_class()
2727

2828

29+
2930
async def update_xhs_note(note_item: Dict):
3031
note_id = note_item.get("note_id")
3132
user_info = note_item.get("user", {})
@@ -116,7 +117,7 @@ async def save_creator(user_id: str, creator: Dict):
116117
'follows': follows,
117118
'fans': fans,
118119
'interaction': interaction,
119-
'tag_list': json.dumps({tag.get('tagType'): tag.get('name') for tag in creator.get('tags')}),
120+
'tag_list': json.dumps({tag.get('tagType'): tag.get('name') for tag in creator.get('tags')}, ensure_ascii=False),
120121
}
121122
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
122123
await XhsStoreFactory.create_store().store_creator(local_db_item)

0 commit comments

Comments
 (0)