Skip to content

Commit 61ba8c5

Browse files
feat: 小红书支持通过博主ID采集笔记和评论,小红书type=search时支持配置按哪种排序方式获取笔记数据,小红书笔记增加视频地址和标签字段
1 parent c09f9fe commit 61ba8c5

File tree

8 files changed

+244
-2
lines changed

8 files changed

+244
-2
lines changed

Diff for: base/base_crawler.py

+6
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,9 @@ async def store_content(self, content_item: Dict):
4949
@abstractmethod
5050
async def store_comment(self, comment_item: Dict):
5151
pass
52+
53+
# TODO: support all platforms.
# Only xhs is supported for now, so @abstractmethod stays commented out and
# this default implementation does nothing.
# @abstractmethod
async def store_creator(self, creator: Dict):
    """Store a creator record; the default implementation is a no-op."""
    pass

Diff for: config/base_config.py

+9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
KEYWORDS = "python,golang"
44
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
55
COOKIES = ""
6+
SORT_TYPE="popularity_descending" # For valid values see the enums under media_platform.xxx.field; currently only Xiaohongshu is supported
67
CRAWLER_TYPE = "search"
78

89
# 是否开启 IP 代理
@@ -70,3 +71,11 @@
7071
"4982041758140155",
7172
# ........................
7273
]
74+
75+
# Xiaohongshu creator IDs to crawl (read when the crawler type is "creator")
XHS_CREATOR_ID_LIST = [
    "59d8cb33de5fb4696bf17217",
    "61b87386000000001000b18b",
    "5e8558100000000001005bc5",
    # ........................
]

Diff for: main.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ async def main():
3636
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
3737
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
3838
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
39-
parser.add_argument('--type', type=str, help='crawler type (search | detail)',
40-
choices=["search", "detail"], default=config.CRAWLER_TYPE)
39+
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
40+
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
4141

4242
# init db
4343
if config.SAVE_DATA_OPTION == "db":

Diff for: media_platform/xhs/client.py

+58
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import json
3+
import re
34
from typing import Callable, Dict, List, Optional
45
from urllib.parse import urlencode
56

@@ -73,11 +74,18 @@ async def request(self, method, url, **kwargs) -> Dict:
7374
Returns:
7475
7576
"""
77+
# return response.text
78+
return_response = kwargs.pop('return_response', False)
79+
7680
async with httpx.AsyncClient(proxies=self.proxies) as client:
7781
response = await client.request(
7882
method, url, timeout=self.timeout,
7983
**kwargs
8084
)
85+
86+
if return_response:
87+
return response.text
88+
8189
data: Dict = response.json()
8290
if data["success"]:
8391
return data.get("data", data.get("success", {}))
@@ -178,6 +186,56 @@ async def get_note_by_keyword(
178186
}
179187
return await self.post(uri, data)
180188

189+
async def get_creator_info_and_notes(self, creator: str) -> Dict:
    """
    Fetch a creator's profile page and extract the creator info and the first page of notes.

    The data is read from the ``window.__INITIAL_STATE__`` JSON embedded in the profile HTML.

    Args:
        creator: creator (user) ID

    Returns:
        {"creator": ..., "notes": ..., "cursor": ..., "has_more_notes": ...},
        or {} when the page carries no parsable initial state or no "user" section.
    """
    path = '/user/profile/' + creator
    content = await self.request(method="GET", url=f"https://www.xiaohongshu.com{path}", return_response=True)

    # Non-greedy capture so the match stops at the first closing tag; a greedy
    # capture would swallow any following <script> blocks and break json.loads.
    match = re.search(r'<script>window\.__INITIAL_STATE__=(.+?)</script>', content or '', re.S)
    if match is None:
        return {}

    try:
        # The embedded state is a JS object literal; bare `undefined` is not valid JSON.
        info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
    except json.JSONDecodeError:
        return {}

    user = info.get('user') if isinstance(info, dict) else None
    if not user:
        return {}

    # Only the first entry of each list is used; tolerate missing or empty lists.
    notes_pages = user.get('notes') or []
    note_queries = user.get('noteQueries') or []
    first_query = note_queries[0] if note_queries else {}

    return {
        'creator': user.get('userPageData'),
        'notes': notes_pages[0] if notes_pages else [],
        'cursor': first_query.get('cursor'),
        'has_more_notes': first_query.get('hasMore')
    }
214+
215+
async def get_notes_by_creator(
        self, creator: str,
        cursor: str,
        page_size: int = 30
) -> Dict:
    """
    Fetch one page of a creator's notes.

    Args:
        creator: creator (user) ID
        cursor: ID of the last note on the previous page
        page_size: number of notes per page

    Returns:
        Whatever ``self.get`` returns for the user_posted endpoint.
    """
    query = dict(
        user_id=creator,
        cursor=cursor,
        num=page_size,
        image_formats="jpg,webp,avif",
    )
    return await self.get("/api/sns/web/v1/user_posted", query)
238+
181239
async def get_note_by_id(self, note_id: str) -> Dict:
182240
"""
183241
获取笔记详情API

Diff for: media_platform/xhs/core.py

+66
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from .client import XHSClient
1818
from .exception import DataFetchError
19+
from .field import SearchSortType
1920
from .login import XHSLogin
2021

2122

@@ -84,6 +85,9 @@ async def start(self) -> None:
8485
elif self.crawler_type == "detail":
8586
# Get the information and comments of the specified post
8687
await self.get_specified_notes()
88+
elif self.crawler_type == "creator":
89+
# Get creator's information and their notes and comments
90+
await self.get_creators_and_notes()
8791
else:
8892
pass
8993

@@ -101,6 +105,7 @@ async def search(self) -> None:
101105
notes_res = await self.xhs_client.get_note_by_keyword(
102106
keyword=keyword,
103107
page=page,
108+
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE!='' else SearchSortType.GENERAL,
104109
)
105110
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
106111
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
@@ -117,6 +122,67 @@ async def search(self) -> None:
117122
page += 1
118123
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
119124
await self.batch_get_note_comments(note_id_list)
125+
126+
async def get_creators_and_notes(self) -> None:
    """
    Crawl every creator in config.XHS_CREATOR_ID_LIST.

    For each creator: the first page (creator info plus notes) comes from the
    profile page, later pages from the cursor-based notes endpoint. Each page's
    note details are stored, then comments for those notes are fetched.
    """
    utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
    xhs_limit_count = 30
    for creator in config.XHS_CREATOR_ID_LIST:
        utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Current creator: {creator}")
        page = 0
        cursor = ''
        has_more_notes = False
        while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            note_id_list: List[str] = []

            if page == 0:
                # First page: creator info and notes come from the profile page.
                creator_and_notes_info = await self.xhs_client.get_creator_info_and_notes(creator)

                if not creator_and_notes_info:
                    utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator notes error")
                    # break, not continue: `page` is not advanced on this path, so
                    # `continue` would retry the same request forever.
                    break

                notes_res = creator_and_notes_info.get('notes')
                cursor = creator_and_notes_info.get('cursor')
                has_more_notes = creator_and_notes_info.get('has_more_notes')

                # save creator info
                await xhs_store.save_creator(creator, creator_and_notes_info.get('creator'))
                utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
            else:
                # Later pages: cursor-based notes endpoint.
                notes = await self.xhs_client.get_notes_by_creator(creator, cursor)

                if not notes:
                    utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes error")
                    # Same reasoning as above: stop this creator instead of spinning.
                    break

                cursor = notes.get('cursor')
                # NOTE(review): the notes endpoint may expose this flag as 'has_more'
                # rather than 'has_more_notes' — confirm against a live response.
                # The fallback changes nothing when 'has_more_notes' is present.
                has_more_notes = notes.get('has_more_notes', notes.get('has_more'))
                notes_res = notes.get('notes')
                utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")

            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
            task_list = [
                self.get_note_detail(post_item.get('id'), semaphore)
                for post_item in notes_res or []  # a page without notes yields no tasks
            ]
            note_details = await asyncio.gather(*task_list)
            for note_detail in note_details:
                if note_detail is not None:
                    await xhs_store.update_xhs_note(note_detail)
                    note_id_list.append(note_detail.get('note_id'))
            page += 1

            utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Note details: {note_details}")
            await self.batch_get_note_comments(note_id_list)

            if not has_more_notes:
                break
120186

121187
async def get_specified_notes(self):
122188
"""Get the information and comments of the specified post"""

Diff for: store/xhs/__init__.py

+38
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,20 @@ async def update_xhs_note(note_item: Dict):
3131
user_info = note_item.get("user", {})
3232
interact_info = note_item.get("interact_info", {})
3333
image_list: List[Dict] = note_item.get("image_list", [])
34+
tag_list: List[Dict] = note_item.get("tag_list", [])
35+
36+
video_url = ''
37+
if note_item.get('type') == 'video':
38+
videos = note_item.get('video').get('media').get('stream').get('h264')
39+
if type(videos).__name__ == 'list':
40+
video_url = ','.join([ v.get('master_url') for v in videos])
3441

3542
local_db_item = {
3643
"note_id": note_item.get("note_id"),
3744
"type": note_item.get("type"),
3845
"title": note_item.get("title") or note_item.get("desc", "")[:255],
3946
"desc": note_item.get("desc", ""),
47+
"video_url": video_url,
4048
"time": note_item.get("time"),
4149
"last_update_time": note_item.get("last_update_time", 0),
4250
"user_id": user_info.get("user_id"),
@@ -48,6 +56,7 @@ async def update_xhs_note(note_item: Dict):
4856
"share_count": interact_info.get("share_count"),
4957
"ip_location": note_item.get("ip_location", ""),
5058
"image_list": ','.join([img.get('url', '') for img in image_list]),
59+
"tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type')=='topic']),
5160
"last_modify_ts": utils.get_current_timestamp(),
5261
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}"
5362
}
@@ -77,3 +86,32 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
7786
}
7887
utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")
7988
await XhsStoreFactory.create_store().store_comment(local_db_item)
89+
90+
async def save_creator(user_id: str, creator: Dict):
    """
    Flatten a creator profile payload into a storage record and persist it.

    Args:
        user_id: creator (user) ID, stored as-is
        creator: profile payload; "basicInfo", "interactions" and "tags" are read from it

    Returns:

    """
    # Local import so json.dumps below works regardless of this module's top-level imports.
    import json

    creator = creator or {}
    user_info = creator.get('basicInfo') or {}

    # Counters default to 0; an interaction entry with a matching type overrides it.
    counts = {'follows': 0, 'fans': 0, 'interaction': 0}
    for entry in creator.get('interactions') or []:
        kind = entry.get('type')
        if kind in counts:
            counts[kind] = entry.get('count')

    # 1 maps to female as before. Assumes 0 means male — TODO confirm against the API.
    # Any other or missing value is stored as None rather than mislabelled as male.
    gender_code = user_info.get('gender')
    if gender_code == 1:
        gender = '女'
    elif gender_code == 0:
        gender = '男'
    else:
        gender = None

    local_db_item = {
        'user_id': user_id,
        'nickname': user_info.get('nickname'),
        'gender': gender,
        'avatar': user_info.get('images'),
        'desc': user_info.get('desc'),
        # NOTE(review): other keys in this payload are camelCase, so the location may
        # arrive as 'ipLocation' — confirm; the fallback is inert when 'ip_location' is set.
        'ip_location': user_info.get('ip_location') or user_info.get('ipLocation'),
        'follows': counts['follows'],
        'fans': counts['fans'],
        'interaction': counts['interaction'],
        'tag_list': json.dumps({tag.get('tagType'): tag.get('name') for tag in creator.get('tags') or []}),
    }
    utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
    await XhsStoreFactory.create_store().store_creator(local_db_item)

Diff for: store/xhs/xhs_store_db_types.py

+18
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@ class XHSNote(XhsBaseModel):
2525
type = fields.CharField(null=True, max_length=16, description="笔记类型(normal | video)")
2626
title = fields.CharField(null=True, max_length=255, description="笔记标题")
2727
desc = fields.TextField(null=True, description="笔记描述")
28+
video_url = fields.TextField(null=True, description="视频地址")
2829
time = fields.BigIntField(description="笔记发布时间戳", index=True)
2930
last_update_time = fields.BigIntField(description="笔记最后更新时间戳")
3031
liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数")
3132
collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数")
3233
comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数")
3334
share_count = fields.CharField(null=True, max_length=16, description="笔记分享数")
3435
image_list = fields.TextField(null=True, description="笔记封面图片列表")
36+
tag_list = fields.TextField(null=True, description="标签列表")
3537
note_url = fields.CharField(null=True, max_length=255, description="笔记详情页的URL")
3638

3739
class Meta:
@@ -55,3 +57,19 @@ class Meta:
5557

5658
def __str__(self):
5759
return f"{self.comment_id} - {self.content}"
60+
61+
62+
class XhsCreator(XhsBaseModel):
    """ORM model for a Xiaohongshu creator, stored in the ``xhs_creator`` table."""
    # NOTE(review): user/identity columns are presumably inherited from XhsBaseModel — confirm.
    desc = fields.TextField(null=True, description="用户描述")
    gender = fields.CharField(null=True, max_length=1, description="性别")
    # Counters are declared as short strings here; the integer variants are kept
    # below, commented out.
    follows = fields.CharField(null=True, max_length=16, description="关注数")
    fans = fields.CharField(null=True, max_length=16, description="粉丝数")
    interaction = fields.CharField(null=True, max_length=16, description="获赞和收藏数")
    # follows = fields.IntField(description="关注数")
    # fans = fields.IntField(description="粉丝数")
    # interaction = fields.IntField(description="获赞和收藏数")
    tag_list = fields.TextField(null=True, description="标签列表") # JSON string

    class Meta:
        table = "xhs_creator"
        table_description = "小红书博主"

Diff for: store/xhs/xhs_store_impl.py

+47
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,17 @@ async def store_comment(self, comment_item: Dict):
7272
"""
7373
await self.save_data_to_csv(save_item=comment_item, store_type="comments")
7474

75+
async def store_creator(self, creator: Dict):
    """
    Xiaohongshu creator CSV storage implementation
    Args:
        creator: creator dict

    Returns:

    """
    await self.save_data_to_csv(save_item=creator, store_type="creator")
85+
7586

7687
class XhsDbStoreImplement(AbstractStore):
7788
async def store_content(self, content_item: Dict):
@@ -121,6 +132,31 @@ async def store_comment(self, comment_item: Dict):
121132
comment_pydantic.model_validate(comment_data)
122133
await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.model_dump())
123134

135+
async def store_creator(self, creator: Dict):
    """
    Xiaohongshu creator DB storage implementation (insert when new, update otherwise)
    Args:
        creator: creator dict; timestamp keys are written into it in place

    Returns:

    """
    from .xhs_store_db_types import XhsCreator
    user_id = creator.get("user_id")
    existing = await XhsCreator.filter(user_id=user_id).first()

    if existing:
        # Known creator: refresh the modification time and update the row.
        creator["last_modify_ts"] = utils.get_current_timestamp()
        update_schema = pydantic_model_creator(XhsCreator, name="CreatorPydanticUpdate", exclude=('id', 'add_ts',))
        payload = update_schema(**creator)
        update_schema.model_validate(payload)
        await XhsCreator.filter(user_id=user_id).update(**payload.model_dump())
        return

    # New creator: both timestamps start at the same value.
    now = utils.get_current_timestamp()
    creator["add_ts"] = now
    creator["last_modify_ts"] = now
    create_schema = pydantic_model_creator(XhsCreator, name="CreatorPydanticCreate", exclude=('id',))
    payload = create_schema(**creator)
    create_schema.model_validate(payload)
    await XhsCreator.create(**payload.model_dump())
159+
124160

125161
class XhsJsonStoreImplement(AbstractStore):
126162
json_store_path: str = "data/xhs"
@@ -181,3 +217,14 @@ async def store_comment(self, comment_item: Dict):
181217
182218
"""
183219
await self.save_data_to_json(comment_item, "comments")
220+
221+
async def store_creator(self, creator: Dict):
    """
    Xiaohongshu creator JSON storage implementation
    Args:
        creator: creator dict

    Returns:

    """
    await self.save_data_to_json(creator, "creator")

0 commit comments

Comments
 (0)