1
1
import asyncio
2
2
import json
3
3
import re
4
- from typing import Callable , Dict , List , Optional
4
+ from typing import Callable , Dict , List , Optional , Union , Any
5
5
from urllib .parse import urlencode
6
6
7
7
import httpx
@@ -28,6 +28,7 @@ def __init__(
28
28
self .timeout = timeout
29
29
self .headers = headers
30
30
self ._host = "https://edith.xiaohongshu.com"
31
+ self ._domain = "https://www.xiaohongshu.com"
31
32
self .IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
32
33
self .IP_ERROR_CODE = 300012
33
34
self .NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
@@ -63,7 +64,7 @@ async def _pre_headers(self, url: str, data=None) -> Dict:
63
64
self .headers .update (headers )
64
65
return self .headers
65
66
66
- async def request (self , method , url , ** kwargs ) -> Dict :
67
+ async def request (self , method , url , ** kwargs ) -> Union [ str , Any ] :
67
68
"""
68
69
封装httpx的公共请求方法,对请求响应做一些处理
69
70
Args:
@@ -82,10 +83,10 @@ async def request(self, method, url, **kwargs) -> Dict:
82
83
method , url , timeout = self .timeout ,
83
84
** kwargs
84
85
)
85
-
86
+
86
87
if return_response :
87
88
return response .text
88
-
89
+
89
90
data : Dict = response .json ()
90
91
if data ["success" ]:
91
92
return data .get ("data" , data .get ("success" , {}))
@@ -186,56 +187,6 @@ async def get_note_by_keyword(
186
187
}
187
188
return await self .post (uri , data )
188
189
189
async def get_creator_info_and_notes(self, creator: str) -> Dict:
    """
    Fetch a creator's profile data together with the first page of their notes.

    The profile page is requested as raw HTML and the JSON assigned to
    ``window.__INITIAL_STATE__`` inside a ``<script>`` tag is parsed.

    Args:
        creator: creator (user) ID, appended to the profile URL.

    Returns:
        ``{}`` when the state script is not found, when the parsed state is
        not a JSON object, or when it has no ``user`` object. Otherwise a dict
        with the keys:
            creator: value of ``user.userPageData``
            notes: first element of ``user.notes`` (``[]`` if there is none)
            cursor: ``cursor`` of the first ``user.noteQueries`` entry
            has_more_notes: ``hasMore`` of the first ``user.noteQueries`` entry

    Raises:
        json.JSONDecodeError: if the embedded state is not parseable as JSON.
    """
    # Build the URL without any stray whitespace around the creator ID.
    url = f"https://www.xiaohongshu.com/user/profile/{creator}"
    content = await self.request(method="GET", url=url, return_response=True)

    # Non-greedy capture: stop at the first closing tag so that further
    # <script> elements on the same line are not swallowed into the JSON text.
    match = re.search(r'<script>window.__INITIAL_STATE__=(.+?)<\/script>', content, re.M)
    if match is None:
        return {}

    # The page serialises JavaScript `undefined`, which is not valid JSON.
    info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
    if not isinstance(info, dict):
        return {}

    user = info.get('user')
    if not isinstance(user, dict):
        return {}

    # Either list may be missing or empty; fall back instead of raising.
    note_pages = user.get('notes') or []
    note_queries = user.get('noteQueries') or []
    first_query = note_queries[0] if note_queries else {}

    return {
        'creator': user.get('userPageData'),
        'notes': note_pages[0] if note_pages else [],
        'cursor': first_query.get('cursor'),
        'has_more_notes': first_query.get('hasMore'),
    }
214
-
215
async def get_notes_by_creator(
        self, creator: str,
        cursor: str,
        page_size: int = 30
) -> Dict:
    """
    Request one page of a creator's posted notes.

    Args:
        creator: creator (user) ID, sent as ``user_id``.
        cursor: pagination cursor (ID of the last note of the previous page).
        page_size: number of notes requested, sent as ``num``.

    Returns:
        Whatever ``self.get`` returns for the ``user_posted`` endpoint.
    """
    query = dict(
        user_id=creator,
        cursor=cursor,
        num=page_size,
        image_formats="jpg,webp,avif",
    )
    posted = await self.get("/api/sns/web/v1/user_posted", query)
    return posted
238
-
239
190
async def get_note_by_id (self , note_id : str ) -> Dict :
240
191
"""
241
192
获取笔记详情API
@@ -268,7 +219,7 @@ async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
268
219
params = {
269
220
"note_id" : note_id ,
270
221
"cursor" : cursor ,
271
- "top_comment_id" :"" ,
222
+ "top_comment_id" : "" ,
272
223
"image_formats" : "jpg,webp,avif"
273
224
}
274
225
return await self .get (uri , params )
@@ -323,3 +274,76 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
323
274
await asyncio .sleep (crawl_interval )
324
275
result .extend (comments )
325
276
return result
277
+
278
async def get_creator_info(self, user_id: str) -> Dict:
    """
    Get a user's brief profile information by parsing the web profile page.

    The PC profile page stores its state in the ``window.__INITIAL_STATE__``
    variable inside a ``<script>`` tag; that JSON is extracted and parsed.
    eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217

    Args:
        user_id: user ID, appended to the profile URL.

    Returns:
        The ``user.userPageData`` object of the embedded state, or ``{}`` when
        the state script is not found, the parsed state is not a JSON object,
        or ``user`` / ``userPageData`` is missing.

    Raises:
        json.JSONDecodeError: if the embedded state is not parseable as JSON.
    """
    # Build the path without any stray whitespace around the user ID.
    uri = f"/user/profile/{user_id}"
    html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)

    # Non-greedy capture: stop at the first closing tag so that further
    # <script> elements on the same line are not swallowed into the JSON text.
    match = re.search(r'<script>window.__INITIAL_STATE__=(.+?)<\/script>', html_content, re.M)
    if match is None:
        return {}

    # The page serialises JavaScript `undefined`, which is not valid JSON.
    info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
    if not isinstance(info, dict):
        return {}

    # Guard each level so a page without user data yields {} rather than an
    # AttributeError on None or a None return value.
    user = info.get('user')
    if not isinstance(user, dict):
        return {}
    return user.get('userPageData') or {}
295
+
296
async def get_notes_by_creator(
        self, creator: str,
        cursor: str,
        page_size: int = 30
) -> Dict:
    """
    Fetch a single page of notes posted by a creator.

    Args:
        creator: creator (user) ID, sent as ``user_id``.
        cursor: ID of the last note on the previous page.
        page_size: page length, sent as ``num``.

    Returns:
        The result of ``self.get`` for the ``user_posted`` endpoint.
    """
    return await self.get(
        "/api/sns/web/v1/user_posted",
        {
            "user_id": creator,
            "cursor": cursor,
            "num": page_size,
            "image_formats": "jpg,webp,avif",
        },
    )
319
+
320
async def get_all_notes_by_creator(self, user_id: str, crawl_interval: float = 1.0,
                                   callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect every note posted by a user, paging until no more are reported.

    Args:
        user_id: user ID whose notes are fetched.
        crawl_interval: delay in seconds slept after each fetched page.
        callback: optional awaitable callable, awaited with each page's notes.

    Returns:
        All notes gathered across the fetched pages, in fetch order.
    """
    collected = []
    next_cursor = ""
    keep_going = True
    while keep_going:
        page = await self.get_notes_by_creator(user_id, next_cursor)
        # Pagination state is read before the page content is inspected.
        keep_going = page.get("has_more", False)
        next_cursor = page.get("cursor", "")

        if "notes" not in page:
            notes_res = page
            utils.logger.info(f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: { notes_res } ")
            break

        notes = page["notes"]
        utils.logger.info(f"[XHSClient.get_all_notes_by_creator] got user_id:{ user_id } notes len : { len (notes )} ")
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        collected.extend(notes)
    return collected
0 commit comments