11import  asyncio 
22import  json 
33import  re 
4- from  typing  import  Callable , Dict , List , Optional 
4+ from  typing  import  Callable , Dict , List , Optional ,  Union ,  Any 
55from  urllib .parse  import  urlencode 
66
77import  httpx 
@@ -28,6 +28,7 @@ def __init__(
2828        self .timeout  =  timeout 
2929        self .headers  =  headers 
3030        self ._host  =  "https://edith.xiaohongshu.com" 
31+         self ._domain  =  "https://www.xiaohongshu.com" 
3132        self .IP_ERROR_STR  =  "网络连接异常,请检查网络设置或重启试试" 
3233        self .IP_ERROR_CODE  =  300012 
3334        self .NOTE_ABNORMAL_STR  =  "笔记状态异常,请稍后查看" 
@@ -63,7 +64,7 @@ async def _pre_headers(self, url: str, data=None) -> Dict:
6364        self .headers .update (headers )
6465        return  self .headers 
6566
66-     async  def  request (self , method , url , ** kwargs ) ->  Dict :
67+     async  def  request (self , method , url , ** kwargs ) ->  Union [ str ,  Any ] :
6768        """ 
6869        封装httpx的公共请求方法,对请求响应做一些处理 
6970        Args: 
@@ -82,10 +83,10 @@ async def request(self, method, url, **kwargs) -> Dict:
8283                method , url , timeout = self .timeout ,
8384                ** kwargs 
8485            )
85-          
86+ 
8687        if  return_response :
8788            return  response .text 
88-          
89+ 
8990        data : Dict  =  response .json ()
9091        if  data ["success" ]:
9192            return  data .get ("data" , data .get ("success" , {}))
@@ -186,56 +187,6 @@ async def get_note_by_keyword(
186187        }
187188        return  await  self .post (uri , data )
188189
189-     async  def  get_creator_info_and_notes (self , creator : str ) ->  Dict :
190-         """ 
191-         获取博主的信息和第一页的笔记 
192-         Args: 
193-             creator: 博主ID 
194-         Returns: 
195-             {"creator":{}, "notes":[]} 
196-         """ 
197-         path  =  '/user/profile/' + creator 
198-         content  =  await  self .request (method = "GET" , url = f"https://www.xiaohongshu.com{ path }  , return_response = True )
199-         match  =  re .search (r'<script>window.__INITIAL_STATE__=(.+)<\/script>' , content , re .M )
200- 
201-         if  match  ==  None :
202-             return  {}
203-         
204-         info  =  json .loads (match .group (1 ).replace (':undefined' , ':null' ), strict = False )
205-         if  info  ==  None :
206-             return  {}
207-         
208-         return  {
209-             'creator' : info .get ('user' ).get ('userPageData' ),
210-             'notes' : info .get ('user' ).get ('notes' )[0 ],
211-             'cursor' : info .get ('user' ).get ('noteQueries' )[0 ].get ('cursor' ),
212-             'has_more_notes' : info .get ('user' ).get ('noteQueries' )[0 ].get ('hasMore' )
213-         }
214- 
215-     async  def  get_notes_by_creator (
216-             self , creator : str ,
217-             cursor : str , 
218-             page_size : int  =  30 
219-     ) ->  Dict :
220-         """ 
221-         获取博主的笔记 
222-         Args: 
223-             creator: 博主ID 
224-             cursor: 上一页最后一条笔记的ID 
225-             page_size: 分页数据长度 
226- 
227-         Returns: 
228- 
229-         """ 
230-         uri  =  "/api/sns/web/v1/user_posted" 
231-         data  =  {
232-             "user_id" : creator ,
233-             "cursor" : cursor ,
234-             "num" : page_size ,
235-             "image_formats" : "jpg,webp,avif" 
236-         }
237-         return  await  self .get (uri , data )
238- 
239190    async  def  get_note_by_id (self , note_id : str ) ->  Dict :
240191        """ 
241192        获取笔记详情API 
@@ -268,7 +219,7 @@ async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
268219        params  =  {
269220            "note_id" : note_id ,
270221            "cursor" : cursor ,
271-             "top_comment_id" :"" ,
222+             "top_comment_id" :  "" ,
272223            "image_formats" : "jpg,webp,avif" 
273224        }
274225        return  await  self .get (uri , params )
@@ -323,3 +274,76 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
323274            await  asyncio .sleep (crawl_interval )
324275            result .extend (comments )
325276        return  result 
277+ 
278+     async  def  get_creator_info (self , user_id : str ) ->  Dict :
279+         """ 
280+         通过解析网页版的用户主页HTML,获取用户个人简要信息 
281+         PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可 
282+         eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217 
283+         """ 
284+         uri  =  f"/user/profile/{ user_id }  
285+         html_content  =  await  self .request ("GET" , self ._domain  +  uri , return_response = True , headers = self .headers )
286+         match  =  re .search (r'<script>window.__INITIAL_STATE__=(.+)<\/script>' , html_content , re .M )
287+ 
288+         if  match  is  None :
289+             return  {}
290+ 
291+         info  =  json .loads (match .group (1 ).replace (':undefined' , ':null' ), strict = False )
292+         if  info  is  None :
293+             return  {}
294+         return  info .get ('user' ).get ('userPageData' )
295+ 
296+     async  def  get_notes_by_creator (
297+             self , creator : str ,
298+             cursor : str ,
299+             page_size : int  =  30 
300+     ) ->  Dict :
301+         """ 
302+         获取博主的笔记 
303+         Args: 
304+             creator: 博主ID 
305+             cursor: 上一页最后一条笔记的ID 
306+             page_size: 分页数据长度 
307+ 
308+         Returns: 
309+ 
310+         """ 
311+         uri  =  "/api/sns/web/v1/user_posted" 
312+         data  =  {
313+             "user_id" : creator ,
314+             "cursor" : cursor ,
315+             "num" : page_size ,
316+             "image_formats" : "jpg,webp,avif" 
317+         }
318+         return  await  self .get (uri , data )
319+ 
320+     async  def  get_all_notes_by_creator (self , user_id : str , crawl_interval : float  =  1.0 ,
321+                                        callback : Optional [Callable ] =  None ) ->  List [Dict ]:
322+         """ 
323+         获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 
324+         Args: 
325+             user_id: 用户ID 
326+             crawl_interval: 爬取一次的延迟单位(秒) 
327+             callback: 一次分页爬取结束后的更新回调函数 
328+ 
329+         Returns: 
330+ 
331+         """ 
332+         result  =  []
333+         notes_has_more  =  True 
334+         notes_cursor  =  "" 
335+         while  notes_has_more :
336+             notes_res  =  await  self .get_notes_by_creator (user_id , notes_cursor )
337+             notes_has_more  =  notes_res .get ("has_more" , False )
338+             notes_cursor  =  notes_res .get ("cursor" , "" )
339+             if  "notes"  not  in notes_res :
340+                 utils .logger .info (f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: { notes_res }  )
341+                 break 
342+ 
343+             notes  =  notes_res ["notes" ]
344+             utils .logger .info (f"[XHSClient.get_all_notes_by_creator] got user_id:{ user_id } { len (notes )}  )
345+             if  callback :
346+                 await  callback (notes )
347+             await  asyncio .sleep (crawl_interval )
348+             result .extend (notes )
349+         return  result 
0 commit comments