11import asyncio
22import json
33import re
4- from typing import Callable , Dict , List , Optional
4+ from typing import Callable , Dict , List , Optional , Union , Any
55from urllib .parse import urlencode
66
77import httpx
@@ -28,6 +28,7 @@ def __init__(
2828 self .timeout = timeout
2929 self .headers = headers
3030 self ._host = "https://edith.xiaohongshu.com"
31+ self ._domain = "https://www.xiaohongshu.com"
3132 self .IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
3233 self .IP_ERROR_CODE = 300012
3334 self .NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
@@ -63,7 +64,7 @@ async def _pre_headers(self, url: str, data=None) -> Dict:
6364 self .headers .update (headers )
6465 return self .headers
6566
66- async def request (self , method , url , ** kwargs ) -> Dict :
67+ async def request (self , method , url , ** kwargs ) -> Union [ str , Any ] :
6768 """
6869 封装httpx的公共请求方法,对请求响应做一些处理
6970 Args:
@@ -82,10 +83,10 @@ async def request(self, method, url, **kwargs) -> Dict:
8283 method , url , timeout = self .timeout ,
8384 ** kwargs
8485 )
85-
86+
8687 if return_response :
8788 return response .text
88-
89+
8990 data : Dict = response .json ()
9091 if data ["success" ]:
9192 return data .get ("data" , data .get ("success" , {}))
@@ -186,56 +187,6 @@ async def get_note_by_keyword(
186187 }
187188 return await self .post (uri , data )
188189
189- async def get_creator_info_and_notes (self , creator : str ) -> Dict :
190- """
191- 获取博主的信息和第一页的笔记
192- Args:
193- creator: 博主ID
194- Returns:
195- {"creator":{}, "notes":[]}
196- """
197- path = '/user/profile/' + creator
198- content = await self .request (method = "GET" , url = f"https://www.xiaohongshu.com{ path } " , return_response = True )
199- match = re .search (r'<script>window.__INITIAL_STATE__=(.+)<\/script>' , content , re .M )
200-
201- if match == None :
202- return {}
203-
204- info = json .loads (match .group (1 ).replace (':undefined' , ':null' ), strict = False )
205- if info == None :
206- return {}
207-
208- return {
209- 'creator' : info .get ('user' ).get ('userPageData' ),
210- 'notes' : info .get ('user' ).get ('notes' )[0 ],
211- 'cursor' : info .get ('user' ).get ('noteQueries' )[0 ].get ('cursor' ),
212- 'has_more_notes' : info .get ('user' ).get ('noteQueries' )[0 ].get ('hasMore' )
213- }
214-
215- async def get_notes_by_creator (
216- self , creator : str ,
217- cursor : str ,
218- page_size : int = 30
219- ) -> Dict :
220- """
221- 获取博主的笔记
222- Args:
223- creator: 博主ID
224- cursor: 上一页最后一条笔记的ID
225- page_size: 分页数据长度
226-
227- Returns:
228-
229- """
230- uri = "/api/sns/web/v1/user_posted"
231- data = {
232- "user_id" : creator ,
233- "cursor" : cursor ,
234- "num" : page_size ,
235- "image_formats" : "jpg,webp,avif"
236- }
237- return await self .get (uri , data )
238-
239190 async def get_note_by_id (self , note_id : str ) -> Dict :
240191 """
241192 获取笔记详情API
@@ -268,7 +219,7 @@ async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
268219 params = {
269220 "note_id" : note_id ,
270221 "cursor" : cursor ,
271- "top_comment_id" :"" ,
222+ "top_comment_id" : "" ,
272223 "image_formats" : "jpg,webp,avif"
273224 }
274225 return await self .get (uri , params )
@@ -323,3 +274,76 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
323274 await asyncio .sleep (crawl_interval )
324275 result .extend (comments )
325276 return result
277+
async def get_creator_info(self, user_id: str) -> Dict:
    """
    Fetch a creator's brief profile by parsing the web profile page HTML.

    The PC profile page stores its state in a ``window.__INITIAL_STATE__``
    script assignment; this method extracts that JSON and returns its
    ``user.userPageData`` entry.
    eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217

    Args:
        user_id: ID of the user whose profile page is requested.

    Returns:
        The ``userPageData`` value, or ``{}`` when the page has no state
        script or the state carries no ``user.userPageData`` entry.

    Raises:
        json.JSONDecodeError: If the embedded state is still not valid JSON
            after the ``:undefined`` -> ``:null`` substitution.
    """
    uri = f"/user/profile/{user_id}"
    html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
    if not isinstance(html_content, str):
        return {}

    # Non-greedy capture: stop at the FIRST closing tag after the state
    # assignment. A greedy match would run to the last </script> on the line
    # and swallow any following script tags, producing invalid JSON.
    # DOTALL lets the payload span several lines.
    match = re.search(r"<script>window\.__INITIAL_STATE__=(.+?)</script>", html_content, re.S)
    if match is None:
        return {}

    # The payload is a JavaScript object literal; bare `undefined` values are
    # not JSON, so they are rewritten to null before decoding.
    info = json.loads(match.group(1).replace(':undefined', ':null'), strict=False)
    if not isinstance(info, dict):
        return {}

    user_state = info.get('user')
    if not isinstance(user_state, dict):
        return {}

    page_data = user_state.get('userPageData')
    return {} if page_data is None else page_data
295+
async def get_notes_by_creator(
        self, creator: str,
        cursor: str,
        page_size: int = 30
) -> Dict:
    """
    Fetch one page of a creator's notes.

    Args:
        creator: Creator (user) ID.
        cursor: ID of the last note on the previous page.
        page_size: Number of notes requested per page.

    Returns:
        Whatever ``self.get`` returns for the ``user_posted`` endpoint.
    """
    query = dict(
        user_id=creator,
        cursor=cursor,
        num=page_size,
        image_formats="jpg,webp,avif",
    )
    return await self.get("/api/sns/web/v1/user_posted", query)
319+
async def get_all_notes_by_creator(self, user_id: str, crawl_interval: float = 1.0,
                                   callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect every note posted by a user, paging until the API reports no more.

    Args:
        user_id: User ID whose notes are fetched.
        crawl_interval: Delay in seconds between two page requests.
        callback: Optional awaitable callable invoked with each page's notes
            once that page has been fetched.

    Returns:
        All notes gathered across the fetched pages, in fetch order.
    """
    result: List[Dict] = []
    notes_cursor = ""
    while True:
        notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
        if not isinstance(notes_res, dict) or "notes" not in notes_res:
            utils.logger.info(f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
            break

        notes = notes_res["notes"]
        utils.logger.info(f"[XHSClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
        if callback:
            await callback(notes)
        result.extend(notes)

        if not notes_res.get("has_more", False):
            break

        next_cursor = notes_res.get("cursor", "")
        # Guard against an endless loop: if the server claims there is more
        # but hands back no cursor (or the same one), the next request would
        # just refetch the page we already have.
        if not next_cursor or next_cursor == notes_cursor:
            utils.logger.info(
                f"[XHSClient.get_all_notes_by_creator] cursor did not advance for user_id:{user_id}, stop paging")
            break
        notes_cursor = next_cursor

        # Throttle only between requests; no point waiting after the last page.
        await asyncio.sleep(crawl_interval)
    return result
0 commit comments