16
16
17
17
from .client import XHSClient
18
18
from .exception import DataFetchError
19
+ from .field import SearchSortType
19
20
from .login import XHSLogin
20
21
21
22
@@ -84,6 +85,9 @@ async def start(self) -> None:
84
85
elif self .crawler_type == "detail" :
85
86
# Get the information and comments of the specified post
86
87
await self .get_specified_notes ()
88
+ elif self .crawler_type == "creator" :
89
+ # Get creator's information and their notes and comments
90
+ await self .get_creators_and_notes ()
87
91
else :
88
92
pass
89
93
@@ -101,6 +105,7 @@ async def search(self) -> None:
101
105
notes_res = await self .xhs_client .get_note_by_keyword (
102
106
keyword = keyword ,
103
107
page = page ,
108
+ sort = SearchSortType (config .SORT_TYPE ) if config .SORT_TYPE != '' else SearchSortType .GENERAL ,
104
109
)
105
110
utils .logger .info (f"[XiaoHongShuCrawler.search] Search notes res:{ notes_res } " )
106
111
semaphore = asyncio .Semaphore (config .MAX_CONCURRENCY_NUM )
@@ -117,6 +122,67 @@ async def search(self) -> None:
117
122
page += 1
118
123
utils .logger .info (f"[XiaoHongShuCrawler.search] Note details: { note_details } " )
119
124
await self .batch_get_note_comments (note_id_list )
125
+
126
async def get_creators_and_notes(self) -> None:
    """Crawl each configured creator's profile, notes, and note comments.

    For every creator ID in ``config.XHS_CREATOR_ID_LIST``:

    * the first request (``get_creator_info_and_notes``) returns the creator
      profile together with the first batch of notes; the profile is stored
      via ``xhs_store.save_creator``;
    * later requests (``get_notes_by_creator``) page through further notes
      using the ``cursor`` value taken from the previous response;
    * for each batch, note details are fetched concurrently, stored via
      ``xhs_store.update_xhs_note``, and their comments are fetched with
      ``batch_get_note_comments``.

    Paging for a creator stops when the response reports no more notes, when
    ``page * 30`` exceeds ``config.CRAWLER_MAX_NOTES_COUNT``, or when a
    request returns nothing usable.
    """
    utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
    xhs_limit_count = 30  # page size used to bound the number of requests
    for creator in config.XHS_CREATOR_ID_LIST:
        utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Current creator: {creator} ")
        page = 0
        cursor = ''
        has_more_notes = False
        while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            note_id_list: List[str] = []

            if page == 0:
                # First request: creator profile plus the first batch of notes.
                creator_and_notes_info = await self.xhs_client.get_creator_info_and_notes(creator)

                if not creator_and_notes_info:
                    utils.logger.error("[XiaoHongShuCrawler.get_creators_and_notes] get creator notes error")
                    # Neither `page` nor `cursor` has advanced, so retrying
                    # here would repeat the same request forever; give up on
                    # this creator and move on to the next one.
                    break

                notes_res = creator_and_notes_info.get('notes')
                cursor = creator_and_notes_info.get('cursor')
                has_more_notes = creator_and_notes_info.get('has_more_notes')

                # save creator info
                creator_info = creator_and_notes_info.get('creator')
                await xhs_store.save_creator(creator, creator_info)
                utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_info} ")
            else:
                # Follow-up request: next batch of notes after `cursor`.
                notes = await self.xhs_client.get_notes_by_creator(creator, cursor)

                if not notes:
                    utils.logger.error("[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes error")
                    # Same reasoning as above: the cursor is unchanged, so a
                    # retry would loop indefinitely on the same failing page.
                    break

                cursor = notes.get('cursor')
                has_more_notes = notes.get('has_more_notes')
                notes_res = notes.get('notes')
                utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res} ")

            # Fetch all note details of this batch concurrently, bounded by
            # the configured concurrency limit. A missing 'notes' entry is
            # treated as an empty batch instead of raising on iteration.
            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
            task_list = [
                self.get_note_detail(post_item.get('id'), semaphore)
                for post_item in (notes_res or [])
            ]
            note_details = await asyncio.gather(*task_list)
            for note_detail in note_details:
                if note_detail is not None:
                    await xhs_store.update_xhs_note(note_detail)
                    note_id_list.append(note_detail.get('note_id'))
            page += 1

            utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Note details: {note_details} ")
            await self.batch_get_note_comments(note_id_list)

            if not has_more_notes:
                break
120
186
121
187
async def get_specified_notes (self ):
122
188
"""Get the information and comments of the specified post"""
0 commit comments