import arxiv
import os
import json
import time
from tqdm import tqdm
import logging
from datetime import datetime

def process_metadata(result):
    """
    Convert an arxiv.Result object into a structured dictionary.

    Args:
        result: an arxiv.Result object

    Returns:
        dict: structured metadata dictionary
    """
    metadata = {
        "entry_id": result.entry_id,
        "updated": str(result.updated),
        "published": str(result.published),
        "title": result.title,
        "authors": [author.name for author in result.authors],
        "summary": result.summary,
        "comment": str(result.comment),
        "journal_ref": str(result.journal_ref),
        "doi": str(result.doi),
        "primary_category": result.primary_category,
        "categories": result.categories,
        "links": [{"title": link.title, "href": link.href, "rel": link.rel} for link in result.links],
        "pdf_url": result.pdf_url,
        "download_time": datetime.now().isoformat()
    }

    return metadata


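# A hedged helper sketch (an addition, not part of the original script): read
# one saved per-paper JSON back into a dict, illustrating the round-trip of
# the process_metadata output above. It assumes the file was written by
# download_arxiv_papers below with UTF-8 encoding.
def load_paper_metadata(metadata_path):
    """Load a single paper-metadata JSON file produced by this script."""
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

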
def download_arxiv_papers(topic, max_papers=200, save_dir="papers", sleep_interval=2):
    """
    Download arXiv papers for a given topic and save structured metadata.

    Args:
        topic (str): the topic/query to search for
        max_papers (int): maximum number of papers to download
        save_dir (str): base directory to save papers into
        sleep_interval (float): pause between downloads (avoids API rate limits)

    Returns:
        int: number of papers downloaded successfully
    """
    # Ensure the base directory exists before the log file handler opens a file in it
    os.makedirs(save_dir, exist_ok=True)

    # Configure logging; force=True (Python 3.8+) replaces any existing handlers,
    # so each topic gets its own log file when this is called repeatedly
    topic_safe = topic.replace(' ', '_').replace('/', '_').replace('\\', '_')
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f"{save_dir}/{topic_safe}_download.log"),
            logging.StreamHandler()
        ],
        force=True
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Starting download for topic: {topic}")

    # Create the folder structure
    topic_dir = os.path.join(save_dir, f"topic_{topic_safe}")
    pdfs_dir = os.path.join(topic_dir, "pdfs")
    metadata_dir = os.path.join(topic_dir, "metadata")

    os.makedirs(pdfs_dir, exist_ok=True)
    os.makedirs(metadata_dir, exist_ok=True)

    logger.info(f"Created directories: {pdfs_dir} and {metadata_dir}")

    # One aggregate metadata file collecting every downloaded paper's record
    all_metadata_file = os.path.join(topic_dir, f"{topic_safe}_all_metadata.json")
    all_metadata = []

    # Configure the search
    search = arxiv.Search(
        query=topic,
        max_results=max_papers,
        sort_by=arxiv.SortCriterion.Relevance
    )

    client = arxiv.Client()

    # Initialize counters
    successful_downloads = 0
    failed_downloads = 0

    # Download the papers
    try:
        results = list(client.results(search))
        total_results = len(results)
        logger.info(f"Found {total_results} papers for topic '{topic}'")

        for i, result in enumerate(tqdm(results, desc=f"Downloading papers for '{topic}'")):
            # Get the paper ID outside the try block so the error handler can use it
            paper_id = result.get_short_id()
            try:
                # Build file names from the paper ID
                pdf_filename = f"{paper_id}.pdf"
                metadata_filename = f"{paper_id}.json"

                # Process the metadata
                metadata = process_metadata(result)
                metadata_path = os.path.join(metadata_dir, metadata_filename)

                # Save this paper's metadata
                with open(metadata_path, 'w', encoding='utf-8') as f:
                    json.dump(metadata, f, ensure_ascii=False, indent=2)

                # Append to the aggregate metadata
                all_metadata.append(metadata)

                # Rewrite the aggregate metadata file every 10 papers and at the end
                if (i + 1) % 10 == 0 or (i + 1) == total_results:
                    with open(all_metadata_file, 'w', encoding='utf-8') as f:
                        json.dump(all_metadata, f, ensure_ascii=False, indent=2)

                # Download the PDF
                result.download_pdf(dirpath=pdfs_dir, filename=pdf_filename)
                successful_downloads += 1

                # Sleep to avoid rate limiting
                time.sleep(sleep_interval)

            except Exception as e:
                logger.error(f"Error downloading paper {paper_id}: {str(e)}")
                failed_downloads += 1

            # Log progress every 10 papers
            if (i + 1) % 10 == 0:
                logger.info(f"Progress: {i + 1}/{total_results} papers processed")
                time.sleep(0.5)
    except Exception as e:
        logger.error(f"Error during search or download: {str(e)}")

    # Log final statistics
    logger.info(f"Finished downloading for topic '{topic}'")
    logger.info(f"Successful downloads: {successful_downloads} papers")
    logger.info(f"Failed downloads: {failed_downloads} papers")

    return successful_downloads


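# A hedged retry sketch (an assumption, not used by the pipeline above): on
# flaky networks, exponential backoff around result.download_pdf can replace
# the single fixed sleep. The call signature matches the one used in
# download_arxiv_papers.
def download_pdf_with_retries(result, dirpath, filename, retries=3, base_wait=5):
    """Try result.download_pdf up to `retries` times, doubling the wait each attempt."""
    for attempt in range(retries):
        try:
            result.download_pdf(dirpath=dirpath, filename=filename)
            return
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(base_wait * (2 ** attempt))

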
def batch_download_topics(topics_list, max_papers_per_topic=200, base_dir="papers"):
    """
    Batch-download papers for multiple topics.

    Args:
        topics_list (list): list of topics
        max_papers_per_topic (int): maximum number of papers to download per topic
        base_dir (str): base save directory

    Returns:
        dict: download statistics for each topic
    """
    os.makedirs(base_dir, exist_ok=True)

    results = {}
    total_start_time = time.time()

    for i, topic in enumerate(topics_list):
        print(f"\n[{i + 1}/{len(topics_list)}] Starting download for topic: {topic}")

        topic_start_time = time.time()
        papers_downloaded = download_arxiv_papers(
            topic=topic,
            max_papers=max_papers_per_topic,
            save_dir=base_dir,
            sleep_interval=3  # a slightly longer pause for batch downloads
        )

        topic_elapsed_time = time.time() - topic_start_time

        results[topic] = {
            "papers_downloaded": papers_downloaded,
            "elapsed_time": f"{topic_elapsed_time:.2f}s"
        }

        print(f"Topic '{topic}' done: {papers_downloaded} papers downloaded in {topic_elapsed_time:.2f}s")

        # Extra rest between topics to ease the load on the API
        if i < len(topics_list) - 1:
            rest_time = 10
            print(f"Resting {rest_time}s before the next topic...")
            time.sleep(rest_time)

    total_elapsed_time = time.time() - total_start_time
    print(f"\nBatch download finished! Total time: {total_elapsed_time:.2f}s")

    # Save a summary of the batch download
    summary_file = os.path.join(base_dir, "batch_download_summary.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        summary = {
            "total_topics": len(topics_list),
            "total_time": f"{total_elapsed_time:.2f}s",
            "completed_at": datetime.now().isoformat(),
            "topics_results": results
        }
        json.dump(summary, f, ensure_ascii=False, indent=2)

    return results


# Usage examples:
if __name__ == "__main__":
    # Download a single topic
    # download_arxiv_papers("Reasoning Large Language Models", max_papers=200)

    # Batch-download multiple topics
    topics = [
        "Reasoning Large Language Models",
        # "LLM Post-Training",
        # "Chain of Thought",
    ]

    batch_download_topics(topics, max_papers_per_topic=200)
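
    # A hedged sketch of consuming the output afterwards (it assumes the run
    # above produced the aggregate file for the first topic): load the
    # per-topic metadata back and report what was collected.
    topic_safe = topics[0].replace(' ', '_').replace('/', '_').replace('\\', '_')
    aggregate_file = os.path.join("papers", f"topic_{topic_safe}", f"{topic_safe}_all_metadata.json")
    if os.path.exists(aggregate_file):
        with open(aggregate_file, 'r', encoding='utf-8') as f:
            papers = json.load(f)
        print(f"Loaded {len(papers)} metadata records; first title: {papers[0]['title']!r}")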