-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata1.py
More file actions
141 lines (113 loc) · 4.33 KB
/
data1.py
File metadata and controls
141 lines (113 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import hashlib
import html
import os
import time
from urllib.parse import urldefrag, urljoin, urlparse, unquote

import requests
from bs4 import BeautifulSoup
from xhtml2pdf import pisa
# ========== Configuration ==========
# Root page of the site to crawl; only links on this host are followed.
TARGET_URL = "https://webpath.med.utah.edu/TUTORIAL/TUTORIAL.html"
DELAY = 0.5 # delay between requests, in seconds (politeness throttle)
OUTPUT_DIR = "output"  # where the numbered PDFs are written
IMAGES_DIR = "images1"  # local cache directory for downloaded images
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)
# ===========================
visited_urls = set()  # URLs already crawled — guards against revisits/loops
pdf_counter = 0  # running index used to name output PDFs 1.pdf, 2.pdf, ...
# 下载图片
def download_image(img_url):
    """Download *img_url* into IMAGES_DIR and return the local file path.

    Returns the cached path immediately if the file already exists on
    disk. Returns None on any failure — network and filesystem errors
    are printed and swallowed so a single bad image never aborts the
    crawl.
    """
    try:
        filename = os.path.basename(unquote(urlparse(img_url).path))
        if not filename:
            # Stable fallback name: md5 of the URL is reproducible across
            # runs, unlike the builtin hash(), which is randomized per
            # process by PYTHONHASHSEED and would defeat the on-disk cache.
            filename = f"img_{hashlib.md5(img_url.encode('utf-8')).hexdigest()}.jpg"
        local_path = os.path.join(IMAGES_DIR, filename)
        if os.path.exists(local_path):
            return local_path  # already downloaded — reuse the cache
        headers = {"User-Agent": "Mozilla/5.0"}
        # Context manager releases the connection even on error; the old
        # code passed stream=True but read .content anyway and never closed.
        with requests.get(img_url, headers=headers, timeout=10) as res:
            if res.status_code == 200:
                with open(local_path, "wb") as f:
                    f.write(res.content)
                print(f"📥 保存图片:{local_path}")
                return local_path
    except Exception as e:
        print(f"图片错误: {e}")
    return None
# 提取文字+图片
def extract_page_data(url):
    """Fetch *url* and extract its title, visible text, and same-site images.

    Returns a dict {"title": str, "text": str, "images1": [local paths]},
    or None when the page cannot be fetched or parsed (best-effort: the
    error is printed and the crawl continues). Images hosted on a host
    other than TARGET_URL's are ignored; same-site images are fetched
    via download_image().
    """
    print(f"\n========== 正在爬取:{url} ==========")
    data = {"title": "", "text": "", "images1": []}
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, headers=headers, timeout=10)
        # Fail 4xx/5xx here so error pages are not rendered into PDFs;
        # the surrounding except turns that into a skipped page.
        res.raise_for_status()
        # Respect the server-declared charset; only guess when none is
        # given. (The old code forced utf-8, garbling legacy-encoded pages.)
        if not res.encoding:
            res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, "html.parser")
        # Title
        title_tag = soup.find("title")
        data["title"] = title_tag.get_text(strip=True) if title_tag else "未命名页面"
        # Text. NOTE(review): find_all on nested tags (a <td> inside a
        # <div>) captures the same text more than once — kept as-is so the
        # PDF output matches the established format.
        contents = soup.find_all(["p", "li", "div", "td"])
        text_list = [c.get_text(strip=True) for c in contents if c.get_text(strip=True)]
        data["text"] = "\n\n".join(text_list)
        # Images — only those hosted on the crawled site itself.
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                continue
            img_url = urljoin(url, src)
            if urlparse(img_url).netloc == urlparse(TARGET_URL).netloc:
                local = download_image(img_url)
                if local:
                    data["images1"].append(local)
        return data
    except Exception as e:
        print(f"提取失败: {e}")
        return None
# 生成 PDF
def save_as_pdf(data):
    """Render one crawled page (dict from extract_page_data) to a numbered PDF.

    Increments the global pdf_counter and writes OUTPUT_DIR/<counter>.pdf.
    Pages with no text (or data=None) are skipped and the counter is not
    advanced. Text is HTML-escaped so literal '<', '>' and '&' in page
    content cannot break the generated markup.
    """
    global pdf_counter
    if not data or not data["text"]:
        print("⚠️ 内容为空,跳过 PDF")
        return
    pdf_counter += 1
    pdf_path = os.path.join(OUTPUT_DIR, f"{pdf_counter}.pdf")
    # Build the HTML in memory: the old temp.html on disk leaked if
    # pisa.CreatePDF raised, and could collide between concurrent runs.
    parts = [f"<h1 style='text-align:center;'>{html.escape(data['title'])}</h1>"]
    parts.extend(f"<p>{html.escape(paragraph)}</p>" for paragraph in data["text"].split("\n\n"))
    parts.extend(f"<p><img src='{img}' style='max-width:95%'></p>" for img in data["images1"])
    with open(pdf_path, "wb") as pdf_file:
        pisa.CreatePDF("".join(parts), dest=pdf_file)
    print(f"📄 PDF({pdf_counter}) 已保存:{pdf_path}")
# 主爬虫递归
def crawl_recursive(url):
    """Crawl the site starting at *url*, saving each page as a numbered PDF.

    Despite the name (kept for callers), this walks the site with an
    explicit stack instead of Python recursion, so a large site cannot
    overflow the interpreter's recursion limit. Only links on
    TARGET_URL's host are followed; the global visited_urls set guards
    against revisits and link cycles.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited_urls:
            continue
        visited_urls.add(current)
        data = extract_page_data(current)
        save_as_pdf(data)
        time.sleep(DELAY)  # politeness delay between pages
        # NOTE(review): the page is fetched a second time here (it was
        # already fetched inside extract_page_data) — inherited from the
        # original design; fixing it would change extract's interface.
        try:
            res = requests.get(current, headers=headers, timeout=10)
            soup = BeautifulSoup(res.text, "html.parser")
            for a in soup.find_all("a", href=True):
                # Strip the #fragment so in-page anchors still lead to
                # their page (the old code skipped any URL containing '#',
                # missing pages linked only via fragments).
                link, _frag = urldefrag(urljoin(current, a["href"]))
                if urlparse(link).netloc != urlparse(TARGET_URL).netloc:
                    continue
                if link not in visited_urls:
                    stack.append(link)
        except Exception as e:
            print(f"子链接解析失败: {e}")
if __name__ == "__main__":
    # Entry point: crawl the whole tutorial site starting at TARGET_URL
    # and emit sequentially numbered PDFs into OUTPUT_DIR.
    print("🚩 开始递归爬取整个站点并输出连续编号 PDF...")
    crawl_recursive(TARGET_URL)
    print(f"\n🎉 完成!共生成 {pdf_counter} 个 PDF 文件。")