-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata1.py
More file actions
141 lines (113 loc) · 4.33 KB
/
data1.py
File metadata and controls
141 lines (113 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import hashlib
import html
import os
import time
from urllib.parse import urldefrag, urljoin, urlparse, unquote

import requests
from bs4 import BeautifulSoup
from xhtml2pdf import pisa
# ========== Configuration ==========
# Root page of the site to crawl; only links on this host are followed.
TARGET_URL = "https://webpath.med.utah.edu/TUTORIAL/TUTORIAL.html"
DELAY = 0.5 # delay between requests, in seconds (politeness throttle)
OUTPUT_DIR = "output"  # where the numbered PDFs are written
IMAGES_DIR = "images1"  # local cache directory for downloaded images
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)
# ===========================
visited_urls = set()  # URLs already crawled — guards against revisits/loops
pdf_counter = 0  # running index used to name output PDFs 1.pdf, 2.pdf, ...
# 下载图片
def download_image(img_url):
    """Download *img_url* into IMAGES_DIR and return the local file path.

    Returns the cached path immediately if the file already exists on
    disk. Returns None on any failure — network and filesystem errors
    are printed and swallowed so a single bad image never aborts the
    crawl.
    """
    try:
        filename = os.path.basename(unquote(urlparse(img_url).path))
        if not filename:
            # Stable fallback name: md5 of the URL is reproducible across
            # runs, unlike the builtin hash(), which is randomized per
            # process by PYTHONHASHSEED and would defeat the on-disk cache.
            filename = f"img_{hashlib.md5(img_url.encode('utf-8')).hexdigest()}.jpg"
        local_path = os.path.join(IMAGES_DIR, filename)
        if os.path.exists(local_path):
            return local_path  # already downloaded — reuse the cache
        headers = {"User-Agent": "Mozilla/5.0"}
        # Context manager releases the connection even on error; the old
        # code passed stream=True but read .content anyway and never closed.
        with requests.get(img_url, headers=headers, timeout=10) as res:
            if res.status_code == 200:
                with open(local_path, "wb") as f:
                    f.write(res.content)
                print(f"📥 保存图片:{local_path}")
                return local_path
    except Exception as e:
        print(f"图片错误: {e}")
    return None
# 提取文字+图片
def extract_page_data(url):
    """Fetch *url* and extract its title, visible text, and same-site images.

    Returns a dict {"title": str, "text": str, "images1": [local paths]},
    or None when the page cannot be fetched or parsed (best-effort: the
    error is printed and the crawl continues). Images hosted on a host
    other than TARGET_URL's are ignored; same-site images are fetched
    via download_image().
    """
    print(f"\n========== 正在爬取:{url} ==========")
    data = {"title": "", "text": "", "images1": []}
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, headers=headers, timeout=10)
        # Fail 4xx/5xx here so error pages are not rendered into PDFs;
        # the surrounding except turns that into a skipped page.
        res.raise_for_status()
        # Respect the server-declared charset; only guess when none is
        # given. (The old code forced utf-8, garbling legacy-encoded pages.)
        if not res.encoding:
            res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, "html.parser")
        # Title
        title_tag = soup.find("title")
        data["title"] = title_tag.get_text(strip=True) if title_tag else "未命名页面"
        # Text. NOTE(review): find_all on nested tags (a <td> inside a
        # <div>) captures the same text more than once — kept as-is so the
        # PDF output matches the established format.
        contents = soup.find_all(["p", "li", "div", "td"])
        text_list = [c.get_text(strip=True) for c in contents if c.get_text(strip=True)]
        data["text"] = "\n\n".join(text_list)
        # Images — only those hosted on the crawled site itself.
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                continue
            img_url = urljoin(url, src)
            if urlparse(img_url).netloc == urlparse(TARGET_URL).netloc:
                local = download_image(img_url)
                if local:
                    data["images1"].append(local)
        return data
    except Exception as e:
        print(f"提取失败: {e}")
        return None
# 生成 PDF
def save_as_pdf(data):
    """Render one crawled page (dict from extract_page_data) to a numbered PDF.

    Increments the global pdf_counter and writes OUTPUT_DIR/<counter>.pdf.
    Pages with no text (or data=None) are skipped and the counter is not
    advanced. Text is HTML-escaped so literal '<', '>' and '&' in page
    content cannot break the generated markup.
    """
    global pdf_counter
    if not data or not data["text"]:
        print("⚠️ 内容为空,跳过 PDF")
        return
    pdf_counter += 1
    pdf_path = os.path.join(OUTPUT_DIR, f"{pdf_counter}.pdf")
    # Build the HTML in memory: the old temp.html on disk leaked if
    # pisa.CreatePDF raised, and could collide between concurrent runs.
    parts = [f"<h1 style='text-align:center;'>{html.escape(data['title'])}</h1>"]
    parts.extend(f"<p>{html.escape(paragraph)}</p>" for paragraph in data["text"].split("\n\n"))
    parts.extend(f"<p><img src='{img}' style='max-width:95%'></p>" for img in data["images1"])
    with open(pdf_path, "wb") as pdf_file:
        pisa.CreatePDF("".join(parts), dest=pdf_file)
    print(f"📄 PDF({pdf_counter}) 已保存:{pdf_path}")
# 主爬虫递归
def crawl_recursive(url):
    """Crawl the site starting at *url*, saving each page as a numbered PDF.

    Despite the name (kept for callers), this walks the site with an
    explicit stack instead of Python recursion, so a large site cannot
    overflow the interpreter's recursion limit. Only links on
    TARGET_URL's host are followed; the global visited_urls set guards
    against revisits and link cycles.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited_urls:
            continue
        visited_urls.add(current)
        data = extract_page_data(current)
        save_as_pdf(data)
        time.sleep(DELAY)  # politeness delay between pages
        # NOTE(review): the page is fetched a second time here (it was
        # already fetched inside extract_page_data) — inherited from the
        # original design; fixing it would change extract's interface.
        try:
            res = requests.get(current, headers=headers, timeout=10)
            soup = BeautifulSoup(res.text, "html.parser")
            for a in soup.find_all("a", href=True):
                # Strip the #fragment so in-page anchors still lead to
                # their page (the old code skipped any URL containing '#',
                # missing pages linked only via fragments).
                link, _frag = urldefrag(urljoin(current, a["href"]))
                if urlparse(link).netloc != urlparse(TARGET_URL).netloc:
                    continue
                if link not in visited_urls:
                    stack.append(link)
        except Exception as e:
            print(f"子链接解析失败: {e}")
if __name__ == "__main__":
    # Entry point: crawl the whole tutorial site starting at TARGET_URL
    # and emit sequentially numbered PDFs into OUTPUT_DIR.
    print("🚩 开始递归爬取整个站点并输出连续编号 PDF...")
    crawl_recursive(TARGET_URL)
    print(f"\n🎉 完成!共生成 {pdf_counter} 个 PDF 文件。")