#!/usr/bin/python3

"""
Crawls ArduPilot documentation websites and outputs the URLs of their pages.

SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <[email protected]>

SPDX-License-Identifier: GPL-3.0-or-later
"""

import logging
import time
from os import environ as os_environ
from typing import Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, Tag

# Define the URL where to start crawling
URL = "https://ardupilot.org/ardupilot/"
USERNAME = "your_username"  # Replace with actual username if needed
PASSWORD = ""  # Replace with actual password if needed

# Crawl and output only URLs that contain one of these domains:
ALLOWED_DOMAINS = (
    "ardupilot.org/ardupilot/",
    "ardupilot.org/copter/",
    "ardupilot.org/plane/",
    "ardupilot.org/rover/",
    "www.ardusub.com/",
    "ardupilot.org/blimp/",
    "ardupilot.org/antennatracker/",
    "ardupilot.org/planner/",
    "ardupilot.org/mavproxy/",
    "ardupilot.org/dev/",
    "ardupilot.github.io/MethodicConfigurator/",
    "mavlink.io/en/",
    "docs.cubepilot.org/",
    "doc.cuav.net/",
    "docs.holybro.com/",
)

URL_BLACKLIST = [
    "https://mavlink.io/en/messages/ASLUAV.html",
    "https://mavlink.io/en/messages/AVSSUAS.html",
    "https://mavlink.io/en/messages/all.html",
    "https://mavlink.io/en/messages/csAirLink.html",
    "https://mavlink.io/en/messages/dialects.html",
    "https://mavlink.io/en/messages/icarous.html",
    "https://mavlink.io/en/messages/matrixpilot.html",
    "https://mavlink.io/en/messages/paparazzi.html",
    "https://mavlink.io/en/messages/python_array_test.html",
    "https://mavlink.io/en/messages/test.html",
    "https://mavlink.io/en/messages/uAvionix.html",
    "https://mavlink.io/en/messages/ualberta.html",
]

# URLs containing any of these substrings are skipped
URL_BLACKLIST_PREFIXES = ["https://docs.cubepilot.org/user-guides/~/changes/", "zh-hans", "doc.cuav.net/tutorial"]


# pylint: disable=duplicate-code
def get_env_proxies() -> Union[dict[str, str], None]:
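    """Return the proxy configuration found in the OS environment variables, or None if none is set."""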
    proxies_env = {
        "http": os_environ.get("HTTP_PROXY") or os_environ.get("http_proxy"),
        "https": os_environ.get("HTTPS_PROXY") or os_environ.get("https_proxy"),
        "no_proxy": os_environ.get("NO_PROXY") or os_environ.get("no_proxy"),
    }
    # Remove None values
    proxies_dict: dict[str, str] = {k: v for k, v in proxies_env.items() if v is not None}
    # define as None if no proxies are defined in the OS environment variables
    proxies = proxies_dict if proxies_dict else None
    if proxies:
        logging.info("Proxies: %s", proxies)
    else:
        logging.debug("Proxies: %s", proxies)
    return proxies


# pylint: enable=duplicate-code


def remove_duplicates(visited_urls: set[str]) -> set[str]:
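    """
    Drop URLs that point to the same page under a different address.

    Keeps the http:// variant over https://, the shorter URL over the one ending in
    "/index.html" or "/", and keeps "common-*.html" wiki pages only under
    http://ardupilot.org/copter/docs/.
    """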
    # if the URL is https:// and the same URL exists as http://, remove the https:// URL from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.startswith("https://") and f"http://{url[8:]}" in visited_urls:
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if a visited URL ends in "/index.html" and the same URL without "/index.html" or without "index.html"
    # is in the list, remove the one with "/index.html" from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.endswith("/index.html") and (url[:-11] in visited_urls or url[:-10] in visited_urls):
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if a visited URL ends in "/" and the same URL without "/" is in the list, remove the one with "/" from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.endswith("/") and url[:-1] in visited_urls:
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if a visited URL ends in "common-*.html" and the same file also exists under the base URL
    # http://ardupilot.org/copter/docs/, remove it from the list
    urls_to_remove = set()
    for url in visited_urls:
        if "/common-" in url and url.endswith(".html"):
            filename = url.split("/")[-1]
            copter_url = f"http://ardupilot.org/copter/docs/{filename}"
            if copter_url in visited_urls and url != copter_url:
                urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    return visited_urls


def find_all_links(soup: BeautifulSoup, current_url: str, visited_urls: set[str], urls_to_visit: set[str]) -> set[str]:
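    """Collect the links found in soup, strip anchors and query strings, and queue allowed, not-yet-visited URLs."""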
    for link in soup.find_all("a", href=True):
        if not isinstance(link, Tag):
            continue

        href = link.attrs.get("href")

        if not isinstance(href, str):
            continue

        full_url = urljoin(current_url, href)

        if not isinstance(full_url, str):
            continue

        # Remove anchor from URL
        parsed_url = urlparse(full_url)

        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

        # Skip image files and links into "_images" directories
        if parsed_url.path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".bmp", "_images")):
            continue

        # Queue the URL if it belongs to an allowed domain, has not been seen yet and is not blacklisted
        if (
            any(domain in clean_url for domain in ALLOWED_DOMAINS)
            and clean_url not in visited_urls
            and clean_url not in urls_to_visit
            and all(domain not in clean_url for domain in URL_BLACKLIST_PREFIXES)
        ):
            urls_to_visit.add(clean_url)

    return urls_to_visit


def main() -> None:
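    """Crawl the ArduPilot documentation websites starting at URL and write the collected page URLs to files."""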
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    start_time = time.time()

    visited_urls = set()
    broken_urls = set()
    urls_to_visit = {URL}
    proxies = get_env_proxies()

    # Set up authentication if credentials provided
    auth = (USERNAME, PASSWORD) if USERNAME and PASSWORD else None

    session = requests.Session()
    if auth:
        session.auth = auth
    if proxies:
        session.proxies = proxies

    while urls_to_visit:
        current_url = urls_to_visit.pop()

        if current_url in visited_urls or current_url in broken_urls:
            continue

        try:
            logging.info("Crawling: %s", current_url)
            response = session.get(current_url, timeout=30)
            response.raise_for_status()

            visited_urls.add(current_url)

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")

            urls_to_visit = find_all_links(soup, current_url, visited_urls, urls_to_visit)

        except (requests.RequestException, requests.Timeout) as e:
            logging.error("Network error crawling %s: %s", current_url, str(e))
            broken_urls.add(current_url)
            if current_url in visited_urls:
                visited_urls.remove(current_url)
        except (KeyError, ValueError) as e:
            logging.error("URL processing error for %s: %s", current_url, str(e))
            broken_urls.add(current_url)
            if current_url in visited_urls:
                visited_urls.remove(current_url)

    output_urls(visited_urls, broken_urls, start_time)


def output_urls(visited_urls: set[str], broken_urls: set[str], start_time: float) -> None:
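    """Write the raw, de-duplicated and broken URL lists to text files and log crawl statistics."""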
    # Write all crawled URLs (before de-duplication) to file
    raw_pages = len(visited_urls)
    with open("gurubase.io_url_list_raw.txt", "w", encoding="utf-8") as f:
        for url in sorted(visited_urls):
            f.write(f"{url}\n")  # Output to file

    visited_urls -= set(URL_BLACKLIST)
    dedup_urls = remove_duplicates(visited_urls)

    # Write de-duplicated URLs to file and terminal
    with open("gurubase.io_url_list.txt", "w", encoding="utf-8") as f:
        for url in sorted(dedup_urls):
            print(url)  # Output to terminal # noqa: T201
            f.write(f"{url}\n")  # Output to file

    # Write broken URLs to file
    with open("gurubase.io_broken_urls_list.txt", "w", encoding="utf-8") as f:
        for url in sorted(broken_urls):
            f.write(f"{url}\n")

    duration_mins = (time.time() - start_time) / 60
    pages_per_min = raw_pages / duration_mins

    logging.info("\nCrawling Statistics:")
    msg = f"{raw_pages} pages crawled in {duration_mins:.2f} minutes ({pages_per_min:.2f} pages/min)"
    logging.info(msg)
    msg = f"De-duplicated pages: {len(dedup_urls)}"
    logging.info(msg)
    msg = f"Broken URLs found: {len(broken_urls)}"
    logging.info(msg)


if __name__ == "__main__":
    main()  # Call the main function