Commit ef56e22

feat(crawl): Added scripts that feed the gurubase.io RAG AI tool

1 parent 3153357

File tree

  .github/workflows/pylint.yml
  pyproject.toml
  scripts/crawl_ardupilot_wiki.py
  scripts/crawl_github_ardupilot.sh

4 files changed: +258 −1 lines changed

.github/workflows/pylint.yml (+1 −1)

@@ -40,7 +40,7 @@ jobs:
      - name: Install dependencies
        # these extra packages are required by pylint to validate the python imports
        run: |
-          uv pip install .[dev]
+          uv pip install .[dev,scripts]

      - name: Analyzing the code with pylint
        run: |

pyproject.toml (+4)

@@ -75,6 +75,10 @@ dev = [
    "types-requests==2.32.0.20250328",
]

+scripts = [
+    "bs4",
+]
+
[project.scripts]
ardupilot_methodic_configurator = "ardupilot_methodic_configurator.__main__:main"
extract_param_defaults = "ardupilot_methodic_configurator.extract_param_defaults:main"
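
The new scripts optional dependency group only declares bs4 (BeautifulSoup), which the crawler added below imports. To reproduce the CI setup locally, the same install command the pylint workflow now runs should work from the repository root (a sketch, assuming a uv-managed environment):

uv pip install .[dev,scripts]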

scripts/crawl_ardupilot_wiki.py (new file, +239)

@@ -0,0 +1,239 @@
#!/usr/bin/python3

"""
Outputs URLs of ArduPilot documentation pages.

SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <[email protected]>

SPDX-License-Identifier: GPL-3.0-or-later
"""

import logging
import time
from os import environ as os_environ
from typing import Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup, Tag

# Define the URL where to start crawling
URL = "https://ardupilot.org/ardupilot/"
USERNAME = "your_username"  # Replace with actual username if needed
PASSWORD = ""  # Replace with actual password if needed

# Crawls and outputs only URLs that start with:
ALLOWED_DOMAINS = (
    "ardupilot.org/ardupilot/",
    "ardupilot.org/copter/",
    "ardupilot.org/plane/",
    "ardupilot.org/rover/",
    "www.ardusub.com/",
    "ardupilot.org/blimp/",
    "ardupilot.org/antennatracker/",
    "ardupilot.org/planner/",
    "ardupilot.org/mavproxy/",
    "ardupilot.org/dev/",
    "ardupilot.github.io/MethodicConfigurator/",
    "mavlink.io/en/",
    "docs.cubepilot.org/",
    "doc.cuav.net/",
    "docs.holybro.com/",
)

URL_BLACKLIST = [
    "https://mavlink.io/en/messages/ASLUAV.html",
    "https://mavlink.io/en/messages/AVSSUAS.html",
    "https://mavlink.io/en/messages/all.html",
    "https://mavlink.io/en/messages/csAirLink.html",
    "https://mavlink.io/en/messages/dialects.html",
    "https://mavlink.io/en/messages/icarous.html",
    "https://mavlink.io/en/messages/matrixpilot.html",
    "https://mavlink.io/en/messages/paparazzi.html",
    "https://mavlink.io/en/messages/python_array_test.html",
    "https://mavlink.io/en/messages/test.html",
    "https://mavlink.io/en/messages/uAvionix.html",
    "https://mavlink.io/en/messages/ualberta.html",
]

URL_BLACKLIST_PREFIXES = ["https://docs.cubepilot.org/user-guides/~/changes/", "zh-hans", "doc.cuav.net/tutorial"]


# pylint: disable=duplicate-code
def get_env_proxies() -> Union[dict[str, str], None]:
    proxies_env = {
        "http": os_environ.get("HTTP_PROXY") or os_environ.get("http_proxy"),
        "https": os_environ.get("HTTPS_PROXY") or os_environ.get("https_proxy"),
        "no_proxy": os_environ.get("NO_PROXY") or os_environ.get("no_proxy"),
    }
    # Remove None values
    proxies_dict: dict[str, str] = {k: v for k, v in proxies_env.items() if v is not None}
    # define as None if no proxies are defined in the OS environment variables
    proxies = proxies_dict if proxies_dict else None
    if proxies:
        logging.info("Proxies: %s", proxies)
    else:
        logging.debug("Proxies: %s", proxies)
    return proxies


# pylint: enable=duplicate-code


def remove_duplicates(visited_urls: set[str]) -> set[str]:
    # if the URL is https:// and the same URL exists as http://, remove the https:// URL from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.startswith("https://") and f"http://{url[8:]}" in visited_urls:
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if visited URLs end in "/index.html", and the same URL without "/index.html" or without "index.html" is in the list,
    # remove the one with "/index.html" from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.endswith("/index.html") and (url[:-11] in visited_urls or url[:-10] in visited_urls):
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if visited URLs end in "/", and the same URL without "/" is in the list, remove the one with "/" from the list
    urls_to_remove = set()
    for url in visited_urls:
        if url.endswith("/") and url[:-1] in visited_urls:
            urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    # if visited URLs end in "common-*.html", and the same file exists with base URL http://ardupilot.org/copter/docs/,
    # remove it from the list
    urls_to_remove = set()
    for url in visited_urls:
        if "/common-" in url and url.endswith(".html"):
            filename = url.split("/")[-1]
            copter_url = f"http://ardupilot.org/copter/docs/{filename}"
            if copter_url in visited_urls and url != copter_url:
                urls_to_remove.add(url)
    visited_urls -= urls_to_remove

    return visited_urls


def find_all_links(soup: BeautifulSoup, current_url: str, visited_urls: set[str], urls_to_visit: set[str]) -> set[str]:
    for link in soup.find_all("a", href=True):
        if not isinstance(link, Tag):
            continue

        href = link.attrs.get("href")

        if not isinstance(href, str):
            continue

        full_url = urljoin(current_url, href)

        if not isinstance(full_url, str):
            continue

        # Remove anchor from URL
        parsed_url = urlparse(full_url)

        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

        # Skip image files
        if parsed_url.path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".bmp", "_images")):
            continue

        # Check if URL matches allowed domains
        if (
            any(domain in clean_url for domain in ALLOWED_DOMAINS)
            and clean_url not in visited_urls
            and clean_url not in urls_to_visit
            and all(domain not in clean_url for domain in URL_BLACKLIST_PREFIXES)
        ):
            urls_to_visit.add(clean_url)

    return urls_to_visit


def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    start_time = time.time()

    visited_urls = set()
    broken_urls = set()
    urls_to_visit = {URL}
    proxies = get_env_proxies()

    # Set up authentication if credentials provided
    auth = (USERNAME, PASSWORD) if USERNAME and PASSWORD else None

    session = requests.Session()
    if auth:
        session.auth = auth
    if proxies:
        session.proxies = proxies

    while urls_to_visit:
        current_url = urls_to_visit.pop()

        if current_url in visited_urls or current_url in broken_urls:
            continue

        try:
            logging.info("Crawling: %s", current_url)
            response = session.get(current_url, timeout=30)
            response.raise_for_status()

            visited_urls.add(current_url)

            # Parse HTML
            soup = BeautifulSoup(response.text, "html.parser")

            urls_to_visit = find_all_links(soup, current_url, visited_urls, urls_to_visit)

        except (requests.RequestException, requests.Timeout) as e:
            logging.error("Network error crawling %s: %s", current_url, str(e))
            broken_urls.add(current_url)
            if current_url in visited_urls:
                visited_urls.remove(current_url)
        except (KeyError, ValueError) as e:
            logging.error("URL processing error for %s: %s", current_url, str(e))
            broken_urls.add(current_url)
            if current_url in visited_urls:
                visited_urls.remove(current_url)

    output_urls(visited_urls, broken_urls, start_time)


def output_urls(visited_urls: set[str], broken_urls: set[str], start_time: float) -> None:
    # Write all html URLs to file
    raw_pages = len(visited_urls)
    with open("gurubase.io_url_list_raw.txt", "w", encoding="utf-8") as f:
        for url in sorted(visited_urls):
            f.write(f"{url}\n")  # Output to file

    visited_urls -= set(URL_BLACKLIST)
    dedup_urls = remove_duplicates(visited_urls)

    # Write de-duplicated URLs to file and terminal
    with open("gurubase.io_url_list.txt", "w", encoding="utf-8") as f:
        for url in sorted(dedup_urls):
            print(url)  # Output to terminal # noqa: T201
            f.write(f"{url}\n")  # Output to file

    # Write broken URLs to file
    with open("gurubase.io_broken_urls_list.txt", "w", encoding="utf-8") as f:
        for url in sorted(broken_urls):
            f.write(f"{url}\n")

    duration_mins = (time.time() - start_time) / 60
    pages_per_min = raw_pages / duration_mins

    logging.info("\nCrawling Statistics:")
    msg = f"{raw_pages} pages crawled in {duration_mins:.2f} minutes ({pages_per_min:.2f} pages/min)"
    logging.info(msg)
    msg = f"De-duplicated pages: {len(dedup_urls)}"
    logging.info(msg)
    msg = f"Broken URLs found: {len(broken_urls)}"
    logging.info(msg)


if __name__ == "__main__":
    main()  # Call the main function
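
A minimal usage sketch follows, showing how the de-duplication rules above behave on a toy URL set. It assumes the scripts directory is on the Python path and the scripts extra is installed; the URLs are hypothetical examples, not real crawl output.

# Hypothetical sketch: exercise remove_duplicates() in isolation.
# The crawler itself is run as: python3 scripts/crawl_ardupilot_wiki.py
# and writes gurubase.io_url_list_raw.txt, gurubase.io_url_list.txt and
# gurubase.io_broken_urls_list.txt into the current directory.
from crawl_ardupilot_wiki import remove_duplicates

toy_urls = {
    "http://ardupilot.org/copter/",            # kept: the http:// variant wins
    "https://ardupilot.org/copter/",           # dropped: https:// duplicate of the URL above
    "https://ardupilot.org/plane/docs",        # kept
    "https://ardupilot.org/plane/docs/",       # dropped: trailing "/" duplicate
    "https://ardupilot.org/rover/",            # kept
    "https://ardupilot.org/rover/index.html",  # dropped: "/index.html" duplicate
}
print(sorted(remove_duplicates(toy_urls)))
# ['http://ardupilot.org/copter/', 'https://ardupilot.org/plane/docs', 'https://ardupilot.org/rover/']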

scripts/crawl_github_ardupilot.sh (new file, +14)

@@ -0,0 +1,14 @@
#!/bin/bash

# Outputs URLs of ArduPilot README.md pages on GitHub

# This script is used to crawl the ArduPilot GitHub repository for README.md files
# and output the URLs to a file called github_urllist.txt

# SPDX-FileCopyrightText: 2024-2025 Amilcar do Carmo Lucas <[email protected]>

# SPDX-License-Identifier: GPL-3.0-or-later

cd ../ardupilot
find . -type f -name "*.md" | sed 's|^\.|https://github.com/ArduPilot/ardupilot/blob/master|' > ../ardupilot_methodic_configurator/github_urllist.txt
cd -
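
For reference, sed rewrites each relative path that find emits into a GitHub blob URL, so a hypothetical ./Tools/README.md entry would appear in github_urllist.txt as https://github.com/ArduPilot/ardupilot/blob/master/Tools/README.md. The script assumes the ardupilot and ardupilot_methodic_configurator checkouts sit side by side in the parent directory.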
