Retry broken external links (#3528)
This adds logic to retry requests to external links up to 3 times in total. It also changes how the script determines which links point to websites (rather than to files or email addresses).
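For reference, a simplified, standalone sketch of the retry pattern this commit introduces; `check_with_retries` and `fetch_status` are illustrative names (the real logic lives in `check_link` in the diff below), while the attempt cap of 3 and the `curl_cffi` request call mirror the actual change.

```python
# Simplified sketch (not the script itself): retry a link check up to
# 3 times in total before reporting it as broken.
from curl_cffi import requests


def fetch_status(url: str) -> int:
    # Same request call as the script: follow redirects, impersonate Safari.
    response = requests.get(url, allow_redirects=True,
                            impersonate="safari", timeout=10)
    return response.status_code


def check_with_retries(url: str, attempt: int = 1) -> bool:
    try:
        if fetch_status(url) < 400:
            return True               # link looks fine
    except requests.exceptions.RequestException:
        pass                          # treat errors like a failed attempt
    if attempt < 3:
        return check_with_retries(url, attempt + 1)
    return False                      # still failing after 3 attempts
```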


## Checklist
- [x] I have read and understood the [WATcloud
Guidelines](https://cloud.watonomous.ca/docs/community-docs/watcloud/guidelines)
- [x] I have performed a self-review of my code
rickyhuangjh authored Dec 31, 2024
1 parent 5a94184 commit 59820ef
Showing 1 changed file with 19 additions and 12 deletions: scripts/validate-external-links.py
@@ -10,7 +10,7 @@

 from curl_cffi import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urldefrag
+from urllib.parse import urljoin, urldefrag, urlparse
 import sys

 # Broken links that match these exactly will be ignored.
@@ -26,24 +26,23 @@
     "https://jira.watonomous.ca"
 ]

-# These will not be treated as urls at all (neither internal nor external)
-IGNORE_LINK_PREFIXES = [
-    "mailto"
+# These are the URL schemes that are treated as links (internal and external)
+SCHEMES = [
+    "",
+    "https",
+    "http"
 ]


 def clean_url(url):
-    res = url.replace("http://", "")
-    res = res.replace("https://", "")
-    res = res.replace("www.", "")
-    res = res.rstrip('/')
-    res = res.lower()
-    return res
+    parsed = urlparse(url)
+    return str(parsed.hostname).lower() + parsed.path.lower()


 CLEANED_WHITELISTED_URLS = [clean_url(url) for url in WHITELISTED_URLS]
 CLEANED_WHITELISTED_PREFIXES = [clean_url(url) for url in WHITELISTED_PREFIXES]


 if len(sys.argv) < 2:
     print(f"Usage: python3 {__file__} <BASE_URL>")
     sys.exit(1)
@@ -80,9 +79,10 @@ def is_external_url(url):
     return not url.startswith(BASE_URL.rstrip("/"))


-def check_link(url: str, page: str) -> ExternalLink:
+def check_link(url: str, page: str, attempt: int = 1) -> ExternalLink:
     print(f"Checking link {url}")
     print(f"  on page {page}")
+    print(f"  attempt {attempt}")
     try:
         request_response = requests.get(url, allow_redirects=True,
                                         impersonate="safari", timeout=10)
@@ -112,12 +112,19 @@ def check_link(url: str, page: str) -> ExternalLink:
             err_str = "Unauthorized redirect (likely)"
         else:
             err_str = "Unspecified error"
+
+        if attempt < 3:
+            return check_link(url, page, attempt + 1)
         return ExternalLink(True, page, url, request_code, err_str)

     except requests.exceptions.Timeout:
+        if attempt < 3:
+            return check_link(url, page, attempt + 1)
         return ExternalLink(True, page, url, -1, "Timeout")
     except requests.exceptions.RequestException as e:
         # Any error like connection issues are treated as broken links
+        if attempt < 3:
+            return check_link(url, page, attempt + 1)
         return ExternalLink(True, page, url, -1, f'Request exception: {str(e)}')


@@ -129,7 +136,7 @@ def get_links_on_page(url):

     # Extract all anchor tags with href attributes
     all_links = [a.get('href') for a in soup.find_all('a', href=True)]
-    filtered_links = [url for url in all_links if url not in IGNORE_LINK_PREFIXES]
+    filtered_links = [url for url in all_links if urlparse(url).scheme in SCHEMES]

     # Join relative URLs with the base URL to form complete links
     return [urljoin(url, link) for link in filtered_links]
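For reference, a short standalone demo (not part of the commit) of how the new `urlparse`-based logic behaves; the sample hrefs and URLs below are made up for illustration.

```python
# Standalone demo of the urlparse-based filtering and normalization
# introduced in this commit; the hrefs below are illustrative only.
from urllib.parse import urlparse

SCHEMES = ["", "https", "http"]

hrefs = [
    "/docs/some-page",              # relative link  -> scheme ""
    "https://example.com/page",     # external link  -> scheme "https"
    "mailto:someone@example.com",   # email link     -> scheme "mailto", dropped
]
kept = [href for href in hrefs if urlparse(href).scheme in SCHEMES]
print(kept)  # ['/docs/some-page', 'https://example.com/page']

# The reworked clean_url reduces a URL to hostname + path for whitelist matching:
parsed = urlparse("https://Example.com/Docs/FAQ")
print(str(parsed.hostname).lower() + parsed.path.lower())  # example.com/docs/faq
```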
