diff --git a/edubotics_core/dataloader/webpage_crawler.py b/edubotics_core/dataloader/webpage_crawler.py index 519bce1..a4f3246 100644 --- a/edubotics_core/dataloader/webpage_crawler.py +++ b/edubotics_core/dataloader/webpage_crawler.py @@ -26,6 +26,8 @@ def url_exists(self, url: str) -> bool: return response.status_code == 200 except requests.ConnectionError: return False + except requests.exceptions.Timeout: + return False async def get_links(self, session: ClientSession, website_link: str, base_url: str): if not website_link.startswith(base_url): @@ -104,7 +106,7 @@ def is_webpage(self, url: str) -> bool: response = requests.head(url, allow_redirects=True, timeout=TIMEOUT) content_type = response.headers.get("Content-Type", "").lower() return "text/html" in content_type - except requests.RequestException: + except (requests.RequestException, ValueError, requests.exceptions.Timeout): return False def clean_url_list(self, urls):