Skip to content

Commit

Permalink
Merge pull request #118 from edubotics-ai/timeout-patch
Browse files Browse the repository at this point in the history
Add timeout catches
  • Loading branch information
trgardos authored Oct 8, 2024
2 parents c14958e + 82b1132 commit 2b765ae
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion edubotics_core/dataloader/webpage_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def url_exists(self, url: str) -> bool:
return response.status_code == 200
except requests.ConnectionError:
return False
+ except requests.exceptions.Timeout:
+ return False

async def get_links(self, session: ClientSession, website_link: str, base_url: str):
if not website_link.startswith(base_url):
Expand Down Expand Up @@ -104,7 +106,7 @@ def is_webpage(self, url: str) -> bool:
response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
content_type = response.headers.get("Content-Type", "").lower()
return "text/html" in content_type
- except requests.RequestException:
+ except (requests.RequestException, ValueError, requests.exceptions.Timeout):
return False

def clean_url_list(self, urls):
Expand Down

0 comments on commit 2b765ae

Please sign in to comment.