diff --git a/api/pyproject.toml b/api/pyproject.toml
index b0b8d9a..9946b73 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
   "pyhumps==3.8.0",
   "requests==2.32.3",
   "python-i18n==0.3.9",
+  "schedule==1.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 
diff --git a/api/src/zimitfrontend/blacklist.py b/api/src/zimitfrontend/blacklist.py
new file mode 100644
index 0000000..8ff9ccd
--- /dev/null
+++ b/api/src/zimitfrontend/blacklist.py
@@ -0,0 +1,100 @@
+import csv
+import dataclasses
+import re
+from http import HTTPStatus
+
+import requests
+
+from zimitfrontend.constants import ApiConfiguration, logger
+
+
+@dataclasses.dataclass(kw_only=True)
+class BlacklistedUrl:
+    """One blacklist entry: a URL pattern and the reason key it is refused with"""
+
+    # translation key reported to the client when a URL matches this entry
+    reason_key: str
+    # compiled pattern, matched from the start of the submitted URL
+    url_regex: re.Pattern[str]
+
+
+class UrlBlacklistManager:
+    """In-memory list of blacklisted URL patterns, loaded from a remote CSV"""
+
+    def __init__(self):
+        self.blacklist: list[BlacklistedUrl] = []
+
+    def get_blacklist_reason(self, url: str) -> str | None:
+        """Return the blacklist reason key or None
+
+        If url passed is blacklisted, then this function returns the reason key of
+        the first matching entry, otherwise None is returned
+        """
+        return next(
+            (item.reason_key for item in self.blacklist if item.url_regex.match(url)),
+            None,
+        )
+
+    def load_from_url(self, url: str) -> None:
+        """Replace the blacklist with the CSV (url_regex,reason_key) found at url
+
+        This is best-effort: when the CSV cannot be fetched or decoded, a warning is
+        logged and the current blacklist is kept as-is, so that a temporary outage
+        of the blacklist host neither empties the blacklist nor breaks the caller.
+        Rows which cannot be used (missing column, invalid regex) are skipped.
+        """
+        try:
+            resp = requests.get(
+                url,
+                allow_redirects=True,
+                timeout=ApiConfiguration.other_requests_timeout,
+            )
+        except requests.RequestException as exc:
+            logger.warning(f"Error fetching blacklist from {url}: {exc}")
+            return
+        # compare the integer code: HTTPStatus(code) raises on non-standard codes
+        if not (HTTPStatus.OK <= resp.status_code < HTTPStatus.MULTIPLE_CHOICES):
+            body = resp.content.decode(errors="replace")[:1024] or "no body"
+            logger.warning(
+                f"Error fetching blacklist from {url}: {resp.status_code} ({body})"
+            )
+            # do not parse an error page as CSV nor drop the entries already loaded
+            return
+        if not resp.content:
+            logger.warning(f"Empty content in blacklist at {url}: {resp.status_code}")
+        try:
+            content = resp.content.decode()
+        except UnicodeDecodeError as exc:
+            logger.warning(f"Error decoding blacklist from {url}: {exc}")
+            return
+        new_blacklist = self._parse_csv(content)
+        logger.info(f"{len(new_blacklist)} urls have been loaded into blacklist")
+        self.blacklist = new_blacklist
+
+    @staticmethod
+    def _parse_csv(content: str) -> list[BlacklistedUrl]:
+        """Build blacklist entries from CSV text, skipping rows which are unusable"""
+        entries: list[BlacklistedUrl] = []
+        csvreader = csv.DictReader(content.splitlines(), ["url_regex", "reason_key"])
+        for row in csvreader:
+            url_regex, reason_key = row["url_regex"], row["reason_key"]
+            if not url_regex or not reason_key:
+                logger.warning(f"Ignoring incomplete blacklist row: {row}")
+                continue
+            try:
+                pattern = re.compile(url_regex)
+            except re.error as exc:
+                logger.warning(f"Ignoring invalid blacklist regex {url_regex}: {exc}")
+                continue
+            entries.append(BlacklistedUrl(reason_key=reason_key, url_regex=pattern))
+        return entries
+
+
+blacklist_manager = UrlBlacklistManager()
+
+
+def refresh_blacklist():
+    """Reload the shared blacklist from the configured URL, when one is set"""
+    if not ApiConfiguration.blacklist_url:
+        return
+    blacklist_manager.load_from_url(ApiConfiguration.blacklist_url)
diff --git a/api/src/zimitfrontend/constants.py b/api/src/zimitfrontend/constants.py
index 4eb2ebd..41bb5ae 100644
--- a/api/src/zimitfrontend/constants.py
+++ b/api/src/zimitfrontend/constants.py
@@ -76,6 +76,7 @@ class ApiConfiguration:
     )
     zimfarm_requests_timeout = _get_time_setting("ZIMFARM_REQUESTS_TIMEOUT", "10s")
     mailgun_requests_timeout = _get_time_setting("MAILGUN_REQUESTS_TIMEOUT", "10s")
+    other_requests_timeout = _get_time_setting("OTHER_REQUESTS_TIMEOUT", "10s")
     zimfarm_username = os.getenv("_ZIMFARM_USERNAME", "-")
     zimfarm_password = os.getenv("_ZIMFARM_PASSWORD", "-")
     zimit_image = os.getenv("ZIMIT_IMAGE", "openzim/zimit:1.2.0")
@@ -113,3 +114,8 @@ class ApiConfiguration:
 
     # list of rtl language codes
     rtl_language_codes = ("fa", "he")
+
+    blacklist_url = os.getenv(
+        "BLACKLIST_URL", "https://drive.zimit.kiwix.org/blacklist.csv"
+    )
+    blacklist_refresh_minutes = int(os.getenv("BLACKLIST_REFRESH_MINUTES", "10"))
diff --git a/api/src/zimitfrontend/main.py b/api/src/zimitfrontend/main.py
index f424eeb..e1d06c5 100644
--- a/api/src/zimitfrontend/main.py
+++ b/api/src/zimitfrontend/main.py
@@ -4,9 +4,11 @@ from fastapi.encoders import jsonable_encoder
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, RedirectResponse
+from schedule import every
 from starlette.requests import Request
 
 from zimitfrontend import __about__
+from zimitfrontend.blacklist import refresh_blacklist
 from zimitfrontend.constants import ApiConfiguration, logger
 from zimitfrontend.routes import hook,
requests @@ -76,4 +78,12 @@ async def internal_exception_handler( # pyright: ignore[reportUnusedFunction] self.app.mount(f"/api/{__about__.__api_version__}", api) + if ApiConfiguration.blacklist_url: + refresh_blacklist() + every( + ApiConfiguration.blacklist_refresh_minutes + ).minutes.do( # pyright: ignore[reportUnknownMemberType] + refresh_blacklist + ) + return self.app diff --git a/api/src/zimitfrontend/routes/requests.py b/api/src/zimitfrontend/routes/requests.py index fa98b81..550dce5 100644 --- a/api/src/zimitfrontend/routes/requests.py +++ b/api/src/zimitfrontend/routes/requests.py @@ -4,7 +4,9 @@ from typing import Annotated from fastapi import APIRouter, HTTPException, Path +from schedule import run_pending +from zimitfrontend.blacklist import blacklist_manager from zimitfrontend.constants import ApiConfiguration, logger from zimitfrontend.routes.schemas import TaskCreateRequest, TaskCreateResponse, TaskInfo from zimitfrontend.routes.utils import get_task_info @@ -37,8 +39,9 @@ def task_info( raise HTTPException( status_code=HTTPStatus.BAD_REQUEST, detail={ - "error": f"Failed to find task on Zimfarm with HTTP {status}", - "zimfarm_message": task, + "translationKey": "requestStatus.taskNotFoundSnack", + "status": status, + "zimfarmMessage": task, }, ) return get_task_info(task) @@ -55,8 +58,17 @@ def task_info( ) def create_task(request: TaskCreateRequest) -> TaskCreateResponse: + # trigger blacklist refresh (will only happen at configured interval) + run_pending() + url = urllib.parse.urlparse(request.url) + if blacklist_reason := blacklist_manager.get_blacklist_reason(url.geturl()): + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST, + detail={"translationKey": blacklist_reason}, + ) + # generate schedule name ident = str(uuid.uuid4())[:8] schedule_name = f"{url.hostname}_{ident}" @@ -147,16 +159,21 @@ def _cap_limit(user_limit: int, zimit_limit: int) -> int: ) if not success: logger.error(f"Unable to create schedule via HTTP {status}: 
{resp}") - message = f"Unable to create schedule via HTTP {status}: {resp}" - if status == HTTPStatus.BAD_REQUEST: - # if Zimfarm replied this is a bad request, then this is most probably - # a bad request due to user input so we can track it like a bad request - raise HTTPException(status_code=HTTPStatus.BAD_REQUEST, detail=message) - else: - # otherwise, this is most probably an internal problem in our systems - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=message - ) + # if Zimfarm replied this is a bad request, then this is most probably + # a bad request due to user input so we can track it like a bad request + # otherwise, this is most probably an internal problem in our systems + raise HTTPException( + status_code=( + HTTPStatus.BAD_REQUEST + if status == HTTPStatus.BAD_REQUEST + else HTTPStatus.INTERNAL_SERVER_ERROR + ), + detail={ + "translationKey": "newRequest.unableToCreateSchedule", + "status": status, + "zimfarmMessage": resp, + }, + ) # request a task for that newly created schedule success, status, resp = query_api( @@ -171,19 +188,30 @@ def _cap_limit(user_limit: int, zimit_limit: int) -> int: logger.error(f"Unable to request {schedule_name} via HTTP {status}: {resp}") raise HTTPException( status_code=HTTPStatus.INTERNAL_SERVER_ERROR, - detail=f"Unable to request schedule via HTTP {status}): {resp}", + detail={ + "translationKey": "newRequest.unableToRequestSchedule", + "status": status, + "zimfarmMessage": resp, + }, ) try: task_id = resp.get("requested").pop() if not task_id: + logger.error("Zimfarm returned an empty task ID") raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="task_id is False" + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, + detail={ + "translationKey": "newRequest.missingTaskId", + }, ) except Exception as exc: + logger.error("Zimfarm did not returned the task ID as expected", exc_info=exc) raise HTTPException( status_code=HTTPStatus.INTERNAL_SERVER_ERROR, - 
detail=f"Couldn't retrieve requested task id: {exc}",
+            detail={
+                "translationKey": "newRequest.failToGetTaskId",
+            },
         ) from exc
 
     # remove newly created schedule (not needed anymore)
diff --git a/api/tests/unit/test_blacklist.py b/api/tests/unit/test_blacklist.py
new file mode 100644
index 0000000..06cdb1b
--- /dev/null
+++ b/api/tests/unit/test_blacklist.py
@@ -0,0 +1,35 @@
+from re import compile
+
+import pytest
+
+from zimitfrontend.blacklist import BlacklistedUrl, UrlBlacklistManager
+
+# (reason key, url regex) pairs loaded into the manager shared by all cases
+_RULES = [
+    ("blacklist.not_working", r"^https?:\/\/www\.acme\.com(?:\/.*)?$"),
+    ("blacklist.already_done", r"^https?:\/\/en\.wikipedia\.org(?:\/.*)?$"),
+]
+
+manager = UrlBlacklistManager()
+manager.blacklist.extend(
+    BlacklistedUrl(reason_key=reason, url_regex=compile(regex))
+    for reason, regex in _RULES
+)
+
+
+@pytest.mark.parametrize(
+    "url,expected",
+    [
+        ("http://www.acme.com", "blacklist.not_working"),
+        ("https://www.acme.com", "blacklist.not_working"),
+        ("https://www.acme.com/foo", "blacklist.not_working"),
+        ("https://en.wikipedia.org", "blacklist.already_done"),
+        ("https://en.wikipedia.org/wiki/Foo", "blacklist.already_done"),
+        ("https://www.foo.com", None),
+        ("https://www.foo.com?href=http://www.acme.com", None),
+    ],
+)
+def test_blacklist(url: str, expected: str | None):
+    """A URL maps to the reason key of the entry it matches, None otherwise"""
+    reason = manager.get_blacklist_reason(url)
+    assert reason == expected
diff --git a/dev/blacklist.csv b/dev/blacklist.csv
new file mode 100644
index 0000000..c5a6e9a
--- /dev/null
+++ b/dev/blacklist.csv
@@ -0,0 +1,2 @@
+^https?:\/\/\w*\.?wikipedia\.org(?:\/.*)?$,newRequest.urlBlacklisted
+^https?:\/\/\w*\.?wikihow\.com(?:\/.*)?$,newRequest.urlBlacklisted
diff --git a/locales/en.json b/locales/en.json
index 2162132..ee5f1c6 100644
--- a/locales/en.json
+++ b/locales/en.json
@@ -40,7 +40,12 @@
     "errorFetchingDefinition": "Error fetching offliner definition",
     "creatingRequest": "Creating
request…", "errorCreatingRequest": "Error creating request", - "offlinerNotFound": "Zimit offliner not found, we probably experience a serious issue on our infrastructure." + "unableToCreateSchedule": "Unable to create schedule via HTTP {status}: {zimfarmMessage}", + "unableToRequestSchedule": "Unable to request new task for schedule via HTTP {status}: {zimfarmMessage}", + "missingTaskId": "Zimfarm returned an empty task ID", + "failToGetTaskId": "Zimfarm did not return the task ID as expected", + "offlinerNotFound": "Zimit offliner not found, we probably experience a serious issue on our infrastructure.", + "urlBlacklisted": "This website cannot be processed with zimit.kiwix.org" }, "notFound": { "heading": "Not Found", @@ -68,7 +73,8 @@ "emailNotification": "You should have received an email with the current URL. Once the task is completed you will get a second email with a download link (you may thus close this window).", "noEmailNotification": "You did not provide us with an email: please bookmark this page before closing the window or you will not be able to retrieve your ZIM file.", "settingsHeading": "Settings", - "taskNotFound": "Task not found. Either your URL is incorrect, or our service is experiencing an issue." + "taskNotFound": "Task not found. Either your URL is incorrect, or our service is experiencing an issue.", + "taskNotFoundSnack": "Failed to find task on Zimfarm with HTTP {status}." }, "email": { "requested": { diff --git a/locales/qqq.json b/locales/qqq.json index 68559dd..81f2528 100644 --- a/locales/qqq.json +++ b/locales/qqq.json @@ -45,7 +45,12 @@ "errorFetchingDefinition": "This is the message when fetching the task definition failed.", "creatingRequest": "This is the message while creating a Zimfarm request.", "errorCreatingRequest": "This is the message when creating a Zimfarm request failed.", - "offlinerNotFound": "This is the message when we failed to load offliner definition through API call." 
+ "unableToCreateSchedule": "This is the message when request creation failed at Zimfarm schedule creation", + "unableToRequestSchedule": "This is the message when request creation failed at Zimfarm new task request", + "missingTaskId": "This is the message when Zimfarm returned an empty task ID when requesting the task", + "failToGetTaskId": "This is the message when Zimfarm did not return the task ID as expected when requesting the task", + "offlinerNotFound": "This is the message when we failed to load offliner definition through API call.", + "urlBlacklisted": "This is the snackbar message when URL passed is blacklisted from zimit.kiwix.org" }, "notFound": { "heading": "This is the heading displayed when URL is not found/handled.", @@ -73,7 +78,8 @@ "emailNotification": "This is an explanation about what to do while task is processing when email has been provided", "noEmailNotification": "This is an explanation about what to do while task is processing when email has NOT been provided", "settingsHeading": "This is the title of the section detailing task settings", - "taskNotFound": "This is the message displayed when the task is not found." 
+ "taskNotFound": "This is the message displayed when the task is not found.", + "taskNotFoundSnack": "This is the message displayed in the snackbar when the task is not found" }, "email": { "requested": { diff --git a/ui/src/stores/main.ts b/ui/src/stores/main.ts index a27e289..ffa53ba 100644 --- a/ui/src/stores/main.ts +++ b/ui/src/stores/main.ts @@ -187,7 +187,14 @@ export const useMainStore = defineStore('main', { if (error instanceof AxiosError && error.response) { console.error(message, ':', error.response.status, error.response.statusText) if (error.response.data.detail) { - message = message + ': ' + error.response.data.detail + if (error.response.data.detail.translationKey) { + message = + message + + ': ' + + this.t(error.response.data.detail.translationKey, error.response.data.detail) + } else { + message = message + ': ' + error.response.data.detail + } } } else { console.error(message, ':', error)