Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a blacklist of unwanted websites #113

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"pyhumps==3.8.0",
"requests==2.32.3",
"python-i18n==0.3.9",
"schedule==1.2.2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
68 changes: 68 additions & 0 deletions api/src/zimitfrontend/blacklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import csv
import dataclasses
from http import HTTPStatus
from re import Pattern, compile

import requests

from zimitfrontend.constants import ApiConfiguration, logger


@dataclasses.dataclass(kw_only=True)
class BlacklistedUrl:
reason_key: str
url_regex: Pattern[str]


class UrlBlacklistManager:

def __init__(self):
self.blacklist: list[BlacklistedUrl] = []

def get_blacklist_reason(self, url: str) -> str | None:
"""Return the blacklist reason key or None

If url passed is blacklisted, then this function returns the blacklist reason,
otherwise None is returned
"""

def _is_match(url: str, blacklist_item: BlacklistedUrl) -> bool:
return blacklist_item.url_regex.match(url) is not None

matching = list(filter(lambda item: _is_match(url, item), self.blacklist))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it make more sense to have a match(url) method on BlacklistedUrl?


return matching[0].reason_key if matching else None

def load_from_url(self, url: str) -> None:
resp = requests.get(

Check warning on line 37 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L37

Added line #L37 was not covered by tests
url, allow_redirects=True, timeout=ApiConfiguration.other_requests_timeout
)
if not HTTPStatus(resp.status_code).is_success:
logger.warning(

Check warning on line 41 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L41

Added line #L41 was not covered by tests
f"Error fetching blacklist from {url}: {resp.status_code} "
f"({resp.content.decode()[:1024] if resp.content else 'no body'})"
)
if not resp.content:
logger.warning(f"Empty content in blacklist at {url}: {resp.status_code}")
csvreader = csv.DictReader(

Check warning on line 47 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L46-L47

Added lines #L46 - L47 were not covered by tests
resp.content.decode().splitlines(), ["url_regex", "reason_key"]
)
new_blacklist: list[BlacklistedUrl] = []

Check warning on line 50 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L50

Added line #L50 was not covered by tests
for row in csvreader:
new_blacklist.append(

Check warning on line 52 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L52

Added line #L52 was not covered by tests
BlacklistedUrl(
reason_key=row["reason_key"],
url_regex=compile(row["url_regex"]),
)
)
logger.info(f"{len(new_blacklist)} urls have been loaded into blacklist")
self.blacklist = new_blacklist

Check warning on line 59 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L58-L59

Added lines #L58 - L59 were not covered by tests


blacklist_manager = UrlBlacklistManager()


def refresh_blacklist():
if not ApiConfiguration.blacklist_url:
return
blacklist_manager.load_from_url(ApiConfiguration.blacklist_url)

Check warning on line 68 in api/src/zimitfrontend/blacklist.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/blacklist.py#L67-L68

Added lines #L67 - L68 were not covered by tests
6 changes: 6 additions & 0 deletions api/src/zimitfrontend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class ApiConfiguration:
)
zimfarm_requests_timeout = _get_time_setting("ZIMFARM_REQUESTS_TIMEOUT", "10s")
mailgun_requests_timeout = _get_time_setting("MAILGUN_REQUESTS_TIMEOUT", "10s")
other_requests_timeout = _get_time_setting("OTHER_REQUESTS_TIMEOUT", "10s")
zimfarm_username = os.getenv("_ZIMFARM_USERNAME", "-")
zimfarm_password = os.getenv("_ZIMFARM_PASSWORD", "-")
zimit_image = os.getenv("ZIMIT_IMAGE", "openzim/zimit:1.2.0")
Expand Down Expand Up @@ -113,3 +114,8 @@ class ApiConfiguration:

# list of rtl language codes
rtl_language_codes = ("fa", "he")

blacklist_url = os.getenv(
"BLACKLIST_URL", "https://drive.zimit.kiwix.org/blacklist.csv"
)
blacklist_refresh_minutes = int(os.getenv("BLACKLIST_REFRESH_MINUTES", "10"))
10 changes: 10 additions & 0 deletions api/src/zimitfrontend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from fastapi.encoders import jsonable_encoder
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, RedirectResponse
from schedule import every

Check warning on line 7 in api/src/zimitfrontend/main.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/main.py#L7

Added line #L7 was not covered by tests
from starlette.requests import Request

from zimitfrontend import __about__
from zimitfrontend.blacklist import refresh_blacklist

Check warning on line 11 in api/src/zimitfrontend/main.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/main.py#L11

Added line #L11 was not covered by tests
from zimitfrontend.constants import ApiConfiguration, logger
from zimitfrontend.routes import hook, requests

Expand Down Expand Up @@ -76,4 +78,12 @@

self.app.mount(f"/api/{__about__.__api_version__}", api)

if ApiConfiguration.blacklist_url:
refresh_blacklist()
every(

Check warning on line 83 in api/src/zimitfrontend/main.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/main.py#L82-L83

Added lines #L82 - L83 were not covered by tests
ApiConfiguration.blacklist_refresh_minutes
).minutes.do( # pyright: ignore[reportUnknownMemberType]
refresh_blacklist
)

return self.app
58 changes: 43 additions & 15 deletions api/src/zimitfrontend/routes/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from typing import Annotated

from fastapi import APIRouter, HTTPException, Path
from schedule import run_pending

Check warning on line 7 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L7

Added line #L7 was not covered by tests

from zimitfrontend.blacklist import blacklist_manager

Check warning on line 9 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L9

Added line #L9 was not covered by tests
from zimitfrontend.constants import ApiConfiguration, logger
from zimitfrontend.routes.schemas import TaskCreateRequest, TaskCreateResponse, TaskInfo
from zimitfrontend.routes.utils import get_task_info
Expand Down Expand Up @@ -37,8 +39,9 @@
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail={
"error": f"Failed to find task on Zimfarm with HTTP {status}",
"zimfarm_message": task,
"translationKey": "requestStatus.taskNotFoundSnack",
"status": status,
"zimfarmMessage": task,
},
)
return get_task_info(task)
Expand All @@ -55,8 +58,17 @@
)
def create_task(request: TaskCreateRequest) -> TaskCreateResponse:

# trigger blacklist refresh (will only happen at configured interval)
run_pending()

Check warning on line 62 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L62

Added line #L62 was not covered by tests

url = urllib.parse.urlparse(request.url)

if blacklist_reason := blacklist_manager.get_blacklist_reason(url.geturl()):
raise HTTPException(

Check warning on line 67 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L67

Added line #L67 was not covered by tests
status_code=HTTPStatus.BAD_REQUEST,
detail={"translationKey": blacklist_reason},
)

# generate schedule name
ident = str(uuid.uuid4())[:8]
schedule_name = f"{url.hostname}_{ident}"
Expand Down Expand Up @@ -147,16 +159,21 @@
)
if not success:
logger.error(f"Unable to create schedule via HTTP {status}: {resp}")
message = f"Unable to create schedule via HTTP {status}: {resp}"
if status == HTTPStatus.BAD_REQUEST:
# if Zimfarm replied this is a bad request, then this is most probably
# a bad request due to user input so we can track it like a bad request
raise HTTPException(status_code=HTTPStatus.BAD_REQUEST, detail=message)
else:
# otherwise, this is most probably an internal problem in our systems
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=message
)
# if Zimfarm replied this is a bad request, then this is most probably
# a bad request due to user input so we can track it like a bad request
# otherwise, this is most probably an internal problem in our systems
raise HTTPException(

Check warning on line 165 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L165

Added line #L165 was not covered by tests
status_code=(
HTTPStatus.BAD_REQUEST
if status == HTTPStatus.BAD_REQUEST
else HTTPStatus.INTERNAL_SERVER_ERROR
),
detail={
"translationKey": "newRequest.unableToCreateSchedule",
"status": status,
"zimfarmMessage": resp,
},
)

# request a task for that newly created schedule
success, status, resp = query_api(
Expand All @@ -171,19 +188,30 @@
logger.error(f"Unable to request {schedule_name} via HTTP {status}: {resp}")
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail=f"Unable to request schedule via HTTP {status}): {resp}",
detail={
"translationKey": "newRequest.unableToRequestSchedule",
"status": status,
"zimfarmMessage": resp,
},
)

try:
task_id = resp.get("requested").pop()
if not task_id:
logger.error("Zimfarm returned an empty task ID")

Check warning on line 201 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L201

Added line #L201 was not covered by tests
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="task_id is False"
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail={
"translationKey": "newRequest.missingTaskId",
},
)
except Exception as exc:
logger.error("Zimfarm did not returned the task ID as expected", exc_info=exc)

Check warning on line 209 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L209

Added line #L209 was not covered by tests
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail=f"Couldn't retrieve requested task id: {exc}",
detail={
"translationKey": "newRequest.failToGetTaskId",
},
) from exc

# remove newly created schedule (not needed anymore)
Expand Down
35 changes: 35 additions & 0 deletions api/tests/unit/test_blacklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from re import compile

import pytest

from zimitfrontend.blacklist import BlacklistedUrl, UrlBlacklistManager

manager = UrlBlacklistManager()
manager.blacklist.append(
BlacklistedUrl(
reason_key="blacklist.not_working",
url_regex=compile(r"^https?:\/\/www\.acme\.com(?:\/.*)?$"),
)
)
manager.blacklist.append(
BlacklistedUrl(
reason_key="blacklist.already_done",
url_regex=compile(r"^https?:\/\/en\.wikipedia\.org(?:\/.*)?$"),
)
)


@pytest.mark.parametrize(
"url,expected",
[
pytest.param("http://www.acme.com", "blacklist.not_working"),
pytest.param("https://www.acme.com", "blacklist.not_working"),
pytest.param("https://www.acme.com/foo", "blacklist.not_working"),
pytest.param("https://en.wikipedia.org", "blacklist.already_done"),
pytest.param("https://en.wikipedia.org/wiki/Foo", "blacklist.already_done"),
pytest.param("https://www.foo.com", None),
pytest.param("https://www.foo.com?href=http://www.acme.com", None),
],
)
def test_blacklist(url: str, expected: str | None):
assert manager.get_blacklist_reason(url) == expected
2 changes: 2 additions & 0 deletions dev/blacklist.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^https?:\/\/\w*\.?wikipedia\.org(?:\/.*)?$,newRequest.urlBlacklisted
^https?:\/\/\w*\.?wikihow\.com(?:\/.*)?$,newRequest.urlBlacklisted
10 changes: 8 additions & 2 deletions locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,12 @@
"errorFetchingDefinition": "Error fetching offliner definition",
"creatingRequest": "Creating request…",
"errorCreatingRequest": "Error creating request",
"offlinerNotFound": "Zimit offliner not found, we probably experience a serious issue on our infrastructure."
"unableToCreateSchedule": "Unable to create schedule via HTTP {status}: {zimfarmMessage}",
"unableToRequestSchedule": "Unable to request new task for schedule via HTTP {status}: {zimfarmMessage}",
"missingTaskId": "Zimfarm returned an empty task ID",
"failToGetTaskId": "Zimfarm did not returned the task ID as expected",
"offlinerNotFound": "Zimit offliner not found, we probably experience a serious issue on our infrastructure.",
"urlBlacklisted": "This website cannot be processed with zimit.kiwix.org"
},
"notFound": {
"heading": "Not Found",
Expand Down Expand Up @@ -68,7 +73,8 @@
"emailNotification": "You should have received an email with the current URL. Once the task is completed you will get a second email with a download link (you may thus close this window).",
"noEmailNotification": "You did not provide us with an email: please bookmark this page before closing the window or you will not be able to retrieve your ZIM file.",
"settingsHeading": "Settings",
"taskNotFound": "Task not found. Either your URL is incorrect, or our service is experiencing an issue."
"taskNotFound": "Task not found. Either your URL is incorrect, or our service is experiencing an issue.",
"taskNotFoundSnack": "Failed to find task on Zimfarm with HTTP {status}."
},
"email": {
"requested": {
Expand Down
10 changes: 8 additions & 2 deletions locales/qqq.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@
"errorFetchingDefinition": "This is the message when fetching the task definition failed.",
"creatingRequest": "This is the message while creating a Zimfarm request.",
"errorCreatingRequest": "This is the message when creating a Zimfarm request failed.",
"offlinerNotFound": "This is the message when we failed to load offliner definition through API call."
"unableToCreateSchedule": "This is the message when request creation failed at Zimfamrm schedule creation",
"unableToRequestSchedule": "This is the message when request creation failed at Zimfamrm new task request",
"missingTaskId": "This is the message when Zimfarm returned an empty task ID when requesting the task",
"failToGetTaskId": "This is the message when Zimfarm did not returned the task ID as expected when requesting the task",
"offlinerNotFound": "This is the message when we failed to load offliner definition through API call.",
"urlBlacklisted": "This is the snackbar message when URL passed is blacklisted from zimit.kiwix.org"
},
"notFound": {
"heading": "This is the heading displayed when URL is not found/handled.",
Expand Down Expand Up @@ -73,7 +78,8 @@
"emailNotification": "This is an explanation about what to do while task is processing when email has been provided",
"noEmailNotification": "This is an explanation about what to do while task is processing when email has NOT been provided",
"settingsHeading": "This is the title of the section detailing task settings",
"taskNotFound": "This is the message displayed when the task is not found."
"taskNotFound": "This is the message displayed when the task is not found.",
"taskNotFoundSnack": "This is the message displayed in the snackbar when the task is not found"
},
"email": {
"requested": {
Expand Down
9 changes: 8 additions & 1 deletion ui/src/stores/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,14 @@ export const useMainStore = defineStore('main', {
if (error instanceof AxiosError && error.response) {
console.error(message, ':', error.response.status, error.response.statusText)
if (error.response.data.detail) {
message = message + ': ' + error.response.data.detail
if (error.response.data.detail.translationKey) {
message =
message +
': ' +
this.t(error.response.data.detail.translationKey, error.response.data.detail)
} else {
message = message + ': ' + error.response.data.detail
}
}
} else {
console.error(message, ':', error)
Expand Down