Skip to content

Fix typos, remaining code qa and typing issues resulting in breaking changes #178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
- Automatically index PDF documents content #167
- Automatically set proper title on PDF documents #168
- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check proper type is passed to libzim

## Changed

- **BREAKING** Renamed `zimscraperlib.image.convertion` to `zimscraperlib.image.conversion` to fix typo
- **BREAKING** Many changes in type hints to match the real underlying code
- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002)
- Prefer to use `IO[bytes]` to `io.BytesIO` when possible since it is more generic
- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError`
- **BREAKING** `types.get_mime_for_name` now returns `str | None`
- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accept `bytes | str` as value (it must have been converted before the call)
- **BREAKING** second argument of `creator.Creator.add_metadata` has been renamed to `value` instead of `content` to align with other methods
- When a type issue arises in metadata checks, the wrong value type is displayed in the exception

### Fixed

Expand Down
2 changes: 1 addition & 1 deletion src/zimscraperlib/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.4.1-dev0"
__version__ = "4.0.0-dev0"
36 changes: 20 additions & 16 deletions src/zimscraperlib/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

from __future__ import annotations

import io
import pathlib
import subprocess
from concurrent.futures import Future, ThreadPoolExecutor
from typing import ClassVar
from typing import IO, ClassVar

import requests
import requests.adapters
import requests.structures
import urllib3.util
import yt_dlp as youtube_dl

from zimscraperlib import logger
Expand Down Expand Up @@ -42,7 +44,8 @@ def download(
self,
url: str,
options: dict | None,
wait: bool | None = True, # noqa: FBT002
*,
wait: bool | None = True,
) -> bool | Future:
"""Downloads video using initialized executor.

Expand All @@ -52,9 +55,7 @@ def download(

Returns download result of future (wait=False)"""

future = self.executor.submit(
self._run_youtube_dl, url, options # pyright: ignore
)
future = self.executor.submit(self._run_youtube_dl, url, options or {})
if not wait:
return future
if not future.exception():
Expand Down Expand Up @@ -143,8 +144,8 @@ def save_large_file(url: str, fpath: pathlib.Path) -> None:

def _get_retry_adapter(
max_retries: int | None = 5,
) -> requests.adapters.BaseAdapter: # pyright: ignore
retries = requests.packages.urllib3.util.retry.Retry( # pyright: ignore
) -> requests.adapters.BaseAdapter:
retries = urllib3.util.retry.Retry(
total=max_retries, # total number of retries
connect=max_retries, # connection errors
read=max_retries, # read errors
Expand All @@ -161,7 +162,7 @@ def _get_retry_adapter(
], # force retry on the following codes
)

return requests.adapters.HTTPAdapter(max_retries=retries) # pyright: ignore
return requests.adapters.HTTPAdapter(max_retries=retries)


def get_session(max_retries: int | None = 5) -> requests.Session:
Expand All @@ -174,14 +175,15 @@ def get_session(max_retries: int | None = 5) -> requests.Session:
def stream_file(
url: str,
fpath: pathlib.Path | None = None,
byte_stream: io.BytesIO | None = None,
byte_stream: IO[bytes] | None = None,
block_size: int | None = 1024,
proxies: dict | None = None,
only_first_block: bool | None = False, # noqa: FBT002
max_retries: int | None = 5,
headers: dict[str, str] | None = None,
session: requests.Session | None = None,
) -> tuple[int, requests.structures.CaseInsensitiveDict]: # pyright: ignore
*,
only_first_block: bool | None = False,
) -> tuple[int, requests.structures.CaseInsensitiveDict]:
"""Stream data from a URL to either a BytesIO object or a file
Arguments -
fpath - Path of the file where data is sent
Expand Down Expand Up @@ -212,12 +214,14 @@ def stream_file(
total_downloaded = 0
if fpath is not None:
fp = open(fpath, "wb")
else:
elif (
byte_stream is not None
): # pragma: no branch (we use a precise condition to help type checker)
fp = byte_stream

for data in resp.iter_content(block_size):
total_downloaded += len(data)
fp.write(data) # pyright: ignore
fp.write(data)

# stop downloading/reading if we're just testing first block
if only_first_block:
Expand All @@ -226,7 +230,7 @@ def stream_file(
logger.debug(f"Downloaded {total_downloaded} bytes from {url}")

if fpath:
fp.close() # pyright: ignore
fp.close()
else:
fp.seek(0) # pyright: ignore
fp.seek(0)
return total_downloaded, resp.headers
2 changes: 1 addition & 1 deletion src/zimscraperlib/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_file_mimetype(fpath: pathlib.Path) -> str:
return get_content_mimetype(fh.read(2048))


def get_content_mimetype(content: bytes) -> str:
def get_content_mimetype(content: bytes | str) -> str:
"""MIME Type of content retrieved from magic headers"""

try:
Expand Down
20 changes: 10 additions & 10 deletions src/zimscraperlib/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,22 @@
import pathlib
from typing import BinaryIO, TextIO

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, element

from zimscraperlib.types import ARTICLE_MIME


def find_title_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
def find_title_in(content: str | BinaryIO | TextIO, mime_type: str | None) -> str:
"""Extracted title from HTML content

blank on failure to extract and non-HTML files"""
if mime_type != ARTICLE_MIME:
return ""
try:
return BeautifulSoup(content, "lxml").find("title").text # pyright: ignore
except Exception:
return ""
title_tag = BeautifulSoup(content, "lxml").find("title")
return title_tag.text if title_tag else ""


def find_title_in_file(fpath: pathlib.Path, mime_type: str) -> str:
def find_title_in_file(fpath: pathlib.Path, mime_type: str | None) -> str:
"""Extracted title from an HTML file"""
try:
with open(fpath) as fh:
Expand All @@ -45,15 +43,17 @@ def find_language_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
for key in keylist:
node = soup.find(nodename)
if node:
if not node.has_attr(key): # pyright: ignore
if not isinstance(node, element.Tag) or (
isinstance(node, element.Tag) and not node.has_attr(key)
):
continue
if (
nodename == "meta"
and not node.attrs.get("http-equiv", "").lower() # pyright: ignore
and not node.attrs.get("http-equiv", "").lower()
== "content-language"
):
continue
return node.attrs[key] # pyright: ignore
return node.attrs[key]
return ""


Expand Down
28 changes: 12 additions & 16 deletions src/zimscraperlib/i18n.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
ISO_LEVELS = ["1", "2b", "2t", "3", "5"]


class NotFound(ValueError): # noqa: N818
class NotFoundError(ValueError):
pass


Expand Down Expand Up @@ -81,7 +81,7 @@ def get_iso_lang_data(lang: str) -> tuple[dict, dict | None]:
iso639.exceptions.InvalidLanguageValue,
iso639.exceptions.DeprecatedLanguageValue,
) as exc:
raise NotFound("Not a valid iso language name/code") from exc
raise NotFoundError("Not a valid iso language name/code") from exc

def replace_types(new_type: str) -> str:
# convert new iso_types from iso639-lang Pypi package to old iso_types from
Expand Down Expand Up @@ -118,34 +118,32 @@ def replace_types(new_type: str) -> str:
return lang_data, None


def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, str]:
def find_language_names(
query: str, lang_data: dict | None = None
) -> tuple[str | None, str | None]:
"""(native, english) language names for lang with help from language_details dict

Falls back to English name if available or query if not"""
if lang_data is None:
lang_data = get_language_details(query, failsafe=True) or {}
try:
query_locale = babel.Locale.parse(query)
return query_locale.get_display_name(), query_locale.get_display_name(
"en"
) # pyright: ignore
return query_locale.get_display_name(), query_locale.get_display_name("en")
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass

# ISO code lookup order matters (most qualified first)!
for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]:
try:
query_locale = babel.Locale.parse(lang_data.get(iso_level))
return query_locale.get_display_name(), query_locale.get_display_name(
"en"
) # pyright: ignore
return query_locale.get_display_name(), query_locale.get_display_name("en")
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass
default = lang_data.get("english", query)
return default, default


def update_with_macro(lang_data: dict, macro_data: dict):
def update_with_macro(lang_data: dict, macro_data: dict | None):
"""update empty keys from lang_data with ones of macro_data"""
if macro_data:
for key, value in macro_data.items():
Expand All @@ -154,9 +152,7 @@ def update_with_macro(lang_data: dict, macro_data: dict):
return lang_data


def get_language_details(
query: str, failsafe: bool | None = False # noqa: FBT002
) -> dict:
def get_language_details(query: str, *, failsafe: bool | None = False) -> dict | None:
"""language details dict from query.

Raises NotFound or return `und` language details if failsafe
Expand Down Expand Up @@ -192,12 +188,12 @@ def get_language_details(

try:
lang_data, macro_data = get_iso_lang_data(adjusted_query)
except NotFound as exc:
except NotFoundError as exc:
if failsafe:
return None # pyright: ignore
return None
raise exc

iso_data = update_with_macro(lang_data, macro_data) # pyright: ignore
iso_data = update_with_macro(lang_data, macro_data)
native_name, english_name = find_language_names(native_query, iso_data)
iso_data.update(
{
Expand Down
2 changes: 1 addition & 1 deletion src/zimscraperlib/image/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# vim: ai ts=4 sts=4 et sw=4 nu

# flake8: noqa
from .convertion import convert_image
from .conversion import convert_image
from .optimization import optimize_image
from .probing import is_valid_image
from .transformation import resize_image
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

from __future__ import annotations

import io
import pathlib
from typing import IO

import PIL
from PIL.Image import open as pilopen

from zimscraperlib.constants import ALPHA_NOT_SUPPORTED
from zimscraperlib.image.probing import format_for
Expand All @@ -15,8 +15,8 @@


def convert_image(
src: pathlib.Path | io.BytesIO,
dst: pathlib.Path | io.BytesIO,
src: pathlib.Path | IO[bytes],
dst: pathlib.Path | IO[bytes],
**params: str,
) -> None:
"""convert an image file from one format to another
Expand All @@ -29,12 +29,12 @@ def convert_image(
to RGB. ex: RGB, ARGB, CMYK (and other PIL colorspaces)"""

colorspace = params.get("colorspace") # requested colorspace
fmt = (
params.pop("fmt").upper() if "fmt" in params else None # pyright: ignore
) # requested format
fmt = params.pop("fmt").upper() if "fmt" in params else None # requested format
if not fmt:
fmt = format_for(dst)
with PIL.Image.open(src) as image: # pyright: ignore
if not fmt:
raise ValueError("Impossible to guess destination image format")
with pilopen(src) as image:
if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
image = image.convert(colorspace or "RGB") # noqa: PLW2901
save_image(image, dst, fmt, **params)
Expand All @@ -45,13 +45,13 @@ def create_favicon(src: pathlib.Path, dst: pathlib.Path) -> None:
if dst.suffix != ".ico":
raise ValueError("favicon extension must be ICO")

img = PIL.Image.open(src) # pyright: ignore
img = pilopen(src)
w, h = img.size
# resize image to square first
if w != h:
size = min([w, h])
resized = dst.parent.joinpath(f"{src.stem}.tmp.{src.suffix}")
resize_image(src, size, size, resized, "contain")
img = PIL.Image.open(resized) # pyright: ignore
img = pilopen(resized)
# now convert to ICO
save_image(img, dst, "ICO")
Loading
Loading