diff --git a/CHANGELOG.md b/CHANGELOG.md index 28286876..5367ed7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item - Automatically index PDF documents content #167 - Automatically set proper title on PDF documents #168 +- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format +- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format +- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check proper type is passed to libzim + +### Changed + +- **BREAKING** Renamed `zimscraperlib.image.convertion` to `zimscraperlib.image.conversion` to fix typo +- **BREAKING** Many changes in type hints to match the real underlying code +- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002) +- Prefer `IO[bytes]` over `io.BytesIO` when possible since it is more generic +- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError` +- **BREAKING** `types.get_mime_for_name` now returns `str | None` +- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accept `bytes | str` as value (it must have been converted before call) +- **BREAKING** second argument of `creator.Creator.add_metadata` has been renamed to `value` instead of `content` to align with other methods +- When a type issue arises in metadata checks, wrong value type is displayed in exception ### Fixed diff --git a/src/zimscraperlib/__about__.py b/src/zimscraperlib/__about__.py index a7c15da7..ab2eee57 100644 --- a/src/zimscraperlib/__about__.py +++ b/src/zimscraperlib/__about__.py @@ -1 +1 
@@ -__version__ = "3.4.1-dev0" +__version__ = "4.0.0-dev0" diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py index 8ab1affb..7cfe6914 100644 --- a/src/zimscraperlib/download.py +++ b/src/zimscraperlib/download.py @@ -3,13 +3,15 @@ from __future__ import annotations -import io import pathlib import subprocess from concurrent.futures import Future, ThreadPoolExecutor -from typing import ClassVar +from typing import IO, ClassVar import requests +import requests.adapters +import requests.structures +import urllib3.util import yt_dlp as youtube_dl from zimscraperlib import logger @@ -42,7 +44,8 @@ def download( self, url: str, options: dict | None, - wait: bool | None = True, # noqa: FBT002 + *, + wait: bool | None = True, ) -> bool | Future: """Downloads video using initialized executor. @@ -52,9 +55,7 @@ def download( Returns download result of future (wait=False)""" - future = self.executor.submit( - self._run_youtube_dl, url, options # pyright: ignore - ) + future = self.executor.submit(self._run_youtube_dl, url, options or {}) if not wait: return future if not future.exception(): @@ -143,8 +144,8 @@ def save_large_file(url: str, fpath: pathlib.Path) -> None: def _get_retry_adapter( max_retries: int | None = 5, -) -> requests.adapters.BaseAdapter: # pyright: ignore - retries = requests.packages.urllib3.util.retry.Retry( # pyright: ignore +) -> requests.adapters.BaseAdapter: + retries = urllib3.util.retry.Retry( total=max_retries, # total number of retries connect=max_retries, # connection errors read=max_retries, # read errors @@ -161,7 +162,7 @@ def _get_retry_adapter( ], # force retry on the following codes ) - return requests.adapters.HTTPAdapter(max_retries=retries) # pyright: ignore + return requests.adapters.HTTPAdapter(max_retries=retries) def get_session(max_retries: int | None = 5) -> requests.Session: @@ -174,14 +175,15 @@ def get_session(max_retries: int | None = 5) -> requests.Session: def stream_file( url: str, fpath: 
pathlib.Path | None = None, - byte_stream: io.BytesIO | None = None, + byte_stream: IO[bytes] | None = None, block_size: int | None = 1024, proxies: dict | None = None, - only_first_block: bool | None = False, # noqa: FBT002 max_retries: int | None = 5, headers: dict[str, str] | None = None, session: requests.Session | None = None, -) -> tuple[int, requests.structures.CaseInsensitiveDict]: # pyright: ignore + *, + only_first_block: bool | None = False, +) -> tuple[int, requests.structures.CaseInsensitiveDict]: """Stream data from a URL to either a BytesIO object or a file Arguments - fpath - Path of the file where data is sent @@ -212,12 +214,14 @@ def stream_file( total_downloaded = 0 if fpath is not None: fp = open(fpath, "wb") - else: + elif ( + byte_stream is not None + ): # pragma: no branch (we use a precise condition to help type checker) fp = byte_stream for data in resp.iter_content(block_size): total_downloaded += len(data) - fp.write(data) # pyright: ignore + fp.write(data) # stop downloading/reading if we're just testing first block if only_first_block: @@ -226,7 +230,7 @@ def stream_file( logger.debug(f"Downloaded {total_downloaded} bytes from {url}") if fpath: - fp.close() # pyright: ignore + fp.close() else: - fp.seek(0) # pyright: ignore + fp.seek(0) return total_downloaded, resp.headers diff --git a/src/zimscraperlib/filesystem.py b/src/zimscraperlib/filesystem.py index 911d9c62..65e0a2c7 100644 --- a/src/zimscraperlib/filesystem.py +++ b/src/zimscraperlib/filesystem.py @@ -30,7 +30,7 @@ def get_file_mimetype(fpath: pathlib.Path) -> str: return get_content_mimetype(fh.read(2048)) -def get_content_mimetype(content: bytes) -> str: +def get_content_mimetype(content: bytes | str) -> str: """MIME Type of content retrieved from magic headers""" try: diff --git a/src/zimscraperlib/html.py b/src/zimscraperlib/html.py index 278a4c9c..d2d974ea 100644 --- a/src/zimscraperlib/html.py +++ b/src/zimscraperlib/html.py @@ -7,24 +7,22 @@ import pathlib from typing 
import BinaryIO, TextIO -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, element from zimscraperlib.types import ARTICLE_MIME -def find_title_in(content: str | BinaryIO | TextIO, mime_type: str) -> str: +def find_title_in(content: str | BinaryIO | TextIO, mime_type: str | None) -> str: """Extracted title from HTML content blank on failure to extract and non-HTML files""" if mime_type != ARTICLE_MIME: return "" - try: - return BeautifulSoup(content, "lxml").find("title").text # pyright: ignore - except Exception: - return "" + title_tag = BeautifulSoup(content, "lxml").find("title") + return title_tag.text if title_tag else "" -def find_title_in_file(fpath: pathlib.Path, mime_type: str) -> str: +def find_title_in_file(fpath: pathlib.Path, mime_type: str | None) -> str: """Extracted title from an HTML file""" try: with open(fpath) as fh: @@ -45,15 +43,17 @@ def find_language_in(content: str | BinaryIO | TextIO, mime_type: str) -> str: for key in keylist: node = soup.find(nodename) if node: - if not node.has_attr(key): # pyright: ignore + if not isinstance(node, element.Tag) or ( + isinstance(node, element.Tag) and not node.has_attr(key) + ): continue if ( nodename == "meta" - and not node.attrs.get("http-equiv", "").lower() # pyright: ignore + and not node.attrs.get("http-equiv", "").lower() == "content-language" ): continue - return node.attrs[key] # pyright: ignore + return node.attrs[key] return "" diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py index fb238800..f440f455 100644 --- a/src/zimscraperlib/i18n.py +++ b/src/zimscraperlib/i18n.py @@ -15,7 +15,7 @@ ISO_LEVELS = ["1", "2b", "2t", "3", "5"] -class NotFound(ValueError): # noqa: N818 +class NotFoundError(ValueError): pass @@ -81,7 +81,7 @@ def get_iso_lang_data(lang: str) -> tuple[dict, dict | None]: iso639.exceptions.InvalidLanguageValue, iso639.exceptions.DeprecatedLanguageValue, ) as exc: - raise NotFound("Not a valid iso language name/code") from exc + raise 
NotFoundError("Not a valid iso language name/code") from exc def replace_types(new_type: str) -> str: # convert new iso_types from iso639-lang Pypi package to old iso_types from @@ -118,7 +118,9 @@ def replace_types(new_type: str) -> str: return lang_data, None -def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, str]: +def find_language_names( + query: str, lang_data: dict | None = None +) -> tuple[str | None, str | None]: """(native, english) language names for lang with help from language_details dict Falls back to English name if available or query if not""" @@ -126,9 +128,7 @@ def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, lang_data = get_language_details(query, failsafe=True) or {} try: query_locale = babel.Locale.parse(query) - return query_locale.get_display_name(), query_locale.get_display_name( - "en" - ) # pyright: ignore + return query_locale.get_display_name(), query_locale.get_display_name("en") except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError): pass @@ -136,16 +136,14 @@ def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]: try: query_locale = babel.Locale.parse(lang_data.get(iso_level)) - return query_locale.get_display_name(), query_locale.get_display_name( - "en" - ) # pyright: ignore + return query_locale.get_display_name(), query_locale.get_display_name("en") except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError): pass default = lang_data.get("english", query) return default, default -def update_with_macro(lang_data: dict, macro_data: dict): +def update_with_macro(lang_data: dict, macro_data: dict | None): """update empty keys from lang_data with ones of macro_data""" if macro_data: for key, value in macro_data.items(): @@ -154,9 +152,7 @@ def update_with_macro(lang_data: dict, macro_data: dict): return lang_data -def get_language_details( - 
query: str, failsafe: bool | None = False # noqa: FBT002 -) -> dict: +def get_language_details(query: str, *, failsafe: bool | None = False) -> dict | None: """language details dict from query. Raises NotFound or return `und` language details if failsafe @@ -192,12 +188,12 @@ def get_language_details( try: lang_data, macro_data = get_iso_lang_data(adjusted_query) - except NotFound as exc: + except NotFoundError as exc: if failsafe: - return None # pyright: ignore + return None raise exc - iso_data = update_with_macro(lang_data, macro_data) # pyright: ignore + iso_data = update_with_macro(lang_data, macro_data) native_name, english_name = find_language_names(native_query, iso_data) iso_data.update( { diff --git a/src/zimscraperlib/image/__init__.py b/src/zimscraperlib/image/__init__.py index 3834bc18..c17912b2 100644 --- a/src/zimscraperlib/image/__init__.py +++ b/src/zimscraperlib/image/__init__.py @@ -3,7 +3,7 @@ # vim: ai ts=4 sts=4 et sw=4 nu # flake8: noqa -from .convertion import convert_image +from .conversion import convert_image from .optimization import optimize_image from .probing import is_valid_image from .transformation import resize_image diff --git a/src/zimscraperlib/image/convertion.py b/src/zimscraperlib/image/conversion.py similarity index 81% rename from src/zimscraperlib/image/convertion.py rename to src/zimscraperlib/image/conversion.py index e262847a..3edac08e 100644 --- a/src/zimscraperlib/image/convertion.py +++ b/src/zimscraperlib/image/conversion.py @@ -3,10 +3,10 @@ from __future__ import annotations -import io import pathlib +from typing import IO -import PIL +from PIL.Image import open as pilopen from zimscraperlib.constants import ALPHA_NOT_SUPPORTED from zimscraperlib.image.probing import format_for @@ -15,8 +15,8 @@ def convert_image( - src: pathlib.Path | io.BytesIO, - dst: pathlib.Path | io.BytesIO, + src: pathlib.Path | IO[bytes], + dst: pathlib.Path | IO[bytes], **params: str, ) -> None: """convert an image file from one format 
to another @@ -29,12 +29,12 @@ def convert_image( to RGB. ex: RGB, ARGB, CMYK (and other PIL colorspaces)""" colorspace = params.get("colorspace") # requested colorspace - fmt = ( - params.pop("fmt").upper() if "fmt" in params else None # pyright: ignore - ) # requested format + fmt = params.pop("fmt").upper() if "fmt" in params else None # requested format if not fmt: fmt = format_for(dst) - with PIL.Image.open(src) as image: # pyright: ignore + if not fmt: + raise ValueError("Impossible to guess destination image format") + with pilopen(src) as image: if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace: image = image.convert(colorspace or "RGB") # noqa: PLW2901 save_image(image, dst, fmt, **params) @@ -45,13 +45,13 @@ def create_favicon(src: pathlib.Path, dst: pathlib.Path) -> None: if dst.suffix != ".ico": raise ValueError("favicon extension must be ICO") - img = PIL.Image.open(src) # pyright: ignore + img = pilopen(src) w, h = img.size # resize image to square first if w != h: size = min([w, h]) resized = dst.parent.joinpath(f"{src.stem}.tmp.{src.suffix}") resize_image(src, size, size, resized, "contain") - img = PIL.Image.open(resized) # pyright: ignore + img = pilopen(resized) # now convert to ICO save_image(img, dst, "ICO") diff --git a/src/zimscraperlib/image/optimization.py b/src/zimscraperlib/image/optimization.py index 7924c617..865281cb 100644 --- a/src/zimscraperlib/image/optimization.py +++ b/src/zimscraperlib/image/optimization.py @@ -24,10 +24,12 @@ from __future__ import annotations +import functools import io import os import pathlib import subprocess +from typing import Callable import piexif from optimize_images.img_aux_processing import do_reduce_colors, rebuild_palette @@ -35,13 +37,13 @@ from optimize_images.img_dynamic_quality import jpeg_dynamic_quality from PIL import Image -from zimscraperlib.image.convertion import convert_image +from zimscraperlib.image.conversion import convert_image from zimscraperlib.image.probing 
import format_for from zimscraperlib.image.utils import save_image def ensure_matches( - src: pathlib.Path, + src: pathlib.Path | io.BytesIO, fmt: str, ) -> None: """Raise ValueError if src is not of image type `fmt`""" @@ -52,13 +54,14 @@ def ensure_matches( def optimize_png( src: pathlib.Path | io.BytesIO, - dst: pathlib.Path | None = None, - reduce_colors: bool | None = False, # noqa: FBT002 - max_colors: int | None = 256, - fast_mode: bool | None = True, # noqa: FBT002 - remove_transparency: bool | None = False, # noqa: FBT002 - background_color: tuple[int, int, int] | None = (255, 255, 255), - **options, # noqa: ARG001 + dst: pathlib.Path | io.BytesIO | None = None, + max_colors: int = 256, + background_color: tuple[int, int, int] = (255, 255, 255), + *, + reduce_colors: bool | None = False, + fast_mode: bool | None = True, + remove_transparency: bool | None = False, + **_, ) -> pathlib.Path | io.BytesIO: """method to optimize PNG files using a pure python external optimizer @@ -76,34 +79,35 @@ def optimize_png( if remove_transparency is True (tuple containing RGB values) values: (255, 255, 255) | (221, 121, 108) | (XX, YY, ZZ)""" - ensure_matches(src, "PNG") # pyright: ignore + ensure_matches(src, "PNG") img = Image.open(src) if remove_transparency: - img = remove_alpha(img, background_color) # pyright: ignore + img = remove_alpha(img, background_color) if reduce_colors: - img, _, _ = do_reduce_colors(img, max_colors) # pyright: ignore + img, _, _ = do_reduce_colors(img, max_colors) if not fast_mode and img.mode == "P": img, _ = rebuild_palette(img) if dst is None: - dst = io.BytesIO() # pyright: ignore - img.save(dst, optimize=True, format="PNG") # pyright: ignore - if isinstance(dst, io.BytesIO): + dst = io.BytesIO() + img.save(dst, optimize=True, format="PNG") + if not isinstance(dst, pathlib.Path): dst.seek(0) - return dst # pyright: ignore + return dst def optimize_jpeg( src: pathlib.Path | io.BytesIO, - dst: pathlib.Path | None = None, + dst: 
pathlib.Path | io.BytesIO | None = None, quality: int | None = 85, - fast_mode: bool | None = True, # noqa: FBT002 - keep_exif: bool | None = True, # noqa: FBT002 - **options, # noqa: ARG001 + *, + fast_mode: bool | None = True, + keep_exif: bool | None = True, + **_, ) -> pathlib.Path | io.BytesIO: """method to optimize JPEG files using a pure python external optimizer quality: JPEG quality (integer between 1 and 100) @@ -114,7 +118,7 @@ def optimize_jpeg( get dynamic quality value to ensure better compression values: True | False""" - ensure_matches(src, "JPEG") # pyright: ignore + ensure_matches(src, "JPEG") img = Image.open(src) orig_size = ( @@ -124,7 +128,7 @@ def optimize_jpeg( ) had_exif = False - if (isinstance(src, io.BytesIO) and piexif.load(src.getvalue())["Exif"]) or ( + if (not isinstance(src, pathlib.Path) and piexif.load(src.getvalue())["Exif"]) or ( isinstance(src, pathlib.Path) and piexif.load(str(src))["Exif"] ): had_exif = True @@ -138,10 +142,10 @@ def optimize_jpeg( quality_setting, _ = jpeg_dynamic_quality(img) if dst is None: - dst = io.BytesIO() # pyright: ignore + dst = io.BytesIO() img.save( - dst, # pyright: ignore + dst, quality=quality_setting, optimize=True, progressive=use_progressive_jpg, @@ -157,23 +161,22 @@ def optimize_jpeg( str(src.resolve()) if isinstance(src, pathlib.Path) else src.getvalue() ), image=( - str(dst.resolve()) - if isinstance(dst, pathlib.Path) - else dst.getvalue() # pyright: ignore + str(dst.resolve()) if isinstance(dst, pathlib.Path) else dst.getvalue() ), new_file=dst, ) - return dst # pyright: ignore + return dst def optimize_webp( src: pathlib.Path | io.BytesIO, - dst: pathlib.Path | None = None, - lossless: bool | None = False, # noqa: FBT002 + dst: pathlib.Path | io.BytesIO | None = None, quality: int | None = 60, method: int | None = 6, - **options, # noqa: ARG001 + *, + lossless: bool | None = False, + **_, ) -> pathlib.Path | io.BytesIO: """method to optimize WebP using Pillow options lossless: 
Whether to use lossless compression (boolean) @@ -188,7 +191,7 @@ def optimize_webp( refer to the link for more details https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#webp""" - ensure_matches(src, "WEBP") # pyright: ignore + ensure_matches(src, "WEBP") params = { "lossless": lossless, "quality": quality, @@ -197,17 +200,22 @@ def optimize_webp( webp_image = Image.open(src) if dst is None: - dst = io.BytesIO() # pyright: ignore - webp_image.save(dst, format="WEBP", **params) # pyright: ignore - dst.seek(0) # pyright: ignore + dst = io.BytesIO() + webp_image.save(dst, format="WEBP", **params) + dst.seek(0) else: try: - save_image(webp_image, dst, fmt="WEBP", **params) # pyright: ignore + save_image(webp_image, dst, fmt="WEBP", **params) except Exception as exc: - if src.resolve() != dst.resolve() and dst.exists(): # pyright: ignore + if ( + isinstance(src, pathlib.Path) + and isinstance(dst, pathlib.Path) + and src.resolve() != dst.resolve() + and dst.exists() + ): dst.unlink() # pragma: no cover raise exc - return dst # pyright: ignore + return dst def optimize_gif( @@ -215,10 +223,11 @@ def optimize_gif( dst: pathlib.Path, optimize_level: int | None = 1, lossiness: int | None = None, - interlace: bool | None = True, # noqa: FBT002 - no_extensions: bool | None = True, # noqa: FBT002 max_colors: int | None = None, - **options, # noqa: ARG001 + *, + interlace: bool | None = True, + no_extensions: bool | None = True, + **_, ) -> pathlib.Path: """method to optimize GIFs using gifsicle >= 1.92 optimize_level: Optimization level; @@ -266,10 +275,11 @@ def optimize_gif( def optimize_image( src: pathlib.Path, dst: pathlib.Path, - delete_src: bool | None = False, # noqa: FBT002 - convert: bool | str | None = False, # noqa: FBT002 + *, + delete_src: bool | None = False, + convert: bool | str | None = False, **options, -) -> bool: # pyright: ignore +): """Optimize image, automatically selecting correct optimizer delete_src: whether to remove src file 
upon success (boolean) @@ -280,21 +290,41 @@ def optimize_image( "FMT": convert to format FMT (use Pillow names)""" src_format, dst_format = format_for(src, from_suffix=False), format_for(dst) + + if src_format is None: # pragma: no cover + # never supposed to happens since we get format from suffix, but good for type + # checker + code safety / clean errors + raise ValueError("Impossible to guess format from src image") + if dst_format is None: + raise ValueError("Impossible to guess format from dst image") # if requested, convert src to requested format into dst path if convert and src_format != dst_format: src_format = dst_format = convert if isinstance(convert, str) else dst_format - convert_image(src, dst, fmt=src_format) # pyright: ignore + convert_image(src, dst, fmt=src_format) src_img = pathlib.Path(dst) else: src_img = pathlib.Path(src) - { # pyright: ignore - "JPEG": optimize_jpeg, - "PNG": optimize_png, - "GIF": optimize_gif, - "WEBP": optimize_webp, - }.get(src_format)(src_img, dst, **options) + get_optimization_method(src_format)(src_img, dst, **options) # delete src image if requested if delete_src and src.exists() and src.resolve() != dst.resolve(): src.unlink() + + +def get_optimization_method(fmt: str) -> Callable: + """Return the proper optimization method to call for a given image format""" + + def raise_error(*_, orig_format): + raise NotImplementedError( + f"Image format '{orig_format}' cannot yet be optimized" + ) + + fmt = fmt.lower().strip() + return { + "gif": optimize_gif, + "jpg": optimize_jpeg, + "jpeg": optimize_jpeg, + "webp": optimize_webp, + "png": optimize_png, + }.get(fmt, functools.partial(raise_error, orig_format=fmt)) diff --git a/src/zimscraperlib/image/probing.py b/src/zimscraperlib/image/probing.py index 045750ab..6b28091e 100644 --- a/src/zimscraperlib/image/probing.py +++ b/src/zimscraperlib/image/probing.py @@ -14,7 +14,7 @@ def get_colors( - src: pathlib.Path, use_palette: bool | None = True # noqa: FBT002 + src: 
pathlib.Path, *, use_palette: bool | None = True ) -> tuple[str, str]: """(main, secondary) HTML color codes from an image path""" @@ -49,25 +49,29 @@ def solarize(r: int, g: int, b: int) -> tuple[int, int, int]: def is_hex_color(text: str) -> bool: """whether supplied text is a valid hex-formated color code""" - return re.search(r"^#(?:[0-9a-fA-F]{3}){1,2}$", text) # pyright: ignore + return bool(re.search(r"^#(?:[0-9a-fA-F]{3}){1,2}$", text)) def format_for( src: pathlib.Path | IO[bytes], - from_suffix: bool = True, # noqa: FBT001, FBT002 -) -> str: + *, + from_suffix: bool = True, +) -> str | None: """Pillow format of a given filename, either Pillow-detected or from suffix""" if not from_suffix: with PIL.Image.open(src) as img: - return img.format # pyright: ignore + return img.format - from PIL.Image import EXTENSION as ext_fmt_map # noqa: N811 + if not isinstance(src, pathlib.Path): + raise ValueError( + "Cannot guess image format from file suffix when byte array is passed" + ) + + from PIL.Image import EXTENSION as PIL_FMT_EXTENSION from PIL.Image import init as init_pil init_pil() - return ext_fmt_map[ - src.suffix # pyright: ignore - ] # might raise KeyError on unknown extension + return PIL_FMT_EXTENSION[src.suffix] if src.suffix in PIL_FMT_EXTENSION else None def is_valid_image( diff --git a/src/zimscraperlib/image/transformation.py b/src/zimscraperlib/image/transformation.py index e49b5eb5..8732d099 100644 --- a/src/zimscraperlib/image/transformation.py +++ b/src/zimscraperlib/image/transformation.py @@ -6,7 +6,7 @@ import io import pathlib -import PIL +from PIL.Image import open as pilopen from resizeimage import resizeimage from zimscraperlib.constants import ALPHA_NOT_SUPPORTED @@ -19,14 +19,15 @@ def resize_image( height: int | None = None, dst: pathlib.Path | io.BytesIO | None = None, method: str | None = "width", - allow_upscaling: bool | None = True, # noqa: FBT002 + *, + allow_upscaling: bool | None = True, **params: str, ) -> None: """resize an 
image to requested dimensions methods: width, height, cover, thumbnail allow upscaling: upscale image first, preserving aspect ratio if required""" - with PIL.Image.open(src) as image: # pyright: ignore + with pilopen(src) as image: # preserve image format as resize() does not transmit it into new object image_format = image.format image_mode = image.mode @@ -59,9 +60,12 @@ def resize_image( if dst is None and isinstance(src, io.BytesIO): src.seek(0) + if image_format is None: # pragma: no cover + raise ValueError("Impossible to guess format from src image") + save_image( resized, - dst if dst is not None else src, # pyright: ignore + dst if dst is not None else src, image_format, **params, ) diff --git a/src/zimscraperlib/image/utils.py b/src/zimscraperlib/image/utils.py index fad99020..af1ed57e 100644 --- a/src/zimscraperlib/image/utils.py +++ b/src/zimscraperlib/image/utils.py @@ -2,19 +2,20 @@ # vim: ai ts=4 sts=4 et sw=4 nu from __future__ import annotations -import io import pathlib +from typing import IO -from PIL import Image +from PIL.Image import Image +from PIL.ImageFile import ImageFile def save_image( - src: Image, # pyright: ignore - dst: pathlib.Path | io.BytesIO, - fmt: str | None = None, + src: Image | ImageFile, + dst: pathlib.Path | IO[bytes], + fmt: str, **params: str, ) -> None: """PIL.Image.save() wrapper setting default parameters""" - args = {"JPEG": {"quality": 100}, "PNG": {}}.get(fmt, {}) # pyright: ignore + args = {"JPEG": {"quality": 100}, "PNG": {}}.get(fmt, {}) args.update(params or {}) - src.save(dst, fmt, **args) # pyright: ignore + src.save(dst, fmt, **args) diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py index c6c1efd0..438a9fc0 100644 --- a/src/zimscraperlib/inputs.py +++ b/src/zimscraperlib/inputs.py @@ -23,8 +23,9 @@ def handle_user_provided_file( source: pathlib.Path | str | None = None, dest: pathlib.Path | None = None, in_dir: pathlib.Path | None = None, - nocopy: bool = False, # noqa: FBT001, FBT002 
user_agent: str | None = DEFAULT_USER_AGENT, + *, + nocopy: bool = False, ) -> pathlib.Path | None: """path to downloaded or copied a user provided file (URL or path) diff --git a/src/zimscraperlib/logging.py b/src/zimscraperlib/logging.py index fedfc69f..f2f12ce5 100644 --- a/src/zimscraperlib/logging.py +++ b/src/zimscraperlib/logging.py @@ -3,12 +3,12 @@ from __future__ import annotations -import io import logging import pathlib import sys from collections.abc import Iterable from logging.handlers import RotatingFileHandler +from typing import TextIO from zimscraperlib.constants import NAME @@ -16,18 +16,18 @@ VERBOSE_DEPENDENCIES = ["urllib3", "PIL", "boto3", "botocore", "s3transfer"] -def getLogger( # noqa: N802 +def getLogger( # noqa: N802 (intentionally matches the stdlib getLogger name) name: str, - level: int | None = logging.INFO, - console: io.TextIOBase | None = sys.stdout, # pyright: ignore + level: int = logging.INFO, + console: TextIO | None = sys.stdout, log_format: str | None = DEFAULT_FORMAT, - file: pathlib.Path | None = False, # noqa: FBT002 # pyright: ignore + file: pathlib.Path | None = None, file_level: int | None = None, file_format: str | None = None, - file_max: int | None = 2**20, - file_nb_backup: int | None = 1, - deps_level: int | None = logging.WARNING, # noqa: ARG001 - additional_deps: Iterable | None = None, + file_max: int = 2**20, + file_nb_backup: int = 1, + deps_level: int = logging.WARNING, + additional_deps: Iterable[str] | None = None, ): """configured logger for most usages @@ -41,16 +41,17 @@ def getLogger( # noqa: N802 - deps_level: log level for idendified verbose dependencies - additional_deps: additional modules names of verbose dependencies to assign deps_level to""" - if additional_deps is None: + + if not additional_deps: additional_deps = [] # align zimscraperlib logging level to that of scraper - logging.Logger(NAME).setLevel(level) # pyright: ignore + logging.Logger(NAME).setLevel(level) # set arbitrary level for 
some known verbose dependencies # prevents them from polluting logs - for logger_name in set(VERBOSE_DEPENDENCIES + additional_deps): # pyright: ignore - logging.getLogger(logger_name).setLevel(logging.WARNING) + for logger_name in set(VERBOSE_DEPENDENCIES + list(additional_deps)): + logging.getLogger(logger_name).setLevel(deps_level) logger = logging.Logger(name) logger.setLevel(logging.DEBUG) @@ -59,25 +60,25 @@ def getLogger( # noqa: N802 if console: console_handler = logging.StreamHandler(console) console_handler.setFormatter(logging.Formatter(log_format)) - console_handler.setLevel(level) # pyright: ignore + console_handler.setLevel(level) logger.addHandler(console_handler) if file: - file_handler = RotatingFileHandler( # pyright: ignore + file_handler = RotatingFileHandler( file, - maxBytes=file_max, # pyright: ignore - backupCount=file_nb_backup, # pyright: ignore + maxBytes=file_max, + backupCount=file_nb_backup, ) file_handler.setFormatter(logging.Formatter(file_format or log_format)) - file_handler.setLevel(file_level or level) # pyright: ignore + file_handler.setLevel(file_level or level) logger.addHandler(file_handler) return logger -def nicer_args_join(args: Iterable) -> str: +def nicer_args_join(args: list[str]) -> str: """slightly better concateated list of subprocess args for display""" - nargs = args[0:1] # pyright: ignore - for arg in args[1:]: # pyright: ignore + nargs = args[0:1] + for arg in args[1:]: nargs.append(arg if arg.startswith("-") else f'"{arg}"') return " ".join(nargs) diff --git a/src/zimscraperlib/misc.py b/src/zimscraperlib/misc.py index 49af4b61..9d9b25f2 100644 --- a/src/zimscraperlib/misc.py +++ b/src/zimscraperlib/misc.py @@ -2,7 +2,11 @@ from __future__ import annotations +from typing import TypeVar -def first(*args: object | None) -> object: - """first non-None value from *args ; fallback to empty string""" - return next((item for item in args if item is not None), "") +T = TypeVar("T") + + +def first(*args: T | None, 
default: T = "") -> T: + """Return the first non-None value from *args; fallback to an empty string.""" + return next((item for item in args if item is not None), default) diff --git a/src/zimscraperlib/types.py b/src/zimscraperlib/types.py index cc053f53..228b2103 100644 --- a/src/zimscraperlib/types.py +++ b/src/zimscraperlib/types.py @@ -39,8 +39,8 @@ def get_mime_for_name( filename: str | pathlib.Path, fallback: str | None = FALLBACK_MIME, - no_ext_to=ARTICLE_MIME, -) -> str: + no_ext_to: str | None = ARTICLE_MIME, +) -> str | None: """MIME-Type string from a filename filename is a string, not a path (doesn't need to exist) @@ -51,11 +51,9 @@ def get_mime_for_name( filename = pathlib.Path(filename) if not filename.suffix: return no_ext_to - return ( - mimetypes.guess_type(f"{filename.stem}{filename.suffix}")[0] or fallback - ) # pyright: ignore + return mimetypes.guess_type(f"{filename.stem}{filename.suffix}")[0] or fallback except Exception: - return fallback # pyright: ignore + return fallback def init_types(): diff --git a/src/zimscraperlib/uri.py b/src/zimscraperlib/uri.py index 3b6a8753..323b826d 100644 --- a/src/zimscraperlib/uri.py +++ b/src/zimscraperlib/uri.py @@ -19,22 +19,23 @@ def rebuild_uri( params: str | None = None, query: str | None = None, fragment: str | None = None, - failsafe: bool = False, # noqa: FBT001, FBT002 + *, + failsafe: bool = False, ) -> urllib.parse.ParseResult: """new ParseResult named tuple from uri with requested part updated""" try: - username = first(username, uri.username, "") # pyright: ignore - password = first(password, uri.password, "") # pyright: ignore - hostname = first(hostname, uri.hostname, "") # pyright: ignore - port = first(port, uri.port, "") # pyright: ignore + username = first(username, uri.username) + password = first(password, uri.password) + hostname = first(hostname, uri.hostname) + port = first(port, uri.port) netloc = ( f"{username}{':' if password else ''}{password}" f"{'@' if username or password 
else ''}{hostname}" f"{':' if port else ''}{port}" ) - return urllib.parse.urlparse( # pyright: ignore - urllib.parse.urlunparse( # pyright: ignore - ( # pyright: ignore + return urllib.parse.urlparse( + urllib.parse.urlunparse( + ( first(scheme, uri.scheme), netloc, first(path, uri.path), diff --git a/src/zimscraperlib/video/encoding.py b/src/zimscraperlib/video/encoding.py index 6275e641..bedbcd3d 100644 --- a/src/zimscraperlib/video/encoding.py +++ b/src/zimscraperlib/video/encoding.py @@ -38,14 +38,15 @@ def _build_ffmpeg_args( def reencode( - src_path, - dst_path, - ffmpeg_args, - delete_src=False, # noqa: FBT002 - with_process=False, # noqa: FBT002 - failsafe=True, # noqa: FBT002 + src_path: pathlib.Path, + dst_path: pathlib.Path, + ffmpeg_args: list[str], threads: int | None = 1, -): + *, + delete_src: bool = False, + with_process: bool = False, + failsafe: bool = True, +) -> tuple[bool, subprocess.CompletedProcess[str]] | bool: """Runs ffmpeg with given ffmpeg_args Arguments - diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py index 0cc1a7f8..c20357c8 100644 --- a/src/zimscraperlib/zim/_libkiwix.py +++ b/src/zimscraperlib/zim/_libkiwix.py @@ -24,7 +24,7 @@ ] -def getline(src: io.StringIO, delim: bool | None = None) -> tuple[bool, str]: +def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]: """C++ stdlib getline() ~clone Reads `src` until it finds `delim`. 
@@ -49,10 +49,10 @@ def readFullMimetypeAndCounterString( Returns whether the source is EOF and the extracted string (or empty one)""" params = "" - eof, mtcStr = getline(src, ";") # pyright: ignore + eof, mtcStr = getline(src, ";") if mtcStr.find("=") == -1: while params.count("=") != 2: # noqa: PLR2004 - eof, params = getline(src, ";") # pyright: ignore + eof, params = getline(src, ";") if params.count("=") == 2: # noqa: PLR2004 mtcStr += ";" + params if eof: diff --git a/src/zimscraperlib/zim/archive.py b/src/zimscraperlib/zim/archive.py index e974bd2c..81273947 100644 --- a/src/zimscraperlib/zim/archive.py +++ b/src/zimscraperlib/zim/archive.py @@ -41,7 +41,7 @@ def metadata(self) -> dict[str, str]: def tags(self): return self.get_tags() - def get_tags(self, libkiwix: bool = False) -> list[str]: # noqa: FBT001, FBT002 + def get_tags(self, *, libkiwix: bool = False) -> list[str]: """List of ZIM tags, optionnaly expanded with libkiwix's hints""" try: tags_meta = self.get_text_metadata("Tags") diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index ce3625d0..408a19dd 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -76,13 +76,13 @@ def mimetype_for( content: bytes | str | None = None, fpath: pathlib.Path | None = None, mimetype: str | None = None, -) -> str: +) -> str | None: """mimetype as provided or guessed from fpath, path or content""" if not mimetype: mimetype = ( get_file_mimetype(fpath) if fpath - else get_content_mimetype(content[:2048]) # pyright: ignore + else get_content_mimetype(content[:2048]) if content else None ) # try to guess more-defined mime if it's text if ( @@ -90,7 +90,9 @@ def mimetype_for( or mimetype == "application/octet-stream" or mimetype.startswith("text/") ): - mimetype = get_mime_for_name(fpath if fpath else path, mimetype, mimetype) + mimetype = get_mime_for_name( + filename=fpath if fpath else path, fallback=mimetype, no_ext_to=mimetype + ) return mimetype @@ 
-120,9 +122,10 @@ def __init__( filename: pathlib.Path, main_path: str, compression: str | None = None, - workaround_nocancel: bool | None = True, # noqa: FBT002 - ignore_duplicates: bool | None = False, # noqa: FBT002 - disable_metadata_checks: bool = False, # noqa: FBT001, FBT002 + *, + workaround_nocancel: bool | None = True, + ignore_duplicates: bool | None = False, + disable_metadata_checks: bool = False, ): super().__init__(filename=filename) self._metadata = {} @@ -223,14 +226,14 @@ def start(self): del self._metadata["Illustration_48x48@1"] for name, value in self._metadata.items(): if value: - self.add_metadata(name, value) + self.add_metadata(name, self.convert_and_check_metadata(name, value)) return self def validate_metadata( self, name: str, - value: bytes | str | datetime.datetime | datetime.date | Iterable[str], + value: bytes | str, ): """Ensures metadata value for name is conform with the openZIM spec on Metadata @@ -238,7 +241,7 @@ def validate_metadata( See https://wiki.openzim.org/wiki/Metadata""" validate_required_values(name, value) - validate_standard_str_types(name, value) # pyright: ignore + validate_standard_str_types(name, value) validate_title(name, value) # pyright: ignore validate_date(name, value) # pyright: ignore @@ -249,10 +252,37 @@ def validate_metadata( validate_tags(name, value) # pyright: ignore validate_illustrations(name, value) # pyright: ignore + def convert_and_check_metadata( + self, + name: str, + value: str | bytes | datetime.date | datetime.datetime | Iterable[str], + ) -> str | bytes: + """Convert metadata to appropriate type for few known usecase and check type + + Date: converts date and datetime to string YYYY-MM-DD + Tags: converts iterable to string with semi-colon separator + + Also checks that final type is appropriate for libzim (str or bytes) + """ + if name == "Date" and isinstance(value, (datetime.date, datetime.datetime)): + value = value.strftime("%Y-%m-%d") + if ( + name == "Tags" + and not 
isinstance(value, str) + and not isinstance(value, bytes) + and isinstance(value, Iterable) + ): + value = ";".join(value) + + if not isinstance(value, str) and not isinstance(value, bytes): + raise ValueError(f"Invalid type for {name}: {type(value)}") + + return value + def add_metadata( self, name: str, - content: str | bytes | datetime.date | datetime.datetime | Iterable[str], + value: str | bytes, mimetype: str = "text/plain;charset=UTF-8", ): # drop control characters before passing them to libzim @@ -261,18 +291,11 @@ def add_metadata( " \r\n\t" ) if not self.disable_metadata_checks: - self.validate_metadata(name, content) - if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)): - content = content.strftime("%Y-%m-%d").encode("UTF-8") - if ( - name == "Tags" - and not isinstance(content, str) - and not isinstance(content, bytes) - and isinstance(content, Iterable) - ): - content = ";".join(content) - super().add_metadata(name, content, mimetype) + self.validate_metadata(name, value) + super().add_metadata(name, value, mimetype) + + # there are many N803 problems, but they are intentional to match real tag name def config_metadata( self, *, @@ -291,7 +314,16 @@ def config_metadata( Source: str | None = None, # noqa: N803 License: str | None = None, # noqa: N803 Relation: str | None = None, # noqa: N803 - **extras: str, + **extras: ( + None + | float + | int + | bytes + | str + | datetime.datetime + | datetime.date + | Iterable[str] + ), ): """Sets all mandatory Metadata as well as standard and any other text ones""" self._metadata.update( @@ -323,7 +355,19 @@ def config_metadata( ).strip(" \r\n\t") return self - def config_dev_metadata(self, **extras: str): + def config_dev_metadata( + self, + **extras: ( + None + | int + | float + | bytes + | str + | datetime.datetime + | datetime.date + | Iterable[str] + ), + ): """Calls config_metadata with default (yet overridable) values for dev""" devel_default_metadata = 
DEFAULT_DEV_ZIM_METADATA.copy() devel_default_metadata.update(extras) @@ -333,12 +377,13 @@ def add_item_for( self, path: str, title: str | None = None, + *, fpath: pathlib.Path | None = None, content: bytes | str | None = None, mimetype: str | None = None, is_front: bool | None = None, should_compress: bool | None = None, - delete_fpath: bool | None = False, # noqa: FBT002 + delete_fpath: bool | None = False, duplicate_ok: bool | None = None, callback: Callable | tuple[Callable, Any] | None = None, index_data: IndexData | None = None, diff --git a/src/zimscraperlib/zim/filesystem.py b/src/zimscraperlib/zim/filesystem.py index c99e98c1..ecd677d0 100644 --- a/src/zimscraperlib/zim/filesystem.py +++ b/src/zimscraperlib/zim/filesystem.py @@ -104,16 +104,22 @@ def add_redirects_to_zim( if redirects_file: with open(redirects_file) as fh: - for line in fh.readlines(): - namespace, path, title, target_url = re.match( - r"^(.)\t(.+)\t(.*)\t(.+)$", line - ).groups() # pyright: ignore + for linenumber, line in enumerate(fh.readlines()): + match = re.match(r"^(.)\t(.+)\t(.*)\t(.+)$", line) + if not match: + logger.warning( + f"Redirects file: line {linenumber} does not match expected " + f"regexp: {line}" + ) + continue + namespace, path, title, target_url = match.groups() if namespace.strip(): path = f"{namespace.strip()}/{path}" zim_file.add_redirect(path, target_url, title) def make_zim_file( + *, build_dir: pathlib.Path, fpath: pathlib.Path, name: str, @@ -121,22 +127,22 @@ def make_zim_file( illustration: str, title: str, description: str, - date: datetime.date = None, # noqa: RUF013 # pyright: ignore + date: datetime.date | None = None, language: str = "eng", creator: str = "-", publisher="-", - tags: Sequence[str] = None, # noqa: RUF013 # pyright: ignore - source: str = None, # noqa: RUF013 # pyright: ignore - flavour: str = None, # noqa: RUF013 # pyright: ignore - scraper: str = None, # noqa: RUF013 # pyright: ignore - long_description: str = None, # noqa: RUF013 # 
pyright: ignore - without_fulltext_index: bool = False, # noqa: FBT001, FBT002, ARG001 - redirects: Sequence[tuple[str, str, str]] = None, # noqa: RUF013 # pyright: ignore - redirects_file: pathlib.Path = None, # noqa: RUF013 # pyright: ignore - rewrite_links: bool = True, # noqa: FBT001, FBT002, ARG001 - workaround_nocancel: bool = True, # noqa: FBT001, FBT002 - ignore_duplicates: bool = True, # noqa: FBT001, FBT002 - disable_metadata_checks: bool = False, # noqa: FBT001, FBT002 + tags: Sequence[str] | None = None, + source: str | None = None, + flavour: str | None = None, + scraper: str | None = None, + long_description: str | None = None, + without_fulltext_index: bool = False, # noqa: ARG001 + redirects: Sequence[tuple[str, str, str]] | None = None, + redirects_file: pathlib.Path | None = None, + rewrite_links: bool = True, # noqa: ARG001 + workaround_nocancel: bool = True, + ignore_duplicates: bool = True, + disable_metadata_checks: bool = False, ): """Creates a zimwriterfs-like ZIM file at {fpath} from {build_dir} diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index 8e6a4cce..b3484e92 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -282,7 +282,7 @@ def get_path(self) -> str: def get_title(self) -> str: return getattr(self, "title", "") - def get_mimetype(self) -> str: + def get_mimetype(self) -> str | None: return getattr( self, "mimetype", diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 6d5ec7fc..3db12c7b 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -29,7 +29,10 @@ def validate_required_values(name: str, value: Any): raise ValueError(f"Missing value for {name}") -def validate_standard_str_types(name: str, value: str): +def validate_standard_str_types( + name: str, + value: str | bytes, +): """ensures standard string metadata are indeed str""" if name in ( "Name", @@ -45,7 +48,7 @@ def 
validate_standard_str_types(name: str, value: str): "Source", "Scraper", ) and not isinstance(value, str): - raise ValueError(f"Invalid type for {name}") + raise ValueError(f"Invalid type for {name}: {type(value)}") def validate_title(name: str, value: str): @@ -58,16 +61,13 @@ def validate_date(name: str, value: datetime.datetime | datetime.date | str): """ensures Date metadata can be casted to an ISO 8601 string""" if name == "Date": if not isinstance(value, (datetime.datetime, datetime.date, str)): - raise ValueError(f"Invalid type for {name}.") + raise ValueError(f"Invalid type for {name}: {type(value)}") elif isinstance(value, str): match = re.match(r"(?P\d{4})-(?P\d{2})-(?P\d{2})", value) + if not match: + raise ValueError(f"Invalid {name} format, not matching regex") try: - datetime.date( - **{ - k: int(v) - for k, v in match.groupdict().items() # pyright: ignore - } - ) + datetime.date(**{k: int(v) for k, v in match.groupdict().items()}) except Exception as exc: raise ValueError(f"Invalid {name} format: {exc}") from None @@ -82,7 +82,7 @@ def validate_language(name: str, value: Iterable[str] | str): raise ValueError(f"{code} is not ISO-639-3.") -def validate_counter(name: str, value: str): # noqa: ARG001 +def validate_counter(name: str, _: str): """ensures Counter metadata is not manually set""" if name == "Counter": raise ValueError(f"{name} cannot be set. 
libzim sets it.") diff --git a/src/zimscraperlib/zim/providers.py b/src/zimscraperlib/zim/providers.py index 3d127f09..a46300be 100644 --- a/src/zimscraperlib/zim/providers.py +++ b/src/zimscraperlib/zim/providers.py @@ -45,7 +45,7 @@ class FileLikeProvider(libzim.writer.ContentProvider): def __init__( self, - fileobj: io.IOBase, + fileobj: io.BytesIO, size: int | None = None, ref: object | None = None, ): @@ -62,9 +62,7 @@ def get_size(self) -> int: return self.size # pyright: ignore def gen_blob(self) -> libzim.writer.Blob: - yield libzim.writer.Blob( # pragma: no cover - self.fileobj.getvalue() # pyright: ignore - ) + yield libzim.writer.Blob(self.fileobj.getvalue()) # pragma: no cover class URLProvider(libzim.writer.ContentProvider): diff --git a/tests/download/test_download.py b/tests/download/test_download.py index 9d82a72a..8a469a25 100644 --- a/tests/download/test_download.py +++ b/tests/download/test_download.py @@ -11,6 +11,7 @@ import pytest import requests +import requests.structures from yt_dlp import DownloadError from zimscraperlib.download import ( @@ -33,9 +34,7 @@ def assert_downloaded_file(url, file): def assert_headers(returned_headers): - assert isinstance( - returned_headers, requests.structures.CaseInsensitiveDict # pyright: ignore - ) + assert isinstance(returned_headers, requests.structures.CaseInsensitiveDict) assert returned_headers["Content-Type"] == "image/x-icon" diff --git a/tests/files/single_wave_icon.gbr b/tests/files/single_wave_icon.gbr new file mode 100644 index 00000000..28f6d00a Binary files /dev/null and b/tests/files/single_wave_icon.gbr differ diff --git a/tests/html/conftest.py b/tests/html/conftest.py index f31cbcef..9716e17f 100644 --- a/tests/html/conftest.py +++ b/tests/html/conftest.py @@ -23,3 +23,23 @@ def html_page(): """ + + +@pytest.fixture(scope="function") +def html_page_without_title(): + """sample HTML content without title""" + return """ + + + + + + + + + + +""" diff --git a/tests/html/test_html.py 
b/tests/html/test_html.py index af216069..011981a8 100644 --- a/tests/html/test_html.py +++ b/tests/html/test_html.py @@ -11,7 +11,7 @@ ) -def test_find_title(tmp_path, html_page): +def test_find_title(tmp_path, html_page, html_page_without_title): # find title in example HTML assert ( find_title_in(html_page, "text/html") @@ -21,6 +21,8 @@ def test_find_title(tmp_path, html_page): assert find_title_in(html_page, "text/plain") == "" # make sure non-html, even if using html mime returns no title assert find_title_in("title: Kiwix", "text/html") == "" + # make sure HTML without title returns no title + assert find_title_in(html_page_without_title, "text/html") == "" # find title in local file fpath = tmp_path / "test.html" diff --git a/tests/i18n/test_i18n.py b/tests/i18n/test_i18n.py index 75a7226c..06ccdcdc 100644 --- a/tests/i18n/test_i18n.py +++ b/tests/i18n/test_i18n.py @@ -7,7 +7,7 @@ import pytest from zimscraperlib.i18n import ( - NotFound, + NotFoundError, _, find_language_names, get_language_details, @@ -187,7 +187,7 @@ def test_selocale_unsupported(tmp_path): def test_lang_details(query, expected): if expected is None: assert get_language_details(query, failsafe=True) == expected - with pytest.raises(NotFound): + with pytest.raises(NotFoundError): get_language_details(query) else: assert get_language_details(query) == expected diff --git a/tests/image/test_image.py b/tests/image/test_image.py index 0953a093..9cfd1af8 100644 --- a/tests/image/test_image.py +++ b/tests/image/test_image.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # vim: ai ts=4 sts=4 et sw=4 nu +from __future__ import annotations + import inspect import io import os @@ -15,9 +17,10 @@ from resizeimage.imageexceptions import ImageSizeError from zimscraperlib.image import presets -from zimscraperlib.image.convertion import convert_image, create_favicon +from zimscraperlib.image.conversion import convert_image, create_favicon from zimscraperlib.image.optimization import ( ensure_matches, + 
get_optimization_method, optimize_gif, optimize_image, optimize_jpeg, @@ -54,24 +57,22 @@ def get_image_size(fpath): return Image.open(fpath).size -def get_optimization_method(fmt): - return { - "gif": optimize_gif, - "jpg": optimize_jpeg, - "webp": optimize_webp, - "png": optimize_png, - }.get(fmt) - - def get_src_dst( - tmp_path, fmt, png_image=None, jpg_image=None, gif_image=None, webp_image=None -): - return ( - {"png": png_image, "jpg": jpg_image, "webp": webp_image, "gif": gif_image}.get( - fmt - ), - tmp_path / f"out.{fmt}", - ) + tmp_path: pathlib.Path, + fmt, + png_image: pathlib.Path | None = None, + jpg_image: pathlib.Path | None = None, + gif_image: pathlib.Path | None = None, + webp_image: pathlib.Path | None = None, +) -> tuple[pathlib.Path, pathlib.Path]: + options = {"png": png_image, "jpg": jpg_image, "webp": webp_image, "gif": gif_image} + if fmt not in options: + raise LookupError(f"Unsupported fmt passed: {fmt}") + src = options[fmt] + if not src: + raise LookupError(f"fmt passed has no corresponding argument: {fmt}") + else: + return (src, tmp_path / f"out.{fmt}") @pytest.mark.parametrize( @@ -95,38 +96,36 @@ def test_is_hex_color(value, valid): def test_colors_noimage(): with pytest.raises(FileNotFoundError): - get_colors("nofile.here") # pyright: ignore + get_colors(pathlib.Path("nofile.here")) def test_colors_png_nopalette(png_image): - assert get_colors(png_image, False) == ("#04659B", "#E7F6FF") + assert get_colors(png_image, use_palette=False) == ("#04659B", "#E7F6FF") def test_colors_jpg_nopalette(jpg_image): - assert get_colors(jpg_image, False) == ("#C1BBB3", "#F4F3F1") + assert get_colors(jpg_image, use_palette=False) == ("#C1BBB3", "#F4F3F1") def test_colors_png_palette(png_image): - assert get_colors(png_image, True) == ("#9E0404", "#E7F6FF") + assert get_colors(png_image, use_palette=True) == ("#9E0404", "#E7F6FF") def test_colors_jpg_palette(jpg_image): - assert get_colors(jpg_image, True) == ("#221C1B", "#F4F3F1") + assert 
get_colors(jpg_image, use_palette=True) == ("#221C1B", "#F4F3F1") @pytest.mark.parametrize( - "fmt,params", - [("png", None), ("jpg", {"quality": 50})], + "src_fmt,dst_fmt,params", + [ + ("png", "png", None), + ("jpg", "JPEG", {"quality": 50}), + ], ) -def test_save_image(png_image, jpg_image, tmp_path, fmt, params): - src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) - img = Image.open(src) # pyright: ignore - if params: - save_image( - img, dst, "JPEG" if fmt == "jpg" else fmt, **params # pyright: ignore - ) - else: - save_image(img, dst, "JPEG" if fmt == "jpg" else fmt) # pyright: ignore +def test_save_image(png_image, jpg_image, tmp_path, src_fmt, dst_fmt, params): + src, dst = get_src_dst(tmp_path, src_fmt, png_image=png_image, jpg_image=jpg_image) + img = Image.open(src) + save_image(img, dst, fmt=dst_fmt, **(params or {})) assert pathlib.Path(dst).exists() @@ -138,7 +137,7 @@ def test_resize_thumbnail(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 100, 50 - resize_image(src, width, height, dst=dst, method="thumbnail") # pyright: ignore + resize_image(src, width, height, dst=dst, method="thumbnail") tw, th = get_image_size(dst) assert tw <= width assert th <= height @@ -152,8 +151,8 @@ def test_resize_bytestream(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) # copy image content into a bytes stream - img = io.BytesIO() # pyright: ignore - with open(src, "rb") as srch: # pyright: ignore + img = io.BytesIO() + with open(src, "rb") as srch: img.write(srch.read()) # resize in place (no dst) @@ -172,7 +171,7 @@ def test_resize_width(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 100, 50 - resize_image(src, width, height, dst=dst, method="width") # pyright: ignore + resize_image(src, width, 
height, dst=dst, method="width") tw, _ = get_image_size(dst) assert tw == width @@ -185,7 +184,7 @@ def test_resize_height(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 100, 50 - resize_image(src, width, height, dst=dst, method="height") # pyright: ignore + resize_image(src, width, height, dst=dst, method="height") _, th = get_image_size(dst) assert th == height @@ -198,7 +197,7 @@ def test_resize_crop(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 5, 50 - resize_image(src, width, height, dst=dst, method="crop") # pyright: ignore + resize_image(src, width, height, dst=dst, method="crop") tw, th = get_image_size(dst) assert tw == width assert th == height @@ -212,7 +211,7 @@ def test_resize_cover(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 5, 50 - resize_image(src, width, height, dst=dst, method="cover") # pyright: ignore + resize_image(src, width, height, dst=dst, method="cover") tw, th = get_image_size(dst) assert tw == width assert th == height @@ -226,7 +225,7 @@ def test_resize_contain(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 5, 50 - resize_image(src, width, height, dst=dst, method="contain") # pyright: ignore + resize_image(src, width, height, dst=dst, method="contain") tw, th = get_image_size(dst) assert tw <= width assert th <= height @@ -240,7 +239,7 @@ def test_resize_upscale(png_image, jpg_image, tmp_path, fmt): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image, jpg_image=jpg_image) width, height = 500, 1000 - resize_image(src, width, height, dst=dst, method="cover") # pyright: ignore + resize_image(src, width, height, dst=dst, method="cover") tw, th = get_image_size(dst) assert tw == width 
assert th == height @@ -256,7 +255,7 @@ def test_resize_small_image_error(png_image, jpg_image, tmp_path, fmt): width, height = 500, 1000 with pytest.raises(ImageSizeError): resize_image( - src, # pyright: ignore + src, width, height, dst=dst, @@ -274,7 +273,7 @@ def test_change_image_format( ): src, _ = get_src_dst(tmp_path, src_fmt, png_image=png_image, jpg_image=jpg_image) dst = tmp_path / f"out.{dst_fmt.lower()}" - convert_image(src, dst, fmt=dst_fmt, colorspace=colorspace) # pyright: ignore + convert_image(src, dst, fmt=dst_fmt, colorspace=colorspace) dst_image = Image.open(dst) if colorspace: assert dst_image.mode == colorspace @@ -312,6 +311,15 @@ def test_convert_io_src_path_dst(png_image: pathlib.Path, tmp_path: pathlib.Path assert dst_image.format == "PNG" +def test_convert_io_src_bad_dst(png_image: pathlib.Path, tmp_path: pathlib.Path): + src = io.BytesIO(png_image.read_bytes()) + dst = tmp_path / "test.raster" + with pytest.raises( + ValueError, match="Impossible to guess destination image format" + ): + convert_image(src, dst) + + def test_convert_path_src_io_dst(png_image: pathlib.Path): src = png_image dst = io.BytesIO() @@ -327,7 +335,7 @@ def test_convert_path_src_io_dst(png_image: pathlib.Path): def test_create_favicon(png_image2, jpg_image, tmp_path, fmt, exp_size): src, dst = get_src_dst(tmp_path, fmt, png_image=png_image2, jpg_image=jpg_image) dst = dst.parent.joinpath("favicon.ico") - create_favicon(src, dst) # pyright: ignore + create_favicon(src, dst) im = Image.open(dst) assert im.format == "ICO" @@ -343,7 +351,7 @@ def test_create_favicon_square(square_png_image, square_jpg_image, tmp_path, fmt tmp_path, fmt, png_image=square_png_image, jpg_image=square_jpg_image ) dst = dst.parent.joinpath("favicon.ico") - create_favicon(src, dst) # pyright: ignore + create_favicon(src, dst) im = Image.open(dst) assert im.format == "ICO" @@ -359,7 +367,7 @@ def test_wrong_extension(square_png_image, square_jpg_image, tmp_path, fmt): tmp_path, fmt, 
png_image=square_png_image, jpg_image=square_jpg_image ) with pytest.raises(ValueError): - create_favicon(src, dst) # pyright: ignore + create_favicon(src, dst) @pytest.mark.parametrize( @@ -377,8 +385,8 @@ def test_optimize_image_default( gif_image=gif_image, webp_image=webp_image, ) - optimize_image(src, dst, delete_src=False) # pyright: ignore - assert os.path.getsize(dst) < os.path.getsize(src) # pyright: ignore + optimize_image(src, dst, delete_src=False) + assert os.path.getsize(dst) < os.path.getsize(src) def test_optimize_image_del_src(png_image, tmp_path): @@ -400,6 +408,14 @@ def test_optimize_image_allow_convert(png_image, tmp_path): assert dst.exists() and os.path.getsize(dst) > 0 +def test_optimize_image_bad_dst(png_image, tmp_path): + shutil.copy(png_image, tmp_path) + src = tmp_path / png_image.name + dst = tmp_path / "out.raster" + with pytest.raises(ValueError, match="Impossible to guess format from dst image"): + optimize_image(src, dst, delete_src=True, convert=True) + + @pytest.mark.parametrize( "preset,expected_version,options,fmt", [ @@ -494,20 +510,27 @@ def test_preset( gif_image=gif_image, webp_image=webp_image, ) - optimize_image(src, dst, delete_src=False, **preset.options) # pyright: ignore - assert os.path.getsize(dst) < os.path.getsize(src) # pyright: ignore + optimize_image(src, dst, delete_src=False, **preset.options) + assert os.path.getsize(dst) < os.path.getsize(src) if fmt in ["jpg", "webp", "png"]: - image_bytes = "" # pyright: ignore - with open(src, "rb") as fl: # pyright: ignore + image_bytes = "" + with open(src, "rb") as fl: image_bytes = fl.read() byte_stream = io.BytesIO(image_bytes) - dst_bytes = get_optimization_method(fmt)( - src=byte_stream, **preset.options - ) # pyright: ignore + dst_bytes = get_optimization_method(fmt)(src=byte_stream, **preset.options) assert dst_bytes.getbuffer().nbytes < byte_stream.getbuffer().nbytes +def test_optimize_image_unsupported_format(): + src = pathlib.Path(__file__).parent.parent / 
"files" / "single_wave_icon.gbr" + dst = pathlib.Path("image.png") + with pytest.raises( + NotImplementedError, match="Image format 'gbr' cannot yet be optimized" + ): + optimize_image(src, dst, delete_src=False) + + def test_preset_has_mime_and_ext(): for _, preset in ALL_PRESETS: assert preset().ext @@ -533,9 +556,9 @@ def test_jpeg_exif_preserve(jpg_exif_image, tmp_path): with open(jpg_exif_image, "rb") as fl: src_bytes = fl.read() optimized_img = optimize_jpeg(src=io.BytesIO(src_bytes)) - assert piexif.load(optimized_img.getvalue())["Exif"] and ( # pyright: ignore - piexif.load(src_bytes)["Exif"] - == piexif.load(optimized_img.getvalue())["Exif"] # pyright: ignore + assert isinstance(optimized_img, io.BytesIO) + assert piexif.load(optimized_img.getvalue())["Exif"] and ( + piexif.load(src_bytes)["Exif"] == piexif.load(optimized_img.getvalue())["Exif"] ) @@ -555,7 +578,43 @@ def test_ensure_matches(webp_image): "fmt,expected", [("png", "PNG"), ("jpg", "JPEG"), ("gif", "GIF"), ("webp", "WEBP")], ) -def test_format_for( +def test_format_for_real_images_suffix( + png_image, jpg_image, gif_image, webp_image, tmp_path, fmt, expected +): + src, _ = get_src_dst( + tmp_path, + fmt, + png_image=png_image, + jpg_image=jpg_image, + gif_image=gif_image, + webp_image=webp_image, + ) + assert format_for(src) == expected + + +@pytest.mark.parametrize( + "fmt,expected", + [("png", "PNG"), ("jpg", "JPEG"), ("gif", "GIF"), ("webp", "WEBP")], +) +def test_format_for_real_images_content_path( + png_image, jpg_image, gif_image, webp_image, tmp_path, fmt, expected +): + src, _ = get_src_dst( + tmp_path, + fmt, + png_image=png_image, + jpg_image=jpg_image, + gif_image=gif_image, + webp_image=webp_image, + ) + assert format_for(src, from_suffix=False) == expected + + +@pytest.mark.parametrize( + "fmt,expected", + [("png", "PNG"), ("jpg", "JPEG"), ("gif", "GIF"), ("webp", "WEBP")], +) +def test_format_for_real_images_content_bytes( png_image, jpg_image, gif_image, webp_image, tmp_path, 
fmt, expected ): src, _ = get_src_dst( @@ -566,7 +625,29 @@ def test_format_for( gif_image=gif_image, webp_image=webp_image, ) - assert format_for(src) == expected # pyright: ignore + assert format_for(io.BytesIO(src.read_bytes()), from_suffix=False) == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("image.png", "PNG"), + ("image.jpg", "JPEG"), + ("image.gif", "GIF"), + ("image.webp", "WEBP"), + ("image.raster", None), + ], +) +def test_format_for_from_suffix(src, expected): + assert format_for(src=pathlib.Path(src), from_suffix=True) == expected + + +def test_format_for_cannot_use_suffix_with_byte_array(): + with pytest.raises( + ValueError, + match="Cannot guess image format from file suffix when byte array is passed", + ): + assert format_for(src=io.BytesIO(), from_suffix=True) def test_optimize_webp_gif_failure(tmp_path, webp_image, gif_image): @@ -574,13 +655,17 @@ def test_optimize_webp_gif_failure(tmp_path, webp_image, gif_image): # webp with pytest.raises(TypeError): - optimize_webp(webp_image, dst, lossless="bad") # pyright: ignore + optimize_webp( + webp_image, dst, lossless="bad" # pyright: ignore[reportArgumentType] + ) assert not dst.exists() # gif dst.touch() # fake temp file created during optim (actually fails before) with pytest.raises(CalledProcessError): - optimize_gif(gif_image, dst, optimize_level="bad") # pyright: ignore + optimize_gif( + gif_image, dst, optimize_level="bad" # pyright: ignore[reportArgumentType] + ) assert not dst.exists() @@ -598,7 +683,7 @@ def test_is_valid_image(png_image, png_image2, jpg_image, font): assert is_valid_image(png_image, "PNG", (48, 48)) assert not is_valid_image(png_image2, "PNG", (48, 48)) assert not is_valid_image(b"", "PNG") - assert not is_valid_image(34, "PNG") # pyright: ignore + assert not is_valid_image(34, "PNG") # pyright: ignore[reportArgumentType] assert not is_valid_image(font, "PNG") with open(png_image, "rb") as fh: assert is_valid_image(fh.read(), "PNG", (48, 48)) diff --git 
a/tests/video/test_video.py b/tests/video/test_video.py index 05d7557e..ad822c5a 100644 --- a/tests/video/test_video.py +++ b/tests/video/test_video.py @@ -430,7 +430,8 @@ def test_reencode_return_ffmpeg_output( with_process=return_output, ) if return_output: - success, process = ret # pyright: ignore + assert not isinstance(ret, bool) + success, process = ret assert success assert len(process.stdout) > 0 else: diff --git a/tests/zim/test_fs.py b/tests/zim/test_fs.py index 91d76c71..6352cd14 100644 --- a/tests/zim/test_fs.py +++ b/tests/zim/test_fs.py @@ -37,6 +37,8 @@ def test_redirects_file(tmp_path, png_image, build_data): with open(build_data["redirects_file"], "w") as fh: # write a redirect with a namespace (old ns scheme) fh.write("A\tAccueil\t\tcommons48.png\n") + # write a redirect not matching regex + fh.write("this_is_not_matching") # call make_zim_file with redirects_file make_zim_file( diff --git a/tests/zim/test_libkiwix.py b/tests/zim/test_libkiwix.py index 02fa218c..de4e6fbd 100644 --- a/tests/zim/test_libkiwix.py +++ b/tests/zim/test_libkiwix.py @@ -5,8 +5,7 @@ import pytest -from zimscraperlib.zim._libkiwix import getline -from zimscraperlib.zim._libkiwix import parseMimetypeCounter as parse # noqa: N813 +from zimscraperlib.zim._libkiwix import getline, parseMimetypeCounter empty = {} @@ -19,13 +18,13 @@ def test_geline_nodelim(): def test_getline(): ins = io.StringIO("text/javascript=8;text/html=3;application/warc-headers=28364;") - assert getline(ins, ";") == (False, "text/javascript=8") # pyright: ignore - assert getline(ins, ";") == (False, "text/html=3") # pyright: ignore - assert getline(ins, ";") == ( # pyright: ignore + assert getline(ins, ";") == (False, "text/javascript=8") + assert getline(ins, ";") == (False, "text/html=3") + assert getline(ins, ";") == ( False, "application/warc-headers=28364", ) - assert getline(ins, ";") == (True, "") # pyright: ignore + assert getline(ins, ";") == (True, "") @pytest.mark.parametrize( @@ -80,4 +79,4 
@@ def test_getline(): ) def test_counter_parsing(counter_str, counter_map): # https://github.com/kiwix/libkiwix/blob/master/test/counterParsing.cpp - assert parse(counter_str) == counter_map + assert parseMimetypeCounter(counter_str) == counter_map diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 330100d2..a60d907f 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -1,6 +1,8 @@ #!/usr/bin/env python # vim: ai ts=4 sts=4 et sw=4 nu +from __future__ import annotations + import base64 import datetime import io @@ -54,7 +56,7 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: with open(png_image, "rb") as fh: png_data = fh.read() with Creator(fpath, main_path).config_dev_metadata( - Tags=tags, Illustration_48x48_at_1=png_data # pyright: ignore + Tags=tags, Illustration_48x48_at_1=png_data ) as creator: # verbatim HTML from string creator.add_item_for("welcome", "wel", content=html_str, is_front=True) @@ -360,9 +362,7 @@ def test_filelikeprovider_nosize(tmp_path, png_image_url): fpath = tmp_path / "test.zim" with Creator(fpath, "").config_dev_metadata() as creator: - creator.add_item( - FileLikeProviderItem(fileobj=fileobj, path="one.png") # pyright: ignore - ) + creator.add_item(FileLikeProviderItem(fileobj=fileobj, path="one.png")) zim = Archive(fpath) assert bytes(zim.get_item("one.png").content) == fileobj.getvalue() @@ -376,9 +376,7 @@ def test_urlprovider(tmp_path, png_image_url): fpath = tmp_path / "test.zim" with Creator(fpath, "").config_dev_metadata() as creator: - creator.add_item( - SpecialURLProviderItem(url=png_image_url, path="one.png") # pyright: ignore - ) + creator.add_item(SpecialURLProviderItem(url=png_image_url, path="one.png")) zim = Archive(fpath) assert bytes(zim.get_item("one.png").content) == file_bytes @@ -441,8 +439,8 @@ def do_GET(self): creator.add_item( SpecialURLProviderItem( - url=f"http://localhost:{port}/home.png", # pyright: ignore - 
mimetype="image/png", # pyright: ignore + url=f"http://localhost:{port}/home.png", + mimetype="image/png", ) ) finally: @@ -535,7 +533,7 @@ def test_without_metadata(tmp_path): def test_check_metadata(tmp_path): with pytest.raises(ValueError, match="Counter cannot be set"): - Creator(tmp_path, "").config_dev_metadata(Counter=1).start() # pyright: ignore + Creator(tmp_path, "").config_dev_metadata(Counter=1).start() with pytest.raises(ValueError, match="Description is too long."): Creator(tmp_path, "").config_dev_metadata(Description="T" * 90).start() @@ -802,6 +800,7 @@ def test_config_metadata_control_characters(tmp_path): ("Date", "1969-12-31", True), ("Date", "1969-13-31", False), ("Date", "2023/02/29", False), + ("Date", "2023-55-99", False), ("Language", "xxx", False), ("Language", "rmr", False), ("Language", "eng", True),