Skip to content

Fix typos, remaining code qa and typing issues resulting in breaking changes #178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
- Automatically index PDF documents content #167
- Automatically set proper title on PDF documents #168
- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check proper type is passed to libzim

## Changed

- **BREAKING** Renamed `zimscraperlib.image.convertion` to `zimscraperlib.image.conversion` to fix typo
- **BREAKING** Many changes in type hints to match the real underlying code
- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002)
- Prefer to use `IO[bytes]` to `io.BytesIO` when possible since it is more generic
- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError`
- **BREAKING** `types.get_mime_for_name` now returns `str | None`
- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accept `bytes | str` as value (it must have been converted before the call)
- **BREAKING** second argument of `creator.Creator.add_metadata` has been renamed to `value` instead of `content` to align with other methods
- When a type issue arises in metadata checks, the wrong value type is displayed in the exception

### Fixed

Expand Down
2 changes: 1 addition & 1 deletion src/zimscraperlib/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.4.1-dev0"
__version__ = "4.0.0-dev0"
36 changes: 20 additions & 16 deletions src/zimscraperlib/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

from __future__ import annotations

import io
import pathlib
import subprocess
from concurrent.futures import Future, ThreadPoolExecutor
from typing import ClassVar
from typing import IO, ClassVar

import requests
import requests.adapters
import requests.structures
import urllib3.util
import yt_dlp as youtube_dl

from zimscraperlib import logger
Expand Down Expand Up @@ -42,7 +44,8 @@ def download(
self,
url: str,
options: dict | None,
wait: bool | None = True, # noqa: FBT002
*,
wait: bool | None = True,
) -> bool | Future:
"""Downloads video using initialized executor.

Expand All @@ -52,9 +55,7 @@ def download(

Returns download result of future (wait=False)"""

future = self.executor.submit(
self._run_youtube_dl, url, options # pyright: ignore
)
future = self.executor.submit(self._run_youtube_dl, url, options or {})
if not wait:
return future
if not future.exception():
Expand Down Expand Up @@ -143,8 +144,8 @@ def save_large_file(url: str, fpath: pathlib.Path) -> None:

def _get_retry_adapter(
max_retries: int | None = 5,
) -> requests.adapters.BaseAdapter: # pyright: ignore
retries = requests.packages.urllib3.util.retry.Retry( # pyright: ignore
) -> requests.adapters.BaseAdapter:
retries = urllib3.util.retry.Retry(
total=max_retries, # total number of retries
connect=max_retries, # connection errors
read=max_retries, # read errors
Expand All @@ -161,7 +162,7 @@ def _get_retry_adapter(
], # force retry on the following codes
)

return requests.adapters.HTTPAdapter(max_retries=retries) # pyright: ignore
return requests.adapters.HTTPAdapter(max_retries=retries)


def get_session(max_retries: int | None = 5) -> requests.Session:
Expand All @@ -174,14 +175,15 @@ def get_session(max_retries: int | None = 5) -> requests.Session:
def stream_file(
url: str,
fpath: pathlib.Path | None = None,
byte_stream: io.BytesIO | None = None,
byte_stream: IO[bytes] | None = None,
block_size: int | None = 1024,
proxies: dict | None = None,
only_first_block: bool | None = False, # noqa: FBT002
max_retries: int | None = 5,
headers: dict[str, str] | None = None,
session: requests.Session | None = None,
) -> tuple[int, requests.structures.CaseInsensitiveDict]: # pyright: ignore
*,
only_first_block: bool | None = False,
) -> tuple[int, requests.structures.CaseInsensitiveDict]:
"""Stream data from a URL to either a BytesIO object or a file
Arguments -
fpath - Path of the file where data is sent
Expand Down Expand Up @@ -212,12 +214,14 @@ def stream_file(
total_downloaded = 0
if fpath is not None:
fp = open(fpath, "wb")
else:
elif (
byte_stream is not None
): # pragma: no branch (we use a precise condition to help type checker)
fp = byte_stream

for data in resp.iter_content(block_size):
total_downloaded += len(data)
fp.write(data) # pyright: ignore
fp.write(data)

# stop downloading/reading if we're just testing first block
if only_first_block:
Expand All @@ -226,7 +230,7 @@ def stream_file(
logger.debug(f"Downloaded {total_downloaded} bytes from {url}")

if fpath:
fp.close() # pyright: ignore
fp.close()
else:
fp.seek(0) # pyright: ignore
fp.seek(0)
return total_downloaded, resp.headers
2 changes: 1 addition & 1 deletion src/zimscraperlib/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_file_mimetype(fpath: pathlib.Path) -> str:
return get_content_mimetype(fh.read(2048))


def get_content_mimetype(content: bytes) -> str:
def get_content_mimetype(content: bytes | str) -> str:
"""MIME Type of content retrieved from magic headers"""

try:
Expand Down
20 changes: 10 additions & 10 deletions src/zimscraperlib/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,22 @@
import pathlib
from typing import BinaryIO, TextIO

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, element

from zimscraperlib.types import ARTICLE_MIME


def find_title_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
def find_title_in(content: str | BinaryIO | TextIO, mime_type: str | None) -> str:
"""Extracted title from HTML content

blank on failure to extract and non-HTML files"""
if mime_type != ARTICLE_MIME:
return ""
try:
return BeautifulSoup(content, "lxml").find("title").text # pyright: ignore
except Exception:
return ""
title_tag = BeautifulSoup(content, "lxml").find("title")
return title_tag.text if title_tag else ""


def find_title_in_file(fpath: pathlib.Path, mime_type: str) -> str:
def find_title_in_file(fpath: pathlib.Path, mime_type: str | None) -> str:
"""Extracted title from an HTML file"""
try:
with open(fpath) as fh:
Expand All @@ -45,15 +43,17 @@ def find_language_in(content: str | BinaryIO | TextIO, mime_type: str) -> str:
for key in keylist:
node = soup.find(nodename)
if node:
if not node.has_attr(key): # pyright: ignore
if not isinstance(node, element.Tag) or (
isinstance(node, element.Tag) and not node.has_attr(key)
):
continue
if (
nodename == "meta"
and not node.attrs.get("http-equiv", "").lower() # pyright: ignore
and not node.attrs.get("http-equiv", "").lower()
== "content-language"
):
continue
return node.attrs[key] # pyright: ignore
return node.attrs[key]
return ""


Expand Down
28 changes: 12 additions & 16 deletions src/zimscraperlib/i18n.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
ISO_LEVELS = ["1", "2b", "2t", "3", "5"]


class NotFound(ValueError): # noqa: N818
class NotFoundError(ValueError):
pass


Expand Down Expand Up @@ -81,7 +81,7 @@ def get_iso_lang_data(lang: str) -> tuple[dict, dict | None]:
iso639.exceptions.InvalidLanguageValue,
iso639.exceptions.DeprecatedLanguageValue,
) as exc:
raise NotFound("Not a valid iso language name/code") from exc
raise NotFoundError("Not a valid iso language name/code") from exc

def replace_types(new_type: str) -> str:
# convert new iso_types from iso639-lang Pypi package to old iso_types from
Expand Down Expand Up @@ -118,34 +118,32 @@ def replace_types(new_type: str) -> str:
return lang_data, None


def find_language_names(query: str, lang_data: dict | None = None) -> tuple[str, str]:
def find_language_names(
query: str, lang_data: dict | None = None
) -> tuple[str | None, str | None]:
"""(native, english) language names for lang with help from language_details dict

Falls back to English name if available or query if not"""
if lang_data is None:
lang_data = get_language_details(query, failsafe=True) or {}
try:
query_locale = babel.Locale.parse(query)
return query_locale.get_display_name(), query_locale.get_display_name(
"en"
) # pyright: ignore
return query_locale.get_display_name(), query_locale.get_display_name("en")
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass

# ISO code lookup order matters (most qualified first)!
for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]:
try:
query_locale = babel.Locale.parse(lang_data.get(iso_level))
return query_locale.get_display_name(), query_locale.get_display_name(
"en"
) # pyright: ignore
return query_locale.get_display_name(), query_locale.get_display_name("en")
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass
default = lang_data.get("english", query)
return default, default


def update_with_macro(lang_data: dict, macro_data: dict):
def update_with_macro(lang_data: dict, macro_data: dict | None):
"""update empty keys from lang_data with ones of macro_data"""
if macro_data:
for key, value in macro_data.items():
Expand All @@ -154,9 +152,7 @@ def update_with_macro(lang_data: dict, macro_data: dict):
return lang_data


def get_language_details(
query: str, failsafe: bool | None = False # noqa: FBT002
) -> dict:
def get_language_details(query: str, *, failsafe: bool | None = False) -> dict | None:
"""language details dict from query.

Raises NotFound or return `und` language details if failsafe
Expand Down Expand Up @@ -192,12 +188,12 @@ def get_language_details(

try:
lang_data, macro_data = get_iso_lang_data(adjusted_query)
except NotFound as exc:
except NotFoundError as exc:
if failsafe:
return None # pyright: ignore
return None
raise exc

iso_data = update_with_macro(lang_data, macro_data) # pyright: ignore
iso_data = update_with_macro(lang_data, macro_data)
native_name, english_name = find_language_names(native_query, iso_data)
iso_data.update(
{
Expand Down
2 changes: 1 addition & 1 deletion src/zimscraperlib/image/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# vim: ai ts=4 sts=4 et sw=4 nu

# flake8: noqa
from .convertion import convert_image
from .conversion import convert_image
from .optimization import optimize_image
from .probing import is_valid_image
from .transformation import resize_image
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

from __future__ import annotations

import io
import pathlib
from typing import IO

import PIL
from PIL.Image import open as pilopen

from zimscraperlib.constants import ALPHA_NOT_SUPPORTED
from zimscraperlib.image.probing import format_for
Expand All @@ -15,8 +15,8 @@


def convert_image(
src: pathlib.Path | io.BytesIO,
dst: pathlib.Path | io.BytesIO,
src: pathlib.Path | IO[bytes],
dst: pathlib.Path | IO[bytes],
**params: str,
) -> None:
"""convert an image file from one format to another
Expand All @@ -29,12 +29,12 @@ def convert_image(
to RGB. ex: RGB, ARGB, CMYK (and other PIL colorspaces)"""

colorspace = params.get("colorspace") # requested colorspace
fmt = (
params.pop("fmt").upper() if "fmt" in params else None # pyright: ignore
) # requested format
fmt = params.pop("fmt").upper() if "fmt" in params else None # requested format
if not fmt:
fmt = format_for(dst)
with PIL.Image.open(src) as image: # pyright: ignore
if not fmt:
raise ValueError("Impossible to guess destination image format")
with pilopen(src) as image:
if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
image = image.convert(colorspace or "RGB") # noqa: PLW2901
save_image(image, dst, fmt, **params)
Expand All @@ -45,13 +45,13 @@ def create_favicon(src: pathlib.Path, dst: pathlib.Path) -> None:
if dst.suffix != ".ico":
raise ValueError("favicon extension must be ICO")

img = PIL.Image.open(src) # pyright: ignore
img = pilopen(src)
w, h = img.size
# resize image to square first
if w != h:
size = min([w, h])
resized = dst.parent.joinpath(f"{src.stem}.tmp.{src.suffix}")
resize_image(src, size, size, resized, "contain")
img = PIL.Image.open(resized) # pyright: ignore
img = pilopen(resized)
# now convert to ICO
save_image(img, dst, "ICO")
Loading
Loading