From 87c0f33b42bb247ed986a2bf3f418a6d2fa318e3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 12 Feb 2024 21:42:18 +0100 Subject: [PATCH 1/4] Add more admissible types for convert_image + corresponding tests --- src/zimscraperlib/image/convertion.py | 7 +++++-- src/zimscraperlib/image/transformation.py | 2 +- src/zimscraperlib/image/utils.py | 7 ++++--- tests/image/test_image.py | 24 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/zimscraperlib/image/convertion.py b/src/zimscraperlib/image/convertion.py index 31674236..3d5e0486 100644 --- a/src/zimscraperlib/image/convertion.py +++ b/src/zimscraperlib/image/convertion.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 # vim: ai ts=4 sts=4 et sw=4 nu +import io import pathlib -from typing import Optional +from typing import Union import PIL @@ -13,7 +14,9 @@ def convert_image( - src: pathlib.Path, dst: pathlib.Path, **params: Optional[dict] + src: Union[pathlib.Path, io.BytesIO], + dst: Union[pathlib.Path, io.BytesIO], + **params: str, ) -> None: """convert an image file from one format to another params: Image.save() parameters. Depends on dest format. diff --git a/src/zimscraperlib/image/transformation.py b/src/zimscraperlib/image/transformation.py index 4db56cc6..287b7efb 100644 --- a/src/zimscraperlib/image/transformation.py +++ b/src/zimscraperlib/image/transformation.py @@ -19,7 +19,7 @@ def resize_image( dst: Optional[Union[pathlib.Path, io.BytesIO]] = None, method: Optional[str] = "width", allow_upscaling: Optional[bool] = True, # noqa: FBT002 - **params: Optional[dict], + **params: str, ) -> None: """resize an image to requested dimensions diff --git a/src/zimscraperlib/image/utils.py b/src/zimscraperlib/image/utils.py index 2568492d..6427a298 100644 --- a/src/zimscraperlib/image/utils.py +++ b/src/zimscraperlib/image/utils.py @@ -1,17 +1,18 @@ #!/usr/bin/env python # vim: ai ts=4 sts=4 et sw=4 nu +import io import pathlib -from typing import Optional +from typing import Optional, Union from PIL import Image def save_image( src: Image, # pyright: ignore - dst: pathlib.Path, + dst: Union[pathlib.Path, io.BytesIO], fmt: Optional[str] = None, - **params: Optional[dict], + **params: str, ) -> None: """PIL.Image.save() wrapper setting default parameters""" args = {"JPEG": {"quality": 100}, "PNG": {}}.get(fmt, {}) # pyright: ignore diff --git a/tests/image/test_image.py b/tests/image/test_image.py index 28041737..0953a093 100644 --- a/tests/image/test_image.py +++ b/tests/image/test_image.py @@ -296,6 +296,30 @@ def test_change_image_format_defaults(png_image, tmp_path): assert dst_image.format == "WEBP" +def test_convert_io_src_dst(png_image: pathlib.Path): + src = io.BytesIO(png_image.read_bytes()) + dst = io.BytesIO() + convert_image(src, dst, fmt="PNG") + dst_image = Image.open(dst) + assert dst_image.format == "PNG" + + +def test_convert_io_src_path_dst(png_image: pathlib.Path, tmp_path: pathlib.Path): + src = io.BytesIO(png_image.read_bytes()) + dst = tmp_path / "test.png" + convert_image(src, dst, fmt="PNG") + dst_image = Image.open(dst) + assert dst_image.format == "PNG" + + +def test_convert_path_src_io_dst(png_image: pathlib.Path): + src = png_image + dst = io.BytesIO() + convert_image(src, dst, fmt="PNG") + dst_image = Image.open(dst) + assert dst_image.format == "PNG" + + @pytest.mark.parametrize( "fmt,exp_size", [("png", 128), ("jpg", 128)], From 44f318155c4f90306c2d9ed6e6e3f2c75fde9334 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 12 Feb 2024 10:34:32 +0100 Subject: [PATCH 2/4] Fix StaticItem typing issue + remove reference to callback removed long ago --- src/zimscraperlib/zim/filesystem.py | 4 ++-- src/zimscraperlib/zim/items.py | 10 ++++------ tests/zim/test_zim_creator.py | 25 ++++++++----------------- 3 files changed, 14 insertions(+), 25 deletions(-) diff --git a/src/zimscraperlib/zim/filesystem.py b/src/zimscraperlib/zim/filesystem.py index e0f3d6a2..cb3fb9c3 100644 --- a/src/zimscraperlib/zim/filesystem.py +++ b/src/zimscraperlib/zim/filesystem.py @@ -46,8 +46,8 @@ def __init__( self, root: pathlib.Path, filepath: pathlib.Path, - ): # pyright: ignore - super().__init__(root=root, filepath=filepath) # pyright: ignore + ): + super().__init__(root=root, filepath=filepath) # first look inside the file's magic headers self.mimetype = get_file_mimetype(self.filepath) # most web-specific files are plain text. In this case, use extension diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index a3d20d74..45026dc9 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -9,7 +9,7 @@ import re import tempfile import urllib.parse -from typing import Dict, Union +from typing import Any import libzim.writer # pyright: ignore @@ -23,11 +23,9 @@ class Item(libzim.writer.Item): - """libzim.writer.Item returning props for path/title/mimetype plus a callback + """libzim.writer.Item returning props for path/title/mimetype""" - Calls your `callback` prop on deletion""" - - def __init__(self, **kwargs: Dict[str, Union[str, bool, bytes]]): + def __init__(self, **kwargs: Any): super().__init__() for k, v in kwargs.items(): setattr(self, k, v) @@ -106,7 +104,7 @@ def download_for_size(url, on_disk, tmp_dir=None): size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream) return fpath or stream, size - def __init__(self, url: str, **kwargs): + def __init__(self, url: str, **kwargs: Any): super().__init__(**kwargs) self.url = urllib.parse.urlparse(url) use_disk = getattr(self, "use_disk", False) diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 9db1b1ca..d9d8ef52 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -4,7 +4,6 @@ import base64 import datetime import io -import os import pathlib import random import shutil @@ -119,7 +118,7 @@ def test_noindexlanguage(tmp_path): creator = Creator(fpath, "welcome").config_dev_metadata(Language="bam") creator.config_indexing(False) with creator as creator: - creator.add_item(StaticItem(path="welcome", content="hello")) # pyright: ignore + creator.add_item(StaticItem(path="welcome", content="hello")) creator.add_item_for("index", "Index", content="-", mimetype="text/html") reader = Archive(fpath) @@ -165,15 +164,11 @@ def test_add_item_for_delete_fail(tmp_path, png_image): # copy file to local path shutil.copyfile(png_image, local_path) - def remove_source(item): - os.remove(item.filepath) - with Creator(fpath, "welcome").config_dev_metadata() as creator: creator.add_item( StaticItem( - filepath=local_path, # pyright: ignore - path="index", # pyright: ignore - callback=remove_source, # pyright: ignore + filepath=local_path, + path="index", ), callback=(delete_callback, local_path), ) @@ -188,18 +183,18 @@ def test_compression(tmp_path): with Creator( tmp_path / "test.zim", "welcome", compression="zstd" ).config_dev_metadata() as creator: - creator.add_item(StaticItem(path="welcome", content="hello")) # pyright: ignore + creator.add_item(StaticItem(path="welcome", content="hello")) with Creator( fpath, "welcome", compression=Compression.zstd # pyright: ignore ).config_dev_metadata() as creator: - creator.add_item(StaticItem(path="welcome", content="hello")) # pyright: ignore + creator.add_item(StaticItem(path="welcome", content="hello")) def test_double_finish(tmp_path): fpath = tmp_path / "test.zim" with Creator(fpath, "welcome").config_dev_metadata() as creator: - creator.add_item(StaticItem(path="welcome", content="hello")) # pyright: ignore + creator.add_item(StaticItem(path="welcome", content="hello")) # ensure we can finish an already finished creator creator.finish() @@ -219,11 +214,7 @@ def test_sourcefile_removal(tmp_path, html_file): # copy html to folder src_path = pathlib.Path(tmpdir.name, "source.html") shutil.copyfile(html_file, src_path) - creator.add_item( - StaticItem( - filepath=src_path, path=src_path.name, ref=tmpdir # pyright: ignore - ) - ) + creator.add_item(StaticItem(filepath=src_path, path=src_path.name, ref=tmpdir)) del tmpdir assert not src_path.exists() @@ -241,7 +232,7 @@ def test_sourcefile_removal_std(tmp_path, html_file): StaticItem( filepath=paths[-1], path=paths[-1].name, - mimetype="text/html", # pyright: ignore + mimetype="text/html", ), callback=(delete_callback, paths[-1]), ) From 954c253d0f1933af144c9b9d83823cd83e7a736e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 13 Feb 2024 15:25:18 +0100 Subject: [PATCH 3/4] Add expected args to Item and subclasses init method --- src/zimscraperlib/zim/filesystem.py | 4 +- src/zimscraperlib/zim/items.py | 78 +++++++++++++++++++++-------- tests/zim/test_zim_creator.py | 2 + 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/src/zimscraperlib/zim/filesystem.py b/src/zimscraperlib/zim/filesystem.py index cb3fb9c3..0a81a7ee 100644 --- a/src/zimscraperlib/zim/filesystem.py +++ b/src/zimscraperlib/zim/filesystem.py @@ -47,7 +47,9 @@ def __init__( root: pathlib.Path, filepath: pathlib.Path, ): - super().__init__(root=root, filepath=filepath) + super().__init__() + self.root = root + self.filepath = filepath # first look inside the file's magic headers self.mimetype = get_file_mimetype(self.filepath) # most web-specific files are plain text. In this case, use extension diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index 45026dc9..881f2d50 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -9,7 +9,7 @@ import re import tempfile import urllib.parse -from typing import Any +from typing import Any, Optional import libzim.writer # pyright: ignore @@ -25,8 +25,19 @@ class Item(libzim.writer.Item): """libzim.writer.Item returning props for path/title/mimetype""" - def __init__(self, **kwargs: Any): + def __init__( + self, + path: Optional[str] = None, + title: Optional[str] = None, + mimetype: Optional[str] = None, + hints: Optional[dict] = None, + **kwargs: Any, + ): super().__init__() + self.path = path + self.title = title + self.mimetype = mimetype + self.hints = hints for k, v in kwargs.items(): setattr(self, k, v) @@ -35,16 +46,16 @@ def should_index(self): return self.get_mimetype().startswith("text/html") def get_path(self) -> str: - return getattr(self, "path", "") + return self.path or "" def get_title(self) -> str: - return getattr(self, "title", "") + return self.title or "" def get_mimetype(self) -> str: - return getattr(self, "mimetype", "") + return self.mimetype or "" def get_hints(self) -> dict: - return getattr(self, "hints", {}) + return self.hints or {} class StaticItem(Item): @@ -55,19 +66,37 @@ class StaticItem(Item): more efficiently: now when the libzim destroys the CP, python will destroy the Item and we can be notified that we're effectively through with our content""" + def __init__( + self, + content: Optional[str] = None, + fileobj: Optional[io.IOBase] = None, + filepath: Optional[pathlib.Path] = None, + path: Optional[str] = None, + title: Optional[str] = None, + mimetype: Optional[str] = None, + hints: Optional[dict] = None, + **kwargs: Any, + ): + super().__init__( + path=path, title=title, mimetype=mimetype, hints=hints, **kwargs + ) + self.content = content + self.fileobj = fileobj + self.filepath = filepath + def get_contentprovider(self) -> libzim.writer.ContentProvider: # content was set manually - if getattr(self, "content", None) is not None: + if self.content is not None: return StringProvider(content=self.content, ref=self) # using a file-like object - if getattr(self, "fileobj", None): + if self.fileobj: return FileLikeProvider( fileobj=self.fileobj, ref=self, size=getattr(self, "size", None) ) # we had to download locally to get size - if getattr(self, "filepath", None): + if self.filepath: return FileProvider( filepath=self.filepath, ref=self, size=getattr(self, "size", None) ) @@ -104,10 +133,22 @@ def download_for_size(url, on_disk, tmp_dir=None): size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream) return fpath or stream, size - def __init__(self, url: str, **kwargs: Any): - super().__init__(**kwargs) + def __init__( + self, + url: str, + path: Optional[str] = None, + title: Optional[str] = None, + mimetype: Optional[str] = None, + hints: Optional[dict] = None, + *, + use_disk: bool = False, + **kwargs: Any, + ): + super().__init__( + path=path, title=title, mimetype=mimetype, hints=hints, **kwargs + ) self.url = urllib.parse.urlparse(url) - use_disk = getattr(self, "use_disk", False) + self.use_disk = use_disk # fetch headers to retrieve size and type try: @@ -136,7 +177,7 @@ def __init__(self, url: str, **kwargs: Any): except Exception: # we couldn't retrieve size so we have to download resource to target, self.size = self.download_for_size( - self.url, on_disk=use_disk, tmp_dir=getattr(self, "tmp_dir", None) + self.url, on_disk=self.use_disk, tmp_dir=getattr(self, "tmp_dir", None) ) # downloaded to disk and using a file path from now on if use_disk: @@ -146,16 +187,11 @@ def __init__(self, url: str, **kwargs: Any): self.fileobj = target def get_path(self) -> str: - return getattr(self, "path", re.sub(r"^/", "", self.url.path)) - - def get_title(self) -> str: - return getattr(self, "title", "") + return self.path or re.sub(r"^/", "", self.url.path) def get_mimetype(self) -> str: - return getattr( - self, - "mimetype", - self.headers.get("Content-Type", "application/octet-stream"), + return self.mimetype or self.headers.get( + "Content-Type", "application/octet-stream" ) def get_contentprovider(self): diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index d9d8ef52..a8770c4b 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -40,6 +40,8 @@ def get_contentprovider(self): class FileLikeProviderItem(StaticItem): def get_contentprovider(self): + if not self.fileobj: + raise AttributeError("fileobj cannot be None") return FileLikeProvider(self.fileobj) From 824504f33d3b13d1254e445d81b590b5b73c8e7e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 13 Feb 2024 16:58:34 +0100 Subject: [PATCH 4/4] Do not store useless properties on Items We might have many Items, some of them long-lived, so it is maybe better to not store many properties with a None value because they are unused and hence keep a lean memory footprint --- src/zimscraperlib/zim/items.py | 64 +++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index 881f2d50..5f4c6ed0 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -34,10 +34,14 @@ def __init__( **kwargs: Any, ): super().__init__() - self.path = path - self.title = title - self.mimetype = mimetype - self.hints = hints + if path: + kwargs["path"] = path + if title: + kwargs["title"] = title + if mimetype: + kwargs["mimetype"] = mimetype + if hints: + kwargs["hints"] = hints for k, v in kwargs.items(): setattr(self, k, v) @@ -46,16 +50,16 @@ def should_index(self): return self.get_mimetype().startswith("text/html") def get_path(self) -> str: - return self.path or "" + return getattr(self, "path", "") def get_title(self) -> str: - return self.title or "" + return getattr(self, "title", "") def get_mimetype(self) -> str: - return self.mimetype or "" + return getattr(self, "mimetype", "") def get_hints(self) -> dict: - return self.hints or {} + return getattr(self, "hints", {}) class StaticItem(Item): @@ -77,28 +81,34 @@ def __init__( hints: Optional[dict] = None, **kwargs: Any, ): + if content: + kwargs["content"] = content + if fileobj: + kwargs["fileobj"] = fileobj + if filepath: + kwargs["filepath"] = filepath super().__init__( path=path, title=title, mimetype=mimetype, hints=hints, **kwargs ) - self.content = content - self.fileobj = fileobj - self.filepath = filepath def get_contentprovider(self) -> libzim.writer.ContentProvider: # content was set manually - if self.content is not None: - return StringProvider(content=self.content, ref=self) + content = getattr(self, "content", None) + if content is not None: + return StringProvider(content=content, ref=self) # using a file-like object - if self.fileobj: + fileobj = getattr(self, "fileobj", None) + if fileobj: return FileLikeProvider( - fileobj=self.fileobj, ref=self, size=getattr(self, "size", None) + fileobj=fileobj, ref=self, size=getattr(self, "size", None) ) # we had to download locally to get size - if self.filepath: + filepath = getattr(self, "filepath", None) + if filepath: return FileProvider( - filepath=self.filepath, ref=self, size=getattr(self, "size", None) + filepath=filepath, ref=self, size=getattr(self, "size", None) ) raise NotImplementedError("No data to provide`") @@ -140,15 +150,16 @@ def __init__( title: Optional[str] = None, mimetype: Optional[str] = None, hints: Optional[dict] = None, - *, - use_disk: bool = False, + use_disk: Optional[bool] = None, **kwargs: Any, ): + if use_disk: + kwargs["use_disk"] = use_disk super().__init__( path=path, title=title, mimetype=mimetype, hints=hints, **kwargs ) self.url = urllib.parse.urlparse(url) - self.use_disk = use_disk + use_disk = getattr(self, "use_disk", False) # fetch headers to retrieve size and type try: @@ -177,7 +188,7 @@ def __init__( except Exception: # we couldn't retrieve size so we have to download resource to target, self.size = self.download_for_size( - self.url, on_disk=self.use_disk, tmp_dir=getattr(self, "tmp_dir", None) + self.url, on_disk=use_disk, tmp_dir=getattr(self, "tmp_dir", None) ) # downloaded to disk and using a file path from now on if use_disk: @@ -187,11 +198,16 @@ def __init__( self.fileobj = target def get_path(self) -> str: - return self.path or re.sub(r"^/", "", self.url.path) + return getattr(self, "path", re.sub(r"^/", "", self.url.path)) + + def get_title(self) -> str: + return getattr(self, "title", "") def get_mimetype(self) -> str: - return self.mimetype or self.headers.get( - "Content-Type", "application/octet-stream" + return getattr( + self, + "mimetype", + self.headers.get("Content-Type", "application/octet-stream"), ) def get_contentprovider(self):