Merge pull request #136 from openzim/small_fixes

rgaudin · web-flow · commit 604e16d6835a · 2024-02-14T08:55:03.000Z
Fix disable_metadata_checks behavior and fix StaticItem support of bytes content
diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py
@@ -100,7 +100,7 @@ class Creator(libzim.writer.Creator):
     Use workaround_nocancel=False to disable the workaround.
 
     By default, all metadata are validated for compliance with openZIM guidelines and
-    conventions. Set disable_metadata_checks=False to disable this validation (you can
+    conventions. Set disable_metadata_checks=True to disable this validation (you can
     still do checks manually with the validation methods or your own logic).
     """
 
@@ -111,7 +111,7 @@ def __init__(
         compression: Optional[str] = None,
         workaround_nocancel: Optional[bool] = True,  # noqa: FBT002
         ignore_duplicates: Optional[bool] = False,  # noqa: FBT002
-        disable_metadata_checks: bool = True,  # noqa: FBT001, FBT002
+        disable_metadata_checks: bool = False,  # noqa: FBT001, FBT002
     ):
         super().__init__(filename=filename)
         self._metadata = {}
@@ -148,7 +148,7 @@ def start(self):
         if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
             raise ValueError("Mandatory metadata are not all set.")
 
-        if self.disable_metadata_checks:
+        if not self.disable_metadata_checks:
             for name, value in self._metadata.items():
                 if value:
                     self.validate_metadata(name, value)
@@ -195,7 +195,7 @@ def add_metadata(
         content: Union[str, bytes, datetime.date, datetime.datetime, Iterable[str]],
         mimetype: str = "text/plain;charset=UTF-8",
     ):
-        if self.disable_metadata_checks:
+        if not self.disable_metadata_checks:
             self.validate_metadata(name, content)
         if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
             content = content.strftime("%Y-%m-%d").encode("UTF-8")
@@ -303,14 +303,6 @@ def add_item_for(
         if should_compress is not None:
             hints[libzim.writer.Hint.COMPRESS] = should_compress
 
-        kwargs = {
-            "path": path,
-            "title": title or "",
-            "mimetype": mimetype,
-            "filepath": fpath if fpath is not None else "",
-            "hints": hints,
-            "content": content,
-        }
         if delete_fpath and fpath:
             cb = [delete_callback, fpath]
             if callback and callable(callback):
@@ -320,7 +312,16 @@ def add_item_for(
             callback = tuple(cb)
 
         self.add_item(
-            StaticItem(**kwargs), callback=callback, duplicate_ok=duplicate_ok
+            StaticItem(
+                path=path,
+                title=title,
+                mimetype=mimetype,
+                filepath=fpath,
+                hints=hints,
+                content=content,
+            ),
+            callback=callback,
+            duplicate_ok=duplicate_ok,
         )
         return path
 
diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py
@@ -9,7 +9,7 @@
 import re
 import tempfile
 import urllib.parse
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import libzim.writer  # pyright: ignore
 
@@ -34,13 +34,13 @@ def __init__(
         **kwargs: Any,
     ):
         super().__init__()
-        if path:
+        if path is not None:
             kwargs["path"] = path
-        if title:
+        if title is not None:
             kwargs["title"] = title
-        if mimetype:
+        if mimetype is not None:
             kwargs["mimetype"] = mimetype
-        if hints:
+        if hints is not None:
             kwargs["hints"] = hints
         for k, v in kwargs.items():
             setattr(self, k, v)
@@ -72,7 +72,7 @@ class StaticItem(Item):
 
     def __init__(
         self,
-        content: Optional[str] = None,
+        content: Optional[Union[str, bytes]] = None,
         fileobj: Optional[io.IOBase] = None,
         filepath: Optional[pathlib.Path] = None,
         path: Optional[str] = None,
@@ -81,11 +81,11 @@ def __init__(
         hints: Optional[dict] = None,
         **kwargs: Any,
     ):
-        if content:
+        if content is not None:
             kwargs["content"] = content
-        if fileobj:
+        if fileobj is not None:
             kwargs["fileobj"] = fileobj
-        if filepath:
+        if filepath is not None:
             kwargs["filepath"] = filepath
         super().__init__(
             path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
@@ -95,6 +95,8 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
         # content was set manually
         content = getattr(self, "content", None)
         if content is not None:
+            if not isinstance(content, (str, bytes)):
+                raise AttributeError(f"Unexpected type for content: {type(content)}")
             return StringProvider(content=content, ref=self)
 
         # using a file-like object
@@ -153,7 +155,7 @@ def __init__(
         use_disk: Optional[bool] = None,
         **kwargs: Any,
     ):
-        if use_disk:
+        if use_disk is not None:
             kwargs["use_disk"] = use_disk
         super().__init__(
             path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
diff --git a/src/zimscraperlib/zim/providers.py b/src/zimscraperlib/zim/providers.py
@@ -31,7 +31,7 @@ def __init__(
 
 
 class StringProvider(libzim.writer.StringProvider):
-    def __init__(self, content: str, ref: Optional[object] = None):
+    def __init__(self, content: Union[str, bytes], ref: Optional[object] = None):
         super().__init__(content)
         self.ref = ref
 
diff --git a/tests/zim/conftest.py b/tests/zim/conftest.py
@@ -24,6 +24,26 @@ def html_str():
 """
 
 
+@pytest.fixture(scope="function")
+def html_str_cn():
+    """sample HTML content with Chinese characters"""
+    return """<html>
+<body>
+<ul>
+    <li><a href="download/toto.pdf">PDF doc in 汉字</a></li>
+    <li><a href="download/toto.txt">text file</a></li>
+    <li><a href="dest.html">HTML link</a></li>
+    <li><a href="no-extension">no ext link</a></li>
+    <li><a href="http://www.example.com/index/sample.html">external link</a></li>
+    <li><a href="mailto:example@example.com">e-mail link</a></li>
+    <li><a media="">no href link</a></li>
+<object data="download/toto.jpg" width="300" height="200"></object>
+<script src="assets/js/bootstrap/bootsrap.css?v=20190101"></script>
+</body>
+</html>
+"""
+
+
 @pytest.fixture(scope="function")
 def html_file(tmp_path, html_str):
     fpath = tmp_path / "test.html"
diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py
@@ -45,7 +45,7 @@ def get_contentprovider(self):
         return FileLikeProvider(self.fileobj)
 
 
-def test_zim_creator(tmp_path, png_image, html_file, html_str):
+def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: str):
     fpath = tmp_path / "test.zim"
     main_path = "welcome"
     tags = ";".join(["toto", "tata"])
@@ -56,6 +56,13 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):
     ) as creator:
         # verbatim HTML from string
         creator.add_item_for("welcome", "wel", content=html_str, is_front=True)
+        # verbatim HTML from bytes
+        creator.add_item_for(
+            "welcome1", "wel1", content=html_str.encode(), is_front=True
+        )
+        creator.add_item_for(
+            "welcome2", "wel2", content=html_str_cn.encode("gb2312"), is_front=True
+        )
         # verbatim HTML from file
         creator.add_item_for("welcome3", "wel3", fpath=html_file)
         creator.add_item_for("welcome4", "wel4", fpath=html_file)
@@ -98,6 +105,8 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):
 
     # ensure non-rewritten articles have not been rewritten
     assert bytes(reader.get_item("welcome").content).decode(UTF8) == html_str
+    assert bytes(reader.get_item("welcome1").content).decode(UTF8) == html_str
+    assert bytes(reader.get_item("welcome2").content).decode("gb2312") == html_str_cn
     assert bytes(reader.get_item("welcome3").content).decode(UTF8) == html_str
 
     # ensure illustration is present and corrext
@@ -180,6 +189,30 @@ def test_add_item_for_delete_fail(tmp_path, png_image):
     assert reader.get_item("index")
 
 
+def test_add_item_empty_content(tmp_path):
+    fpath = tmp_path / "test.zim"
+    # test with empty (but not None) string content
+    with Creator(fpath, "welcome").config_dev_metadata() as creator:
+        creator.add_item_for(
+            path="welcome",
+            title="hello",
+            content="",
+        )
+
+
+def test_add_item_for_unsupported_content_type(tmp_path):
+    fpath = tmp_path / "test.zim"
+    # test with incorrect content type
+    with Creator(fpath, "welcome").config_dev_metadata() as creator:
+        with pytest.raises(RuntimeError):
+            creator.add_item_for(
+                path="welcome",
+                title="hello",
+                mimetype="text/plain",
+                content=123,  # pyright: ignore[reportArgumentType]
+            )
+
+
 def test_compression(tmp_path):
     fpath = tmp_path / "test.zim"
     with Creator(
@@ -508,7 +541,7 @@ def test_check_metadata(tmp_path):
 
 
 def test_relax_metadata(tmp_path):
-    Creator(tmp_path, "", disable_metadata_checks=False).config_dev_metadata(
+    Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
         Description="T" * 90
     ).start()