Skip to content

Fix disable_metadata_checks behavior and fix StaticItem support of bytes content #136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class Creator(libzim.writer.Creator):
Use workaround_nocancel=False to disable the workaround.

By default, all metadata are validated for compliance with openZIM guidelines and
conventions. Set disable_metadata_checks=False to disable this validation (you can
conventions. Set disable_metadata_checks=True to disable this validation (you can
still do checks manually with the validation methods or your own logic).
"""

Expand All @@ -111,7 +111,7 @@ def __init__(
compression: Optional[str] = None,
workaround_nocancel: Optional[bool] = True, # noqa: FBT002
ignore_duplicates: Optional[bool] = False, # noqa: FBT002
disable_metadata_checks: bool = True, # noqa: FBT001, FBT002
disable_metadata_checks: bool = False, # noqa: FBT001, FBT002
):
super().__init__(filename=filename)
self._metadata = {}
Expand Down Expand Up @@ -148,7 +148,7 @@ def start(self):
if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
raise ValueError("Mandatory metadata are not all set.")

if self.disable_metadata_checks:
if not self.disable_metadata_checks:
for name, value in self._metadata.items():
if value:
self.validate_metadata(name, value)
Expand Down Expand Up @@ -195,7 +195,7 @@ def add_metadata(
content: Union[str, bytes, datetime.date, datetime.datetime, Iterable[str]],
mimetype: str = "text/plain;charset=UTF-8",
):
if self.disable_metadata_checks:
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
content = content.strftime("%Y-%m-%d").encode("UTF-8")
Expand Down Expand Up @@ -303,14 +303,6 @@ def add_item_for(
if should_compress is not None:
hints[libzim.writer.Hint.COMPRESS] = should_compress

kwargs = {
"path": path,
"title": title or "",
"mimetype": mimetype,
"filepath": fpath if fpath is not None else "",
"hints": hints,
"content": content,
}
if delete_fpath and fpath:
cb = [delete_callback, fpath]
if callback and callable(callback):
Expand All @@ -320,7 +312,16 @@ def add_item_for(
callback = tuple(cb)

self.add_item(
StaticItem(**kwargs), callback=callback, duplicate_ok=duplicate_ok
StaticItem(
path=path,
title=title,
mimetype=mimetype,
filepath=fpath,
hints=hints,
content=content,
),
callback=callback,
duplicate_ok=duplicate_ok,
)
return path

Expand Down
22 changes: 12 additions & 10 deletions src/zimscraperlib/zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import re
import tempfile
import urllib.parse
from typing import Any, Optional
from typing import Any, Optional, Union

import libzim.writer # pyright: ignore

Expand All @@ -34,13 +34,13 @@ def __init__(
**kwargs: Any,
):
super().__init__()
if path:
if path is not None:
kwargs["path"] = path
if title:
if title is not None:
kwargs["title"] = title
if mimetype:
if mimetype is not None:
kwargs["mimetype"] = mimetype
if hints:
if hints is not None:
kwargs["hints"] = hints
for k, v in kwargs.items():
setattr(self, k, v)
Expand Down Expand Up @@ -72,7 +72,7 @@ class StaticItem(Item):

def __init__(
self,
content: Optional[str] = None,
content: Optional[Union[str, bytes]] = None,
fileobj: Optional[io.IOBase] = None,
filepath: Optional[pathlib.Path] = None,
path: Optional[str] = None,
Expand All @@ -81,11 +81,11 @@ def __init__(
hints: Optional[dict] = None,
**kwargs: Any,
):
if content:
if content is not None:
kwargs["content"] = content
if fileobj:
if fileobj is not None:
kwargs["fileobj"] = fileobj
if filepath:
if filepath is not None:
kwargs["filepath"] = filepath
super().__init__(
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
Expand All @@ -95,6 +95,8 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
# content was set manually
content = getattr(self, "content", None)
if content is not None:
if not isinstance(content, (str, bytes)):
raise AttributeError(f"Unexpected type for content: {type(content)}")
return StringProvider(content=content, ref=self)

# using a file-like object
Expand Down Expand Up @@ -153,7 +155,7 @@ def __init__(
use_disk: Optional[bool] = None,
**kwargs: Any,
):
if use_disk:
if use_disk is not None:
kwargs["use_disk"] = use_disk
super().__init__(
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
Expand Down
2 changes: 1 addition & 1 deletion src/zimscraperlib/zim/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(


class StringProvider(libzim.writer.StringProvider):
def __init__(self, content: str, ref: Optional[object] = None):
def __init__(self, content: Union[str, bytes], ref: Optional[object] = None):
super().__init__(content)
self.ref = ref

Expand Down
20 changes: 20 additions & 0 deletions tests/zim/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,26 @@ def html_str():
"""


@pytest.fixture(scope="function")
def html_str_cn():
"""sample HTML content with Chinese characters"""
return """<html>
<body>
<ul>
<li><a href="download/toto.pdf">PDF doc in 汉字</a></li>
<li><a href="download/toto.txt">text file</a></li>
<li><a href="dest.html">HTML link</a></li>
<li><a href="no-extension">no ext link</a></li>
<li><a href="http://www.example.com/index/sample.html">external link</a></li>
<li><a href="mailto:[email protected]">e-mail link</a></li>
<li><a media="">no href link</a></li>
<object data="download/toto.jpg" width="300" height="200"></object>
<script src="assets/js/bootstrap/bootsrap.css?v=20190101"></script>
</body>
</html>
"""


@pytest.fixture(scope="function")
def html_file(tmp_path, html_str):
fpath = tmp_path / "test.html"
Expand Down
37 changes: 35 additions & 2 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def get_contentprovider(self):
return FileLikeProvider(self.fileobj)


def test_zim_creator(tmp_path, png_image, html_file, html_str):
def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: str):
fpath = tmp_path / "test.zim"
main_path = "welcome"
tags = ";".join(["toto", "tata"])
Expand All @@ -56,6 +56,13 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):
) as creator:
# verbatim HTML from string
creator.add_item_for("welcome", "wel", content=html_str, is_front=True)
# verbatim HTML from bytes
creator.add_item_for(
"welcome1", "wel1", content=html_str.encode(), is_front=True
)
creator.add_item_for(
"welcome2", "wel2", content=html_str_cn.encode("gb2312"), is_front=True
)
# verbatim HTML from file
creator.add_item_for("welcome3", "wel3", fpath=html_file)
creator.add_item_for("welcome4", "wel4", fpath=html_file)
Expand Down Expand Up @@ -98,6 +105,8 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):

# ensure non-rewritten articles have not been rewritten
assert bytes(reader.get_item("welcome").content).decode(UTF8) == html_str
assert bytes(reader.get_item("welcome1").content).decode(UTF8) == html_str
assert bytes(reader.get_item("welcome2").content).decode("gb2312") == html_str_cn
assert bytes(reader.get_item("welcome3").content).decode(UTF8) == html_str

# ensure illustration is present and correct
Expand Down Expand Up @@ -180,6 +189,30 @@ def test_add_item_for_delete_fail(tmp_path, png_image):
assert reader.get_item("index")


def test_add_item_empty_content(tmp_path):
fpath = tmp_path / "test.zim"
# test that empty string content is accepted without raising
with Creator(fpath, "welcome").config_dev_metadata() as creator:
creator.add_item_for(
path="welcome",
title="hello",
content="",
)


def test_add_item_for_unsupported_content_type(tmp_path):
fpath = tmp_path / "test.zim"
# test with incorrect content type
with Creator(fpath, "welcome").config_dev_metadata() as creator:
with pytest.raises(RuntimeError):
creator.add_item_for(
path="welcome",
title="hello",
mimetype="text/plain",
content=123, # pyright: ignore[reportArgumentType]
)


def test_compression(tmp_path):
fpath = tmp_path / "test.zim"
with Creator(
Expand Down Expand Up @@ -508,7 +541,7 @@ def test_check_metadata(tmp_path):


def test_relax_metadata(tmp_path):
Creator(tmp_path, "", disable_metadata_checks=False).config_dev_metadata(
Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
Description="T" * 90
).start()

Expand Down