Skip to content

Commit 604e16d

Browse files
authored
Merge pull request #136 from openzim/small_fixes
Fix disable_metadata_checks behavior and fix StaticItem support of bytes content
2 parents a25ec60 + d180e35 commit 604e16d

File tree

5 files changed

+82
-26
lines changed

5 files changed

+82
-26
lines changed

src/zimscraperlib/zim/creator.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class Creator(libzim.writer.Creator):
100100
Use workaround_nocancel=False to disable the workaround.
101101
102102
By default, all metadata are validated for compliance with openZIM guidelines and
103-
conventions. Set disable_metadata_checks=False to disable this validation (you can
103+
conventions. Set disable_metadata_checks=True to disable this validation (you can
104104
still do checks manually with the validation methods or your own logic).
105105
"""
106106

@@ -111,7 +111,7 @@ def __init__(
111111
compression: Optional[str] = None,
112112
workaround_nocancel: Optional[bool] = True, # noqa: FBT002
113113
ignore_duplicates: Optional[bool] = False, # noqa: FBT002
114-
disable_metadata_checks: bool = True, # noqa: FBT001, FBT002
114+
disable_metadata_checks: bool = False, # noqa: FBT001, FBT002
115115
):
116116
super().__init__(filename=filename)
117117
self._metadata = {}
@@ -148,7 +148,7 @@ def start(self):
148148
if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
149149
raise ValueError("Mandatory metadata are not all set.")
150150

151-
if self.disable_metadata_checks:
151+
if not self.disable_metadata_checks:
152152
for name, value in self._metadata.items():
153153
if value:
154154
self.validate_metadata(name, value)
@@ -195,7 +195,7 @@ def add_metadata(
195195
content: Union[str, bytes, datetime.date, datetime.datetime, Iterable[str]],
196196
mimetype: str = "text/plain;charset=UTF-8",
197197
):
198-
if self.disable_metadata_checks:
198+
if not self.disable_metadata_checks:
199199
self.validate_metadata(name, content)
200200
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
201201
content = content.strftime("%Y-%m-%d").encode("UTF-8")
@@ -303,14 +303,6 @@ def add_item_for(
303303
if should_compress is not None:
304304
hints[libzim.writer.Hint.COMPRESS] = should_compress
305305

306-
kwargs = {
307-
"path": path,
308-
"title": title or "",
309-
"mimetype": mimetype,
310-
"filepath": fpath if fpath is not None else "",
311-
"hints": hints,
312-
"content": content,
313-
}
314306
if delete_fpath and fpath:
315307
cb = [delete_callback, fpath]
316308
if callback and callable(callback):
@@ -320,7 +312,16 @@ def add_item_for(
320312
callback = tuple(cb)
321313

322314
self.add_item(
323-
StaticItem(**kwargs), callback=callback, duplicate_ok=duplicate_ok
315+
StaticItem(
316+
path=path,
317+
title=title,
318+
mimetype=mimetype,
319+
filepath=fpath,
320+
hints=hints,
321+
content=content,
322+
),
323+
callback=callback,
324+
duplicate_ok=duplicate_ok,
324325
)
325326
return path
326327

src/zimscraperlib/zim/items.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import re
1010
import tempfile
1111
import urllib.parse
12-
from typing import Any, Optional
12+
from typing import Any, Optional, Union
1313

1414
import libzim.writer # pyright: ignore
1515

@@ -34,13 +34,13 @@ def __init__(
3434
**kwargs: Any,
3535
):
3636
super().__init__()
37-
if path:
37+
if path is not None:
3838
kwargs["path"] = path
39-
if title:
39+
if title is not None:
4040
kwargs["title"] = title
41-
if mimetype:
41+
if mimetype is not None:
4242
kwargs["mimetype"] = mimetype
43-
if hints:
43+
if hints is not None:
4444
kwargs["hints"] = hints
4545
for k, v in kwargs.items():
4646
setattr(self, k, v)
@@ -72,7 +72,7 @@ class StaticItem(Item):
7272

7373
def __init__(
7474
self,
75-
content: Optional[str] = None,
75+
content: Optional[Union[str, bytes]] = None,
7676
fileobj: Optional[io.IOBase] = None,
7777
filepath: Optional[pathlib.Path] = None,
7878
path: Optional[str] = None,
@@ -81,11 +81,11 @@ def __init__(
8181
hints: Optional[dict] = None,
8282
**kwargs: Any,
8383
):
84-
if content:
84+
if content is not None:
8585
kwargs["content"] = content
86-
if fileobj:
86+
if fileobj is not None:
8787
kwargs["fileobj"] = fileobj
88-
if filepath:
88+
if filepath is not None:
8989
kwargs["filepath"] = filepath
9090
super().__init__(
9191
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
@@ -95,6 +95,8 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
9595
# content was set manually
9696
content = getattr(self, "content", None)
9797
if content is not None:
98+
if not isinstance(content, (str, bytes)):
99+
raise AttributeError(f"Unexpected type for content: {type(content)}")
98100
return StringProvider(content=content, ref=self)
99101

100102
# using a file-like object
@@ -153,7 +155,7 @@ def __init__(
153155
use_disk: Optional[bool] = None,
154156
**kwargs: Any,
155157
):
156-
if use_disk:
158+
if use_disk is not None:
157159
kwargs["use_disk"] = use_disk
158160
super().__init__(
159161
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs

src/zimscraperlib/zim/providers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def __init__(
3131

3232

3333
class StringProvider(libzim.writer.StringProvider):
34-
def __init__(self, content: str, ref: Optional[object] = None):
34+
def __init__(self, content: Union[str, bytes], ref: Optional[object] = None):
3535
super().__init__(content)
3636
self.ref = ref
3737

tests/zim/conftest.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,26 @@ def html_str():
2424
"""
2525

2626

27+
@pytest.fixture(scope="function")
28+
def html_str_cn():
29+
"""sample HTML content with chinese characters"""
30+
return """<html>
31+
<body>
32+
<ul>
33+
<li><a href="download/toto.pdf">PDF doc in 汉字</a></li>
34+
<li><a href="download/toto.txt">text file</a></li>
35+
<li><a href="dest.html">HTML link</a></li>
36+
<li><a href="no-extension">no ext link</a></li>
37+
<li><a href="http://www.example.com/index/sample.html">external link</a></li>
38+
<li><a href="mailto:[email protected]">e-mail link</a></li>
39+
<li><a media="">no href link</a></li>
40+
<object data="download/toto.jpg" width="300" height="200"></object>
41+
<script src="assets/js/bootstrap/bootsrap.css?v=20190101"></script>
42+
</body>
43+
</html>
44+
"""
45+
46+
2747
@pytest.fixture(scope="function")
2848
def html_file(tmp_path, html_str):
2949
fpath = tmp_path / "test.html"

tests/zim/test_zim_creator.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def get_contentprovider(self):
4545
return FileLikeProvider(self.fileobj)
4646

4747

48-
def test_zim_creator(tmp_path, png_image, html_file, html_str):
48+
def test_zim_creator(tmp_path, png_image, html_file, html_str: str, html_str_cn: str):
4949
fpath = tmp_path / "test.zim"
5050
main_path = "welcome"
5151
tags = ";".join(["toto", "tata"])
@@ -56,6 +56,13 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):
5656
) as creator:
5757
# verbatim HTML from string
5858
creator.add_item_for("welcome", "wel", content=html_str, is_front=True)
59+
# verbatim HTML from bytes
60+
creator.add_item_for(
61+
"welcome1", "wel1", content=html_str.encode(), is_front=True
62+
)
63+
creator.add_item_for(
64+
"welcome2", "wel2", content=html_str_cn.encode("gb2312"), is_front=True
65+
)
5966
# verbatim HTML from file
6067
creator.add_item_for("welcome3", "wel3", fpath=html_file)
6168
creator.add_item_for("welcome4", "wel4", fpath=html_file)
@@ -98,6 +105,8 @@ def test_zim_creator(tmp_path, png_image, html_file, html_str):
98105

99106
# ensure non-rewritten articles have not been rewritten
100107
assert bytes(reader.get_item("welcome").content).decode(UTF8) == html_str
108+
assert bytes(reader.get_item("welcome1").content).decode(UTF8) == html_str
109+
assert bytes(reader.get_item("welcome2").content).decode("gb2312") == html_str_cn
101110
assert bytes(reader.get_item("welcome3").content).decode(UTF8) == html_str
102111

103112
# ensure illustration is present and correct
@@ -180,6 +189,30 @@ def test_add_item_for_delete_fail(tmp_path, png_image):
180189
assert reader.get_item("index")
181190

182191

192+
def test_add_item_empty_content(tmp_path):
193+
fpath = tmp_path / "test.zim"
194+
# test with empty content (an empty string must still be accepted as content)
195+
with Creator(fpath, "welcome").config_dev_metadata() as creator:
196+
creator.add_item_for(
197+
path="welcome",
198+
title="hello",
199+
content="",
200+
)
201+
202+
203+
def test_add_item_for_unsupported_content_type(tmp_path):
204+
fpath = tmp_path / "test.zim"
205+
# test with incorrect content type
206+
with Creator(fpath, "welcome").config_dev_metadata() as creator:
207+
with pytest.raises(RuntimeError):
208+
creator.add_item_for(
209+
path="welcome",
210+
title="hello",
211+
mimetype="text/plain",
212+
content=123, # pyright: ignore[reportArgumentType]
213+
)
214+
215+
183216
def test_compression(tmp_path):
184217
fpath = tmp_path / "test.zim"
185218
with Creator(
@@ -508,7 +541,7 @@ def test_check_metadata(tmp_path):
508541

509542

510543
def test_relax_metadata(tmp_path):
511-
Creator(tmp_path, "", disable_metadata_checks=False).config_dev_metadata(
544+
Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
512545
Description="T" * 90
513546
).start()
514547

0 commit comments

Comments
 (0)