Skip to content

Commit a0a225b

Browse files
authored
Merge pull request #221 from openzim/safe_metadata_revamp
Significantly enhance the safety of metadata manipulation
2 parents 76a6408 + 7e2efa1 commit a0a225b

20 files changed

+1671
-664
lines changed

CHANGELOG.md

+18
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
- Renamed `filesystem.validate_zimfile_creatable` to `filesystem.file_creatable` to reflect general applicability to check file creation beyond ZIM files #200
1313
- Remove any "ZIM" reference in exceptions while working with files #200
14+
- Significantly enhance the safety of metadata manipulation (#205)
15+
- add types for all metadata, one type per metadata name plus some generic ones for non-standard metadata
16+
- all types are responsible to validate metadata value at initialization time
17+
- validation checks for adherence to the ZIM specification and conventions are automated
18+
- cleanup of unwanted control characters and stripping of whitespace characters are **automated in all text metadata**
19+
- whenever possible, try to **automatically clean a "reasonably" bad metadata** (e.g. automatically accept and remove duplicate tags - harmless - but not duplicate language codes - codes are supposed to be ordered, so it is a weird situation) ; this is an alignment of paradigm, because for some metadata the lib was permissive, while for others it was quite restrictive ; this PR tries to align this and **make the lib as permissive as possible**, to avoid failing a scraper for something which could be automatically fixed
20+
- it is now possible to disable ZIM conventions checks with `zim.metadata.check_metadata_conventions`
21+
- simplify `zim.creator.Creator.config_metadata` by using these types and being stricter:
22+
- add new `StandardMetadata` class for standard metadata, including the list of mandatory ones
23+
- by default, all non-standard metadata must start with `X-` prefix
24+
- this is not yet an openZIM convention / specification, so it is possible to disable this check with `fail_on_missing_prefix` argument
25+
- simplify `add_metadata`, use same metadata types
26+
- simplify `zim.creator.Creator.start` with new types, and drop all metadata from memory after being passed to the libzim
27+
- drop `zim.creator.convert_and_check_metadata` (not useful anymore, simply use proper metadata type)
28+
- move `MANDATORY_ZIM_METADATA_KEYS` and `DEFAULT_DEV_ZIM_METADATA` from `constants` to `zim.metadata` to avoid circular dependencies
29+
- new `inputs.unique_values` utility function to compute the list of unique values from a given list while preserving the initial list order
30+
- in `__init__` of `zim.creator.Creator`, rename `disable_metadata_checks` to `check_metadata_conventions` for clarity and brevity
31+
- beware that this manipulates the global `zim.metadata.check_metadata_conventions`, so if you have many creators running in parallel, they can't have different settings; the last one initialized will "win"
1432

1533
### Added
1634

pyproject.toml

+9-6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"regex>=2020.7.14",
2727
"pymupdf>=1.24.0,<2.0",
2828
"CairoSVG>=2.2.0,<3.0",
29+
"beartype==0.19.0",
2930
# youtube-dl should be updated as frequently as possible
3031
"yt-dlp"
3132
]
@@ -52,19 +53,19 @@ scripts = [
5253
]
5354
lint = [
5455
"black==24.10.0",
55-
"ruff==0.7.0",
56+
"ruff==0.8.2",
5657
]
5758
check = [
58-
"pyright==1.1.385",
59-
"pytest==8.3.3",
59+
"pyright==1.1.390",
60+
"pytest==8.3.4",
6061
]
6162
test = [
62-
"pytest==8.3.3",
63+
"pytest==8.3.4",
6364
"pytest-mock==3.14.0",
64-
"coverage==7.5.3",
65+
"coverage==7.6.9",
6566
]
6667
dev = [
67-
"ipython==8.25.0",
68+
"ipython==8.30.0",
6869
"pre-commit==4.0.1",
6970
"zimscraperlib[scripts]",
7071
"zimscraperlib[lint]",
@@ -252,6 +253,8 @@ ban-relative-imports = "all"
252253
"tests/**/*" = ["PLR2004", "S101", "TID252"]
253254
# _libkiwix mimics libkiwix C++ code, names obey C++ conventions
254255
"src/zimscraperlib/zim/_libkiwix.py" = ["N802", "N803", "N806"]
256+
# beartype must be first
257+
"src/zimscraperlib/zim/__init__.py" = ["E402"]
255258

256259
[tool.pytest.ini_options]
257260
minversion = "7.3"

src/zimscraperlib/constants.py

-29
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#!/usr/bin/env python3
22
# vim: ai ts=4 sts=4 et sw=4 nu
33

4-
import base64
54
import pathlib
65
import re
76

@@ -22,34 +21,6 @@
2221
# list of mimetypes we consider articles using it should default to FRONT_ARTICLE
2322
FRONT_ARTICLE_MIMETYPES = ["text/html"]
2423

25-
# list of mandatory meta tags of the zim file.
26-
MANDATORY_ZIM_METADATA_KEYS = [
27-
"Name",
28-
"Title",
29-
"Creator",
30-
"Publisher",
31-
"Date",
32-
"Description",
33-
"Language",
34-
"Illustration_48x48@1",
35-
]
36-
37-
DEFAULT_DEV_ZIM_METADATA = {
38-
"Name": "Test Name",
39-
"Title": "Test Title",
40-
"Creator": "Test Creator",
41-
"Publisher": "Test Publisher",
42-
"Date": "2023-01-01",
43-
"Description": "Test Description",
44-
"Language": "fra",
45-
# blank 48x48 transparent PNG
46-
"Illustration_48x48_at_1": base64.b64decode(
47-
"iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB"
48-
"ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN"
49-
"SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC"
50-
),
51-
}
52-
5324
RECOMMENDED_MAX_TITLE_LENGTH = 30
5425
MAXIMUM_DESCRIPTION_METADATA_LENGTH = 80
5526
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH = 4000

src/zimscraperlib/filesystem.py

+2-12
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
import os
1111
import pathlib
12-
from collections.abc import Callable
13-
from typing import Any
1412

1513
import magic
1614

@@ -44,15 +42,7 @@ def get_content_mimetype(content: bytes | str) -> str:
4442
return MIME_OVERRIDES.get(detected_mime, detected_mime)
4543

4644

47-
def delete_callback(
48-
fpath: str | pathlib.Path,
49-
callback: Callable | None = None,
50-
*callback_args: Any,
51-
):
52-
"""helper deleting passed filepath, optionnaly calling an additional callback"""
45+
def delete_callback(fpath: str | pathlib.Path):
46+
"""helper deleting passed filepath"""
5347

5448
os.unlink(fpath)
55-
56-
# call the callback if requested
57-
if callback and callable(callback):
58-
callback.__call__(*callback_args)

src/zimscraperlib/image/conversion.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def convert_image(
3737
if not fmt:
3838
raise ValueError("Impossible to guess destination image format")
3939
with pilopen(src) as image:
40-
if image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED or colorspace:
40+
if (image.mode == "RGBA" and fmt in ALPHA_NOT_SUPPORTED) or colorspace:
4141
image = image.convert(colorspace or "RGB") # noqa: PLW2901
4242
save_image(image, dst, fmt, **params)
4343

src/zimscraperlib/inputs.py

+5
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,8 @@ def compute_tags(
136136
return {
137137
tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag
138138
}
139+
140+
141+
def unique_values(items: list) -> list:
142+
"""Return unique values in input list while preserving list order"""
143+
return list(dict.fromkeys(items))

src/zimscraperlib/typing.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Callable
4+
from typing import Any, NamedTuple
5+
6+
7+
class Callback(NamedTuple):
8+
func: Callable
9+
args: tuple[Any, ...] | None = None
10+
kwargs: dict[str, Any] | None = None
11+
12+
@property
13+
def callable(self) -> bool:
14+
return callable(self.func)
15+
16+
def get_args(self) -> tuple[Any, ...]:
17+
return self.args or ()
18+
19+
def get_kwargs(self) -> dict[str, Any]:
20+
return self.kwargs or {}
21+
22+
def call_with(self, *args, **kwargs):
23+
self.func.__call__(*args, **kwargs)
24+
25+
def call(self):
26+
self.call_with(*self.get_args(), **self.get_kwargs())

src/zimscraperlib/zim/__init__.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@
99
zim.items: item to add to creator
1010
zim.archive: read ZIM files, accessing or searching its content"""
1111

12+
from beartype.claw import beartype_this_package
1213
from libzim.writer import Blob # pyright: ignore
1314

15+
beartype_this_package()
16+
1417
from zimscraperlib.zim.archive import Archive
1518
from zimscraperlib.zim.creator import Creator
1619
from zimscraperlib.zim.filesystem import make_zim_file
@@ -24,14 +27,14 @@
2427

2528
__all__ = [
2629
"Archive",
30+
"Blob",
2731
"Creator",
28-
"make_zim_file",
32+
"FileLikeProvider",
33+
"FileProvider",
2934
"Item",
3035
"StaticItem",
31-
"URLItem",
32-
"FileProvider",
3336
"StringProvider",
34-
"FileLikeProvider",
37+
"URLItem",
3538
"URLProvider",
36-
"Blob",
39+
"make_zim_file",
3740
]

src/zimscraperlib/zim/_libkiwix.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@
1515
from __future__ import annotations
1616

1717
import io
18-
from collections import namedtuple
18+
from typing import NamedTuple
1919

20-
MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"])
21-
CounterMap = dict[
22-
type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value) # pyright: ignore
23-
]
20+
21+
class MimetypeAndCounter(NamedTuple):
22+
mimetype: str
23+
value: int
24+
25+
26+
type CounterMap = dict[str, int]
2427

2528

2629
def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]:

src/zimscraperlib/zim/archive.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import libzim.search # Query, Searcher # pyright: ignore
1818
import libzim.suggestion # SuggestionSearcher # pyright: ignore
1919

20-
from zimscraperlib.zim._libkiwix import convertTags, parseMimetypeCounter
20+
from zimscraperlib.zim._libkiwix import CounterMap, convertTags, parseMimetypeCounter
2121

2222

2323
class Archive(libzim.reader.Archive):
@@ -101,7 +101,7 @@ def get_search_results_count(self, query: str) -> int:
101101
return search.getEstimatedMatches()
102102

103103
@property
104-
def counters(self) -> dict[str, int]:
104+
def counters(self) -> CounterMap:
105105
try:
106106
return parseMimetypeCounter(self.get_text_metadata("Counter"))
107107
except RuntimeError: # pragma: no cover (no ZIM avail to test itl)

0 commit comments

Comments
 (0)