Skip to content

Commit 2d2c547

Browse files
committed
Revisit typing around add_metadata and validate_metadata + create new convert_and_check_metadata
1 parent 286fd07 commit 2d2c547

File tree

3 files changed

+42
-27
lines changed

3 files changed

+42
-27
lines changed

Diff for: CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1515
- Automatically index PDF documents content #167
1616
- Automatically set proper title on PDF documents #168
1717
- Expose new `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
18+
- Add `optimization.get_optimization_method` to get the proper optimization method to call for a given image format
19+
- New `creator.Creator.convert_and_check_metadata` to convert metadata to bytes or str for known use cases and check that the proper type is passed to libzim
1820

1921
## Changed
2022

@@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2325
- **BREAKING** Force all boolean arguments (and some other non-obvious parameters) to be keyword-only in function calls for clarity / disambiguation (see ruff rule FBT002)
2426
- Prefer to use `IO[bytes]` to `io.BytesIO` when possible since it is more generic
2527
- **BREAKING** `i18n.NotFound` renamed `i18n.NotFoundError`
28+
- **BREAKING** `types.get_mime_for_name` now returns `str | None`
29+
- **BREAKING** `creator.Creator.add_metadata` and `creator.Creator.validate_metadata` now only accept `bytes | str` as value (the value must be converted before the call)
30+
- **BREAKING** the second argument of `creator.Creator.add_metadata` has been renamed from `content` to `value` to align with other methods
31+
- When a type issue arises in metadata checks, the type of the offending value is now displayed in the exception message
2632

2733
### Fixed
2834

Diff for: src/zimscraperlib/zim/creator.py

+33-22
Original file line numberDiff line numberDiff line change
@@ -226,22 +226,14 @@ def start(self):
226226
del self._metadata["Illustration_48x48@1"]
227227
for name, value in self._metadata.items():
228228
if value:
229-
self.add_metadata(name, value)
229+
self.add_metadata(name, self.convert_and_check_metadata(name, value))
230230

231231
return self
232232

233233
def validate_metadata(
234234
self,
235235
name: str,
236-
value: (
237-
int
238-
| float
239-
| bytes
240-
| str
241-
| datetime.datetime
242-
| datetime.date
243-
| Iterable[str]
244-
),
236+
value: bytes | str,
245237
):
246238
"""Ensures metadata value for name conforms to the openZIM spec on Metadata
247239
@@ -260,10 +252,37 @@ def validate_metadata(
260252
validate_tags(name, value) # pyright: ignore
261253
validate_illustrations(name, value) # pyright: ignore
262254

255+
def convert_and_check_metadata(
256+
self,
257+
name: str,
258+
value: str | bytes | datetime.date | datetime.datetime | Iterable[str],
259+
) -> str | bytes:
260+
"""Convert metadata to the appropriate type for a few known use cases and check its type
261+
262+
Date: converts date and datetime to string YYYY-MM-DD
263+
Tags: converts iterable to string with semi-colon separator
264+
265+
Also checks that final type is appropriate for libzim (str or bytes)
266+
"""
267+
if name == "Date" and isinstance(value, (datetime.date, datetime.datetime)):
268+
value = value.strftime("%Y-%m-%d")
269+
if (
270+
name == "Tags"
271+
and not isinstance(value, str)
272+
and not isinstance(value, bytes)
273+
and isinstance(value, Iterable)
274+
):
275+
value = ";".join(value)
276+
277+
if not isinstance(value, str) and not isinstance(value, bytes):
278+
raise ValueError(f"Invalid type for {name}: {type(value)}")
279+
280+
return value
281+
263282
def add_metadata(
264283
self,
265284
name: str,
266-
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
285+
value: str | bytes,
267286
mimetype: str = "text/plain;charset=UTF-8",
268287
):
269288
# drop control characters before passing them to libzim
@@ -272,17 +291,9 @@ def add_metadata(
272291
" \r\n\t"
273292
)
274293
if not self.disable_metadata_checks:
275-
self.validate_metadata(name, content)
276-
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
277-
content = content.strftime("%Y-%m-%d").encode("UTF-8")
278-
if (
279-
name == "Tags"
280-
and not isinstance(content, str)
281-
and not isinstance(content, bytes)
282-
and isinstance(content, Iterable)
283-
):
284-
content = ";".join(content)
285-
super().add_metadata(name, content, mimetype)
294+
self.validate_metadata(name, value)
295+
296+
super().add_metadata(name, value, mimetype)
286297

287298
# there are many N803 problems, but they are intentional to match real tag name
288299
def config_metadata(

Diff for: src/zimscraperlib/zim/metadata.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ def validate_required_values(name: str, value: Any):
3131

3232
def validate_standard_str_types(
3333
name: str,
34-
value: (
35-
int | float | bytes | str | datetime.datetime | datetime.date | Iterable[str]
36-
),
34+
value: str | bytes,
3735
):
3836
"""ensures standard string metadata are indeed str"""
3937
if name in (
@@ -50,7 +48,7 @@ def validate_standard_str_types(
5048
"Source",
5149
"Scraper",
5250
) and not isinstance(value, str):
53-
raise ValueError(f"Invalid type for {name}")
51+
raise ValueError(f"Invalid type for {name}: {type(value)}")
5452

5553

5654
def validate_title(name: str, value: str):
@@ -63,7 +61,7 @@ def validate_date(name: str, value: datetime.datetime | datetime.date | str):
6361
"""ensures Date metadata can be cast to an ISO 8601 string"""
6462
if name == "Date":
6563
if not isinstance(value, (datetime.datetime, datetime.date, str)):
66-
raise ValueError(f"Invalid type for {name}.")
64+
raise ValueError(f"Invalid type for {name}: {type(value)}")
6765
elif isinstance(value, str):
6866
match = re.match(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", value)
6967
if not match:

0 commit comments

Comments
 (0)