Skip to content

Commit 7dac807

Browse files
authored
Merge pull request #179 from openzim/drop_control_characters
Drop disallowed control characters and strip blank characters
2 parents 1eddabc + 2c894a9 commit 7dac807

File tree

3 files changed

+68
-0
lines changed

3 files changed

+68
-0
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111

1212
- Add utility function to compute ZIM Tags #164, including deduplication #156
13+
- Metadata now automatically drops control characters and strips blank characters #159
1314

1415
### Fixed
1516

src/zimscraperlib/zim/creator.py

+16
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
import libzim.writer # pyright: ignore
3232
import PIL.Image
33+
import regex
3334

3435
from zimscraperlib import logger
3536
from zimscraperlib.constants import (
@@ -65,6 +66,9 @@
6566
re.MULTILINE | re.DOTALL,
6667
)
6768

69+
# All control characters are disallowed in str metadata except \n, \r and \t
70+
UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")
71+
6872

6973
def mimetype_for(
7074
path: str,
@@ -250,6 +254,11 @@ def add_metadata(
250254
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
251255
mimetype: str = "text/plain;charset=UTF-8",
252256
):
257+
# drop control characters before passing them to libzim
258+
if isinstance(content, str):
259+
content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip(
260+
" \r\n\t"
261+
)
253262
if not self.disable_metadata_checks:
254263
self.validate_metadata(name, content)
255264
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
@@ -304,6 +313,13 @@ def config_metadata(
304313
}
305314
)
306315
self._metadata.update(extras)
316+
for metadata_key, metadata_value in self._metadata.items():
317+
# drop control characters so that proper value is stored in memory and
318+
# logged in DEBUG mode ; also strip blank characters
319+
if isinstance(metadata_value, str):
320+
self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub(
321+
"", metadata_value
322+
).strip(" \r\n\t")
307323
return self
308324

309325
def config_dev_metadata(self, **extras: str):

tests/zim/test_zim_creator.py

+51
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,57 @@ def test_config_metadata(tmp_path, png_image, tags):
724724
assert reader.get_text_metadata("TestMetadata") == "Test Metadata"
725725

726726

727+
def test_config_metadata_control_characters(tmp_path):
728+
fpath = tmp_path / "test_config.zim"
729+
creator = Creator(fpath, "").config_dev_metadata(
730+
Description="\t\n\r\n \tA description \awith \bcontrol characters\v",
731+
LongDescription="A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t",
732+
Creator=" A creator ",
733+
)
734+
assert creator._metadata["Description"] == "A description with control characters"
735+
assert (
736+
creator._metadata["LongDescription"]
737+
== "A description \rwith \ncontrol characters\tsss"
738+
)
739+
assert creator._metadata["Creator"] == "A creator"
740+
with creator:
741+
creator.add_metadata(
742+
"Description_1",
743+
"\t\n\r\n \tA description \awith \bcontrol characters\v",
744+
)
745+
creator.add_metadata(
746+
"LongDescription_1",
747+
"A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t",
748+
)
749+
creator.add_metadata(
750+
"Creator_1",
751+
" A creator ",
752+
)
753+
pass
754+
755+
assert fpath.exists()
756+
757+
reader = Archive(fpath)
758+
assert (
759+
reader.get_text_metadata("Description")
760+
== "A description with control characters"
761+
)
762+
assert (
763+
reader.get_text_metadata("LongDescription")
764+
== "A description \rwith \ncontrol characters\tsss"
765+
)
766+
assert reader.get_text_metadata("Creator") == "A creator"
767+
assert (
768+
reader.get_text_metadata("Description_1")
769+
== "A description with control characters"
770+
)
771+
assert (
772+
reader.get_text_metadata("LongDescription_1")
773+
== "A description \rwith \ncontrol characters\tsss"
774+
)
775+
assert reader.get_text_metadata("Creator_1") == "A creator"
776+
777+
727778
@pytest.mark.parametrize(
728779
"name,value,valid",
729780
[

0 commit comments

Comments
 (0)