Skip to content

Commit efc2942

Browse files
authored
Merge pull request #182 from openzim/pdf_support
Add indexdata + automatic indexing of PDF items
2 parents 7dac807 + 31558d0 commit efc2942

13 files changed

+48055
-3
lines changed

.pre-commit-config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# See https://pre-commit.com for more information
22
# See https://pre-commit.com/hooks.html for more hooks
3+
exclude: ^tests/files # these are raw test files, no need to mess with them
34
repos:
45
- repo: https://github.com/pre-commit/pre-commit-hooks
56
rev: v4.5.0

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1111

1212
- Add utility function to compute ZIM Tags #164, including deduplication #156
1313
- Metadata does not automatically drop control characters #159
14+
- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
15+
- Automatically index PDF documents content #167
16+
- Automatically set proper title on PDF documents #168
1417

1518
### Fixed
1619

pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ dependencies = [
1818
"beautifulsoup4>=4.9.3,<5.0",
1919
"lxml>=4.6.3,<6.0",
2020
"optimize-images>=1.3.6,<2.0",
21-
# regex has nNo upper-bound due to "date-based" release numbers, no semver, so their
21+
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
2222
# promise is that they will never (or always) break the API, and the API is very
2323
# limited and we use only a very small subset of it.
2424
"regex>=2020.7.14",
25+
"pymupdf>=1.24.0,<2.0",
2526
# youtube-dl should be updated as frequently as possible
2627
"yt-dlp"
2728
]

src/zimscraperlib/zim/creator.py

+6
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
)
4646
from zimscraperlib.i18n import is_valid_iso_639_3
4747
from zimscraperlib.types import get_mime_for_name
48+
from zimscraperlib.zim.indexing import IndexData
4849
from zimscraperlib.zim.items import StaticItem
4950
from zimscraperlib.zim.metadata import (
5051
validate_counter,
@@ -340,6 +341,9 @@ def add_item_for(
340341
delete_fpath: bool | None = False, # noqa: FBT002
341342
duplicate_ok: bool | None = None,
342343
callback: Callable | tuple[Callable, Any] | None = None,
344+
index_data: IndexData | None = None,
345+
*,
346+
auto_index: bool = True,
343347
):
344348
"""Add a File or content at a specified path and get its path
345349
@@ -388,6 +392,8 @@ def add_item_for(
388392
filepath=fpath,
389393
hints=hints,
390394
content=content,
395+
index_data=index_data,
396+
auto_index=auto_index,
391397
),
392398
callback=callback,
393399
duplicate_ok=duplicate_ok,

src/zimscraperlib/zim/indexing.py

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
""" Special item with customized index data and helper classes """
2+
3+
from __future__ import annotations
4+
5+
import io
6+
import pathlib
7+
8+
import libzim.writer # pyright: ignore
9+
10+
try:
11+
import pymupdf
12+
except ImportError: # pragma: no cover
13+
import fitz as pymupdf # pymupdf main module was named fitz before 1.24.3
14+
15+
from zimscraperlib import logger
16+
17+
18+
class IndexData(libzim.writer.IndexData):
    """Container for the title, content and keywords libzim should index

    Both title and content have to be supplied (the indexing title may or may
    not be the same as the item title).
    keywords is optional since it can be empty.
    wordcount is optional; if not passed (or falsy), it is automatically
    computed from content as its number of whitespace-separated words.
    """

    def __init__(
        self, title: str, content: str, keywords: str = "", wordcount: int | None = None
    ):
        # wordcount must exist before content is assigned: the content setter
        # reads it to decide whether a count has to be derived from the text
        self.wordcount = wordcount
        self.title = title
        self.keywords = keywords
        self.content = content

    @property
    def content(self):
        return self._content

    @content.setter
    def content(self, value: str):
        self._content = value
        if self.wordcount:
            # a non-zero word count is already known, keep it
            return
        self.wordcount = len(value.split()) if value else 0

    def has_indexdata(self) -> bool:
        # there is something to index as soon as content or title is non-empty
        return any(len(text) > 0 for text in (self.content, self.title))

    def get_title(self) -> str:
        return self.title

    def get_content(self) -> str:
        return self.content

    def get_keywords(self) -> str:
        return self.keywords

    def get_wordcount(self) -> int:
        return self.wordcount if self.wordcount else 0
60+
61+
62+
# MuPDF warnings which are known to be irrelevant for text extraction and are
# therefore not reported in the logs
IGNORED_MUPDF_MESSAGES = [
    "lcms: not an ICC profile, invalid signature.",
    "format error: cmsOpenProfileFromMem failed",
    "ignoring broken ICC profile",
]


def get_pdf_index_data(
    *,
    content: str | bytes | None = None,
    fileobj: io.BytesIO | None = None,
    filepath: pathlib.Path | None = None,
) -> IndexData:
    """Returns the IndexData information for a given PDF

    PDF can be passed either as content or fileobject or filepath. The first
    non-empty one, in that order, is used; when neither content nor fileobj is
    set, filepath is opened.

    The indexing title is built from the PDF "title", "author" and "subject"
    metadata (those which are set), joined with " - ". The indexing content is
    the text of all pages, joined with newlines.

    MuPDF warnings not listed in IGNORED_MUPDF_MESSAGES are logged as a warning.
    """

    # do not display all pymupdf errors, we will filter them afterwards
    pymupdf.TOOLS.mupdf_display_errors(False)

    if content:
        # pymupdf only opens binary streams: encode textual content the same way
        # it is encoded for mimetype detection
        source = {
            "stream": content.encode("utf-8") if isinstance(content, str) else content
        }
    elif fileobj:
        source = {"stream": fileobj}
    else:
        source = {"filename": filepath}

    # use the document as a context manager so that it is closed once text and
    # metadata have been extracted, even if extraction fails
    with pymupdf.open(**source) as doc:
        metadata = doc.metadata or {}
        title = " - ".join(
            metadata[key] for key in ("title", "author", "subject") if metadata.get(key)
        )
        text = "\n".join(
            page.get_text() for page in doc  # pyright: ignore[reportAttributeAccessIssue]
        )

    # build list of messages and filter messages which are known to not be relevant
    # in our use-case
    mupdf_messages = "\n".join(
        warning
        for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
        if warning not in IGNORED_MUPDF_MESSAGES
    )

    if mupdf_messages:
        logger.warning(
            f"PyMuPDF issues:\n{mupdf_messages}"
        )  # pragma: no cover (no known error in test PDFs)

    return IndexData(
        title=title,
        content=text,
    )

src/zimscraperlib/zim/items.py

+77-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import libzim.writer # pyright: ignore
1616

1717
from zimscraperlib.download import stream_file
18+
from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype
19+
from zimscraperlib.zim.indexing import IndexData, get_pdf_index_data
1820
from zimscraperlib.zim.providers import (
1921
FileLikeProvider,
2022
FileProvider,
@@ -69,7 +71,17 @@ class StaticItem(Item):
6971
Sets a `ref` to itself on the File/String content providers so it outlives them
7072
We need Item to survive its ContentProvider so that we can track lifecycle
7173
more efficiently: now when the libzim destroys the CP, python will destroy
72-
the Item and we can be notified that we're effectively through with our content"""
74+
the Item and we can be notified that we're effectively through with our content
75+
76+
By default, content is automatically indexed (either by the libzim itself for
77+
supported documents - text or html for now - or by the python-scraperlib - only PDF
78+
supported for now). If you do not want this, set `auto_index` to False to disable
79+
both kinds of indexing (libzim and python-scraperlib).
80+
81+
It is also possible to pass index_data to configure custom indexing of the item.
82+
83+
If item title is not set by caller, it is automatically populated from index_data.
84+
"""
7385

7486
def __init__(
7587
self,
@@ -80,6 +92,9 @@ def __init__(
8092
title: str | None = None,
8193
mimetype: str | None = None,
8294
hints: dict | None = None,
95+
index_data: IndexData | None = None,
96+
*,
97+
auto_index: bool = True,
8398
**kwargs: Any,
8499
):
85100
if content is not None:
@@ -91,6 +106,20 @@ def __init__(
91106
super().__init__(
92107
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
93108
)
109+
if index_data:
110+
self.get_indexdata = lambda: index_data
111+
elif not auto_index:
112+
self.get_indexdata = lambda: IndexData("", "") # index nothing
113+
else:
114+
self._get_auto_index() # consider to add auto index
115+
116+
# Populate item title from index data if title is not set by caller
117+
if (
118+
(not hasattr(self, "title") or not self.title)
119+
and hasattr(self, "get_indexdata")
120+
and self.get_indexdata().get_title()
121+
):
122+
self.title = self.get_indexdata().get_title()
94123

95124
def get_contentprovider(self) -> libzim.writer.ContentProvider:
96125
# content was set manually
@@ -116,6 +145,53 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
116145

117146
raise NotImplementedError("No data to provide`")
118147

148+
def _get_auto_index(self):
    """Populate item index data automatically from the item's own data

    Sources are looked at in this order: `content`, then `fileobj`, then
    `filepath`. Only the first one which is set is considered, so that the
    index data always describes that source and never a lower-priority one.

    If this source is detected as a PDF (`application/pdf`), `get_indexdata`
    is set on the item to return the IndexData extracted from the PDF.
    Otherwise nothing is set on the item.

    Raises RuntimeError if the source is not of the expected type.
    """

    # content was set manually
    content = getattr(self, "content", None)
    if content is not None:
        if not isinstance(content, (str, bytes)):
            raise RuntimeError(
                f"Unexpected type for content: {type(content)}"
            )  # pragma: no cover
        mimetype = get_content_mimetype(
            content.encode("utf-8") if isinstance(content, str) else content
        )
        if mimetype == "application/pdf":
            index_data = get_pdf_index_data(content=content)
            self.get_indexdata = lambda: index_data
        # content has been handled, do not look at other sources
        return

    # using a file-like object
    fileobj = getattr(self, "fileobj", None)
    if fileobj:
        if not isinstance(fileobj, io.BytesIO):
            raise RuntimeError(
                f"Unexpected type for content: {type(fileobj)}"
            )  # pragma: no cover
        mimetype = get_content_mimetype(fileobj.getvalue())
        if mimetype == "application/pdf":
            index_data = get_pdf_index_data(fileobj=fileobj)
            self.get_indexdata = lambda: index_data
        # fileobj has been handled, do not look at filepath
        return

    # using a file path
    filepath = getattr(self, "filepath", None)
    if filepath:
        if not isinstance(filepath, pathlib.Path):
            raise RuntimeError(
                f"Unexpected type for content: {type(filepath)}"
            )  # pragma: no cover
        mimetype = get_file_mimetype(filepath)
        if mimetype == "application/pdf":
            index_data = get_pdf_index_data(filepath=filepath)
            self.get_indexdata = lambda: index_data
194+
119195

120196
class URLItem(StaticItem):
121197
"""StaticItem to automatically fetch and feed an URL resource

tests/conftest.py

+26
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,32 @@ def webp_image():
132132
return file_src("ninja.webp")
133133

134134

135+
@pytest.fixture(scope="module")
def encrypted_pdf_file():
    """Return the `encrypted.pdf` test file, as resolved by `file_src`

    encrypted.pdf is a PDF encrypted with only an owner password (restricting
    edit/print). We want to be sure we are also capable of indexing this kind of
    PDF document, since it is readable by most popular readers without any issue
    (view is unrestricted).
    """
    fname = "encrypted.pdf"
    return file_src(fname)
144+
145+
146+
@pytest.fixture(scope="module")
def encrypted_pdf_content():
    """Return the `encrypted.txt` test file, as resolved by `file_src`

    Presumably the text expected to be extracted from encrypted.pdf - confirm
    against the tests using this fixture.
    """
    fname = "encrypted.txt"
    return file_src(fname)
149+
150+
151+
@pytest.fixture(scope="module")
def big_pdf_file():
    """Return the `milderm.pdf` test file, as resolved by `file_src`"""
    fname = "milderm.pdf"
    return file_src(fname)
154+
155+
156+
@pytest.fixture(scope="module")
def big_pdf_content():
    """Return the `milderm.txt` test file, as resolved by `file_src`

    Presumably the text expected to be extracted from milderm.pdf - confirm
    against the tests using this fixture.
    """
    fname = "milderm.txt"
    return file_src(fname)
159+
160+
135161
@pytest.fixture(scope="module")
136162
def valid_user_agent():
137163
return "name/version (contact)"

tests/files/encrypted.pdf

21.1 KB
Binary file not shown.

tests/files/encrypted.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Placeholder Documentation
2+
This document is a placeholder for the appropriate documentation.
3+

tests/files/milderm.pdf

2.73 MB
Binary file not shown.

0 commit comments

Comments
 (0)