15
15
import libzim .writer # pyright: ignore
16
16
17
17
from zimscraperlib .download import stream_file
18
+ from zimscraperlib .filesystem import get_content_mimetype , get_file_mimetype
19
+ from zimscraperlib .zim .indexing import IndexData , get_pdf_index_data
18
20
from zimscraperlib .zim .providers import (
19
21
FileLikeProvider ,
20
22
FileProvider ,
@@ -69,7 +71,17 @@ class StaticItem(Item):
69
71
Sets a `ref` to itself on the File/String content providers so it outlives them
70
72
We need Item to survive its ContentProvider so that we can track lifecycle
71
73
more efficiently: now when the libzim destroys the CP, python will destroy
72
- the Item and we can be notified that we're effectively through with our content"""
74
+ the Item and we can be notified that we're effectively through with our content
75
+
76
+ By default, content is automatically indexed (either by the libzim itself for
77
+ supported documents - text or html for now - or by the python-scraperlib - only PDF
78
+ supported for now). If you do not want this, set `auto_index` to False to disable
79
+ both kinds of indexing (libzim and python-scraperlib).
80
+
81
+ It is also possible to pass index_data to configure custom indexing of the item.
82
+
83
+ If item title is not set by caller, it is automatically populated from index_data.
84
+ """
73
85
74
86
def __init__ (
75
87
self ,
@@ -80,6 +92,9 @@ def __init__(
80
92
title : str | None = None ,
81
93
mimetype : str | None = None ,
82
94
hints : dict | None = None ,
95
+ index_data : IndexData | None = None ,
96
+ * ,
97
+ auto_index : bool = True ,
83
98
** kwargs : Any ,
84
99
):
85
100
if content is not None :
@@ -91,6 +106,20 @@ def __init__(
91
106
super ().__init__ (
92
107
path = path , title = title , mimetype = mimetype , hints = hints , ** kwargs
93
108
)
109
+ if index_data :
110
+ self .get_indexdata = lambda : index_data
111
+ elif not auto_index :
112
+ self .get_indexdata = lambda : IndexData ("" , "" ) # index nothing
113
+ else :
114
+ self ._get_auto_index () # consider to add auto index
115
+
116
+ # Populate item title from index data if title is not set by caller
117
+ if (
118
+ (not hasattr (self , "title" ) or not self .title )
119
+ and hasattr (self , "get_indexdata" )
120
+ and self .get_indexdata ().get_title ()
121
+ ):
122
+ self .title = self .get_indexdata ().get_title ()
94
123
95
124
def get_contentprovider (self ) -> libzim .writer .ContentProvider :
96
125
# content was set manually
@@ -116,6 +145,53 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
116
145
117
146
raise NotImplementedError ("No data to provide`" )
118
147
148
def _get_auto_index(self):
    """Populate item index data automatically from the item's content.

    The first available source is inspected, in this order: `content`,
    `fileobj`, then `filepath`. That source is authoritative: if it is
    detected as a PDF, `get_indexdata` is set on the instance to return the
    index data computed by `get_pdf_index_data`; if it is anything else,
    nothing is set. Later sources are never consulted once an earlier one
    exists, so index data cannot be computed from a different source than
    the first one found.

    This method only sets `get_indexdata`; it does not touch the title.

    Raises:
        RuntimeError: if the inspected source has an unexpected type.
    """

    # content was set manually
    content = getattr(self, "content", None)
    if content is not None:
        if not isinstance(content, (str, bytes)):
            raise RuntimeError(
                f"Unexpected type for content: {type(content)}"
            )  # pragma: no cover
        # mimetype detection works on bytes; the original content (str or
        # bytes) is what gets handed to the PDF indexer
        raw = content.encode("utf-8") if isinstance(content, str) else content
        if get_content_mimetype(raw) == "application/pdf":
            index_data = get_pdf_index_data(content=content)
            self.get_indexdata = lambda: index_data
        # content is authoritative whether or not it is a PDF
        return

    # using a file-like object
    fileobj = getattr(self, "fileobj", None)
    if fileobj:
        if not isinstance(fileobj, io.BytesIO):
            raise RuntimeError(
                f"Unexpected type for content: {type(fileobj)}"
            )  # pragma: no cover
        if get_content_mimetype(fileobj.getvalue()) == "application/pdf":
            index_data = get_pdf_index_data(fileobj=fileobj)
            self.get_indexdata = lambda: index_data
        # fileobj is authoritative whether or not it is a PDF
        return

    # using a file path
    filepath = getattr(self, "filepath", None)
    if filepath:
        if not isinstance(filepath, pathlib.Path):
            raise RuntimeError(
                f"Unexpected type for content: {type(filepath)}"
            )  # pragma: no cover
        if get_file_mimetype(filepath) == "application/pdf":
            index_data = get_pdf_index_data(filepath=filepath)
            self.get_indexdata = lambda: index_data
194
+
119
195
120
196
class URLItem (StaticItem ):
121
197
"""StaticItem to automatically fetch and feed an URL resource
0 commit comments