-
-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathproviders.py
95 lines (72 loc) · 2.98 KB
/
providers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
""" libzim Providers accepting a `ref` arg to keep it away from garbage collection
Use case is to pass it the Item instance that created the Provider so that the
Item lives longer than the provider, thus allowing:
- to keep a single copy of the data if it is to be indexed
(and thus Provider instanced twice)
- to release whatever needs to be once we know data won't be fetched anymore """
import io
import pathlib
from collections.abc import Generator
import libzim.writer # pyright: ignore[reportMissingModuleSource]
import requests
from zimscraperlib.download import get_retry_adapter, stream_file
class FileProvider(libzim.writer.FileProvider):
    """File-backed provider that additionally holds on to a `ref` object.

    Only `filepath` is forwarded to the libzim FileProvider. `size` is part of
    the signature but is not used by this class.
    """

    def __init__(
        self,
        filepath: pathlib.Path,
        size: int | None = None,  # noqa: ARG002
        ref: object | None = None,
    ):
        super().__init__(filepath)
        # Storing `ref` on the instance keeps the referenced object reachable
        # for at least as long as this provider is.
        self.ref = ref
class StringProvider(libzim.writer.StringProvider):
    """In-memory content provider that additionally holds on to a `ref` object.

    `content` is forwarded untouched to the libzim StringProvider.
    """

    def __init__(self, content: str | bytes, ref: object | None = None):
        super().__init__(content)
        # Storing `ref` on the instance keeps the referenced object reachable
        # for at least as long as this provider is.
        self.ref = ref
class FileLikeProvider(libzim.writer.ContentProvider):
    """Provider referencing a file-like object

    Use this to keep a single copy of a content in memory.
    Useful for indexed content"""

    def __init__(
        self,
        fileobj: io.BytesIO,
        size: int | None = None,
        ref: object | None = None,
    ):
        super().__init__()
        self.ref = ref
        self.fileobj = fileobj
        if size is not None:
            self.size = size
        else:
            # No size supplied: seeking to the end returns the new absolute
            # position, i.e. the length of the buffer. Rewind afterwards.
            self.size = self.fileobj.seek(0, io.SEEK_END)
            self.fileobj.seek(0, io.SEEK_SET)

    def get_size(self) -> int:
        """Return the stored size, or -1 should the attribute be missing."""
        return getattr(self, "size", -1)

    def gen_blob(self) -> Generator[libzim.writer.Blob]:
        """Yield the whole buffer as one single Blob."""
        payload = self.fileobj.getvalue()
        yield libzim.writer.Blob(payload)  # pragma: no cover
class URLProvider(libzim.writer.ContentProvider):
    """Provider downloading content as it is consumed by the libzim

    Useful for non-indexed content for which feed() is called only once.

    The HTTP request is issued (and its status checked) in the constructor;
    the body is then streamed chunk by chunk from `gen_blob()`.
    """

    def __init__(self, url: str, size: int | None = None, ref: object | None = None):
        """Open a streaming GET request on `url`.

        Args:
            url: address of the content to download.
            size: size of the content in bytes. When None, it is looked up
                via `get_size_of(url)`, which issues a separate request.
            ref: any object to keep alive for as long as this provider lives.

        Raises:
            requests.HTTPError: if the response status is an error
                (from `raise_for_status()`).
        """
        super().__init__()
        self.url = url
        self.size = size if size is not None else self.get_size_of(url)
        self.ref = ref

        session = requests.Session()
        # requests picks the adapter with the longest matching prefix and a new
        # Session already has adapters on "http://" and "https://": mounting on
        # the shorter "http" would never be selected. Mount on both full scheme
        # prefixes so the retry adapter is actually used.
        retry_adapter = get_retry_adapter()
        for prefix in ("http://", "https://"):
            session.mount(prefix, retry_adapter)
        self.resp = session.get(url, stream=True)
        self.resp.raise_for_status()

    @staticmethod
    def get_size_of(url: str) -> int | None:
        """Return the Content-Length advertised for `url`, or None if unusable.

        Only the first block of the resource is downloaded (into a throw-away
        buffer) in order to obtain the response headers.
        """
        _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
        try:
            return int(headers["Content-Length"])
        except (KeyError, TypeError, ValueError):
            # header missing or not an integer: size is unknown
            return None

    def get_size(self) -> int:
        """Return the stored size, or -1 should the attribute be missing.

        NOTE(review): `self.size` is None when no size was passed and
        `get_size_of()` could not determine one; None is then returned as-is.
        """
        return getattr(self, "size", -1)

    def gen_blob(self) -> Generator[libzim.writer.Blob]:  # pragma: no cover
        """Yield the response body as Blobs of up to 10KiB, then an empty Blob."""
        try:
            for chunk in self.resp.iter_content(10 * 1024):
                if chunk:
                    yield libzim.writer.Blob(chunk)
            yield libzim.writer.Blob(b"")
        finally:
            # release the connection whether the stream was fully consumed,
            # failed midway or the generator was discarded early
            self.resp.close()