-
-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathproviders.py
102 lines (78 loc) · 3.07 KB
/
providers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# vim: ai ts=4 sts=4 et sw=4 nu
""" libzim Providers accepting a `ref` arg to keep it away from garbage collection
Use case is to pass it the Item instance that created the Provider so that the
Item lives longer than the provider, thus allowing:
- to keep a single copy of the data if it is to be indexed
(and thus Provider instanced twice)
- to release whatever needs to be once we know data won't be fetched anymore """
import io
import pathlib
from typing import Generator, Optional, Union

import libzim.writer  # pyright: ignore
import requests
from zimscraperlib.download import _get_retry_adapter, stream_file
class FileProvider(libzim.writer.FileProvider):
    """libzim FileProvider that additionally holds on to a `ref` object.

    Storing `ref` on the provider keeps that object referenced for as long
    as the provider itself is referenced.
    """

    def __init__(
        self,
        filepath: pathlib.Path,
        size: Optional[int] = None,  # noqa: ARG002
        ref: Optional[object] = None,
    ):
        """Build a provider for `filepath`.

        `size` is accepted but not used: only `filepath` is forwarded to the
        parent class. `ref` is stored untouched on the instance.
        """
        super().__init__(filepath)
        # only stored, never read here: its sole purpose is to stay referenced
        self.ref = ref
class StringProvider(libzim.writer.StringProvider):
    """libzim StringProvider that additionally holds on to a `ref` object.

    Storing `ref` on the provider keeps that object referenced for as long
    as the provider itself is referenced.
    """

    def __init__(
        self,
        content: Union[str, bytes],
        ref: Optional[object] = None,
    ):
        """Build a provider for `content`, storing `ref` on the instance."""
        super().__init__(content)
        # only stored, never read here: its sole purpose is to stay referenced
        self.ref = ref
class FileLikeProvider(libzim.writer.ContentProvider):
    """Provider referencing a file-like object.

    Use this to keep a single copy of a content in memory.
    Useful for indexed content.
    """

    def __init__(
        self,
        fileobj: io.IOBase,
        size: Optional[int] = None,
        ref: Optional[object] = None,
    ):
        """Wrap `fileobj` so its content can be handed out as a single blob.

        Parameters:
            fileobj: the file-like object holding the content.
            size: content size; when None it is measured by seeking to the
                end of `fileobj`, which is then rewound to its start.
            ref: arbitrary object stored on the instance so that it stays
                referenced as long as this provider is.
        """
        super().__init__()
        self.ref = ref
        self.fileobj = fileobj

        if size is None:
            # seek() returns the new absolute position, i.e. the stream length
            # NOTE(review): for text streams this is not a byte count — confirm
            # callers only rely on auto-sizing with binary streams.
            size = self.fileobj.seek(0, io.SEEK_END)
            self.fileobj.seek(0, io.SEEK_SET)
        self.size = size

    def get_size(self) -> int:
        """Return the size given to, or measured by, the constructor."""
        return self.size  # pyright: ignore

    def gen_blob(self) -> Generator[libzim.writer.Blob, None, None]:
        """Yield the whole content of the file-like object as one Blob.

        In-memory streams exposing `getvalue()` (io.BytesIO, io.StringIO) are
        read through it, which ignores the current stream position. Any other
        stream is rewound when seekable, then read to its end.
        """
        getvalue = getattr(self.fileobj, "getvalue", None)
        if callable(getvalue):
            content = getvalue()
        else:
            # io.IOBase itself has no getvalue(): fall back to a full read
            if self.fileobj.seekable():
                self.fileobj.seek(0, io.SEEK_SET)
            content = self.fileobj.read()  # pyright: ignore
        yield libzim.writer.Blob(content)  # pragma: no cover
class URLProvider(libzim.writer.ContentProvider):
    """Provider downloading content as it is consumed by the libzim.

    Useful for non-indexed content for which feed() is called only once.
    """

    def __init__(
        self, url: str, size: Optional[int] = None, ref: Optional[object] = None
    ):
        """Open a streaming GET request on `url`.

        Parameters:
            url: address of the content to download.
            size: content size; when None it is looked up via get_size_of().
            ref: arbitrary object stored on the instance so that it stays
                referenced as long as this provider is.

        Raises:
            requests.HTTPError: when the response status is an error
                (from raise_for_status()); the session and response are
                closed before the exception propagates.
        """
        super().__init__()
        self.url = url
        self.size = size if size is not None else self.get_size_of(url)
        self.ref = ref

        self.resp = None
        self._session = requests.Session()
        # requests matches adapters by URL prefix, so "http" also covers https
        self._session.mount("http", _get_retry_adapter())
        try:
            self.resp = self._session.get(url, stream=True)
            self.resp.raise_for_status()
        except Exception:
            # don't leak the connection when the request fails
            self._close()
            raise

    def _close(self) -> None:
        """Release the streamed response (if any) and the session."""
        if self.resp is not None:
            self.resp.close()
        self._session.close()

    @staticmethod
    def get_size_of(url: str) -> Optional[int]:
        """Return the Content-Length advertised for `url`, or None.

        None is returned when the header is missing or is not an integer.
        The headers come from stream_file(); presumably only the first block
        of the body is fetched, given `only_first_block=True`.
        """
        _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
        try:
            return int(headers["Content-Length"])
        except (KeyError, TypeError, ValueError):
            return None

    def get_size(self) -> int:
        """Return the size given to the constructor or found by get_size_of().

        NOTE(review): this is None when no size was given and none could be
        read from the headers — confirm how callers handle that.
        """
        return self.size  # pyright: ignore

    def gen_blob(self) -> Generator[libzim.writer.Blob, None, None]:  # pragma: no cover
        """Yield the response body in chunks of up to 10KiB, then an empty Blob.

        The response and session are closed once the body has been fully
        read, or when the generator is closed before reaching that point.
        """
        try:
            for chunk in self.resp.iter_content(10 * 1024):
                if chunk:
                    yield libzim.writer.Blob(chunk)
        finally:
            self._close()
        # trailing empty blob — presumably marks the end of content for libzim
        yield libzim.writer.Blob(b"")