Skip to content

Commit 0cfd96b

Browse files
authored
Merge pull request #208 from openzim/mindtouch_changes
More changes for mindtouch scraper
2 parents 1c2c37e + 4ac7665 commit 0cfd96b

File tree

8 files changed

+514
-213
lines changed

8 files changed

+514
-213
lines changed

src/zimscraperlib/rewriting/css.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __simple_transform(
5151
[
5252
"url(",
5353
m_object["quote"],
54-
url_rewriter(m_object["url"], base_href),
54+
url_rewriter(m_object["url"], base_href).rewriten_url,
5555
m_object["quote"],
5656
")",
5757
]
@@ -190,7 +190,7 @@ def _process_node(self, node: ast.Node):
190190
new_url = self.url_rewriter(
191191
url_node.value, # pyright: ignore
192192
self.base_href,
193-
)
193+
).rewriten_url
194194
url_node.value = str(new_url) # pyright: ignore
195195
url_node.representation = ( # pyright: ignore
196196
f'"{serialize_url(str(new_url))}"'
@@ -206,7 +206,9 @@ def _process_node(self, node: ast.Node):
206206
elif isinstance(node, ast.Declaration):
207207
self._process_list(node.value) # pyright: ignore
208208
elif isinstance(node, ast.URLToken):
209-
new_url = self.url_rewriter(node.value, self.base_href) # pyright: ignore
209+
new_url = self.url_rewriter(
210+
node.value, self.base_href
211+
).rewriten_url # pyright: ignore
210212
node.value = new_url
211213
node.representation = f"url({serialize_url(new_url)})"
212214

src/zimscraperlib/rewriting/html.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ class HtmlRewriter(HTMLParser):
132132
def __init__(
133133
self,
134134
url_rewriter: ArticleUrlRewriter,
135-
pre_head_insert: str,
135+
pre_head_insert: str | None,
136136
post_head_insert: str | None,
137-
notify_js_module: Callable[[ZimPath], None],
137+
notify_js_module: Callable[[ZimPath], None] | None,
138138
):
139139
super().__init__(convert_charrefs=False)
140140
self.url_rewriter = url_rewriter
@@ -430,7 +430,7 @@ def do_attribute_rewrite(
430430
css_rewriter: CssRewriter,
431431
url_rewriter: ArticleUrlRewriter,
432432
base_href: str | None,
433-
notify_js_module: Callable[[ZimPath], None],
433+
notify_js_module: Callable[[ZimPath], None] | None,
434434
) -> AttrNameAndValue:
435435
"""Utility function to process all attribute rewriting rules
436436
@@ -587,7 +587,7 @@ def rewrite_href_src_attributes(
587587
attrs: AttrsList,
588588
url_rewriter: ArticleUrlRewriter,
589589
base_href: str | None,
590-
notify_js_module: Callable[[ZimPath], None],
590+
notify_js_module: Callable[[ZimPath], None] | None,
591591
):
592592
"""Rewrite href and src attributes
593593
@@ -596,11 +596,16 @@ def rewrite_href_src_attributes(
596596
"""
597597
if attr_name not in ("href", "src") or not attr_value:
598598
return
599-
if get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module":
599+
if (
600+
notify_js_module
601+
and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
602+
):
600603
notify_js_module(url_rewriter.get_item_path(attr_value, base_href=base_href))
601604
return (
602605
attr_name,
603-
url_rewriter(attr_value, base_href=base_href, rewrite_all_url=tag != "a"),
606+
url_rewriter(
607+
attr_value, base_href=base_href, rewrite_all_url=tag != "a"
608+
).rewriten_url,
604609
)
605610

606611

@@ -615,10 +620,10 @@ def rewrite_srcset_attribute(
615620
if attr_name != "srcset" or not attr_value:
616621
return
617622
value_list = attr_value.split(",")
618-
new_value_list = []
623+
new_value_list: list[str] = []
619624
for value in value_list:
620625
url, *other = value.strip().split(" ", maxsplit=1)
621-
new_url = url_rewriter(url, base_href=base_href)
626+
new_url = url_rewriter(url, base_href=base_href).rewriten_url
622627
new_value = " ".join([new_url, *other])
623628
new_value_list.append(new_value)
624629
return (attr_name, ", ".join(new_value_list))
@@ -708,5 +713,6 @@ def rewrite_meta_http_equiv_redirect(
708713
return
709714
return (
710715
attr_name,
711-
f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
716+
f"{match['interval']};"
717+
f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
712718
)

src/zimscraperlib/rewriting/js.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def __init__(
206206
self,
207207
url_rewriter: ArticleUrlRewriter,
208208
base_href: str | None,
209-
notify_js_module: Callable[[ZimPath], None],
209+
notify_js_module: Callable[[ZimPath], None] | None,
210210
):
211211
super().__init__(None)
212212
self.first_buff = self._init_local_declaration(GLOBAL_OVERRIDES)
@@ -286,7 +286,7 @@ def get_rewriten_import_url(url: str) -> str:
286286
This takes into account that the result must be a relative URL, i.e. it
287287
cannot be 'vendor.module.js' but must be './vendor.module.js'.
288288
"""
289-
url = self.url_rewriter(url, base_href=self.base_href)
289+
url = self.url_rewriter(url, base_href=self.base_href).rewriten_url
290290
if not (
291291
url.startswith("/") or url.startswith("./") or url.startswith("../")
292292
):
@@ -298,11 +298,12 @@ def func(
298298
m_object: re.Match[str], _opts: dict[str, Any] | None = None
299299
) -> str:
300300
def sub_funct(match: re.Match[str]) -> str:
301-
self.notify_js_module(
302-
self.url_rewriter.get_item_path(
303-
match.group(2), base_href=self.base_href
301+
if self.notify_js_module:
302+
self.notify_js_module(
303+
self.url_rewriter.get_item_path(
304+
match.group(2), base_href=self.base_href
305+
)
304306
)
305-
)
306307
return (
307308
f"{match.group(1)}{get_rewriten_import_url(match.group(2))}"
308309
f"{match.group(3)}"

src/zimscraperlib/rewriting/url_rewriting.py

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __str__(self) -> str:
8282
return f"HttpUrl({self.value})"
8383

8484
def __repr__(self) -> str:
85-
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
85+
return f"HttpUrl({self.value})" # pragma: no cover
8686

8787
@property
8888
def value(self) -> str:
@@ -124,7 +124,7 @@ def __str__(self) -> str:
124124
return f"ZimPath({self.value})"
125125

126126
def __repr__(self) -> str:
127-
return f"{self.__str__} - {super().__repr__()}" # pragma: no cover
127+
return f"ZimPath({self.value})" # pragma: no cover
128128

129129
@property
130130
def value(self) -> str:
@@ -147,6 +147,12 @@ def check_validity(cls, value: str) -> None:
147147
raise ValueError(f"Unexpected password in value: {value} {parts.password}")
148148

149149

150+
class RewriteResult(NamedTuple):
151+
absolute_url: str
152+
rewriten_url: str
153+
zim_path: ZimPath | None
154+
155+
150156
class ArticleUrlRewriter:
151157
"""
152158
Rewrite urls in article.
@@ -176,16 +182,11 @@ def __init__(
176182
missing_zim_paths: list of ZIM paths which are known to already be missing
177183
from the existing_zim_paths ; useful only in complement with this variable ;
178184
new missing entries will be added as URLs are normalized in this function
179-
180-
Results:
181-
items_to_download: populated with the list of rewritten URLs, so that one
182-
might use it to download items after rewriting the document
183185
"""
184186
self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
185187
self.article_url = article_url
186188
self.existing_zim_paths = existing_zim_paths
187189
self.missing_zim_paths = missing_zim_paths
188-
self.items_to_download: dict[ZimPath, HttpUrl] = {}
189190

190191
def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
191192
"""Utility to transform an item URL into a ZimPath"""
@@ -201,7 +202,7 @@ def __call__(
201202
base_href: str | None,
202203
*,
203204
rewrite_all_url: bool = True,
204-
) -> str:
205+
) -> RewriteResult:
205206
"""Rewrite a url contained in a article.
206207
207208
The url is "fully" rewritten to point to a normalized entry path
@@ -210,17 +211,25 @@ def __call__(
210211
try:
211212
item_url = item_url.strip()
212213

214+
item_absolute_url = urljoin(
215+
urljoin(self.article_url.value, base_href), item_url
216+
)
217+
213218
# Make case of standalone fragments more straightforward
214219
if item_url.startswith("#"):
215-
return item_url
220+
return RewriteResult(
221+
absolute_url=item_absolute_url,
222+
rewriten_url=item_url,
223+
zim_path=None,
224+
)
216225

217226
item_scheme = urlsplit(item_url).scheme
218227
if item_scheme and item_scheme not in ("http", "https"):
219-
return item_url
220-
221-
item_absolute_url = urljoin(
222-
urljoin(self.article_url.value, base_href), item_url
223-
)
228+
return RewriteResult(
229+
absolute_url=item_absolute_url,
230+
rewriten_url=item_url,
231+
zim_path=None,
232+
)
224233

225234
item_fragment = urlsplit(item_absolute_url).fragment
226235

@@ -229,9 +238,11 @@ def __call__(
229238
if rewrite_all_url or (
230239
self.existing_zim_paths and item_path in self.existing_zim_paths
231240
):
232-
if item_path not in self.items_to_download:
233-
self.items_to_download[item_path] = HttpUrl(item_absolute_url)
234-
return self.get_document_uri(item_path, item_fragment)
241+
return RewriteResult(
242+
absolute_url=item_absolute_url,
243+
rewriten_url=self.get_document_uri(item_path, item_fragment),
244+
zim_path=item_path,
245+
)
235246
else:
236247
if (
237248
self.missing_zim_paths is not None
@@ -242,7 +253,11 @@ def __call__(
242253
# with duplicate messages
243254
self.missing_zim_paths.add(item_path)
244255
# The url doesn't point to a known entry
245-
return item_absolute_url
256+
return RewriteResult(
257+
absolute_url=item_absolute_url,
258+
rewriten_url=item_absolute_url,
259+
zim_path=item_path,
260+
)
246261

247262
except Exception as exc: # pragma: no cover
248263
item_scheme = (
@@ -275,7 +290,11 @@ def __call__(
275290
f"rewrite_all_url: {rewrite_all_url}",
276291
exc_info=exc,
277292
)
278-
return item_url
293+
return RewriteResult(
294+
absolute_url=item_absolute_url,
295+
rewriten_url=item_url,
296+
zim_path=None,
297+
)
279298

280299
def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
281300
"""Given an ZIM item path and its fragment, get the URI to use in document

tests/rewriting/conftest.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,11 @@
77
from zimscraperlib.rewriting.url_rewriting import (
88
ArticleUrlRewriter,
99
HttpUrl,
10+
RewriteResult,
1011
ZimPath,
1112
)
1213

1314

14-
@pytest.fixture(scope="module")
15-
def no_js_notify():
16-
"""Fixture to not care about notification of detection of a JS file"""
17-
18-
def no_js_notify_handler(_: str):
19-
pass
20-
21-
yield no_js_notify_handler
22-
23-
2415
class SimpleUrlRewriter(ArticleUrlRewriter):
2516
"""Basic URL rewriter mocking most calls"""
2617

@@ -34,8 +25,12 @@ def __call__(
3425
base_href: str | None, # noqa: ARG002
3526
*,
3627
rewrite_all_url: bool = True, # noqa: ARG002
37-
) -> str:
38-
return item_url + self.suffix
28+
) -> RewriteResult:
29+
return RewriteResult(
30+
absolute_url=item_url + self.suffix,
31+
rewriten_url=item_url + self.suffix,
32+
zim_path=None,
33+
)
3934

4035
def get_item_path(
4136
self, item_url: str, base_href: str | None # noqa: ARG002

0 commit comments

Comments
 (0)