Skip to content

Commit 0140b90

Browse files
authored
Handle downloads as binary responses (scrapy-plugins#228)
* Refactor _download_request_with_page * Handle downloads transparently * Add test * Enable file downloads in other browsers, simplify example * Better exception handling for downloads * Add download timeout * pylint adjustments * Timeout adjustments * Install asyncio reactor on pytest_sessionstart hook * Refactor mockserver's request handler * Simplify download timeout, add tests * Fix arg type for send_header * Test exception in _handle_download * Test download failure
1 parent a1618db commit 0140b90

File tree

9 files changed

+249
-40
lines changed

9 files changed

+249
-40
lines changed

examples/.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*.png
2+
*.pdf

examples/download.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
"""Example spider demonstrating transparent handling of file downloads.

Navigating to a regular HTML page yields a normal response; navigating to a
URL that triggers a download (e.g. a PDF) yields a binary response whose
suggested filename the handler exposes via
``response.meta["playwright_suggested_filename"]``.
"""
from pathlib import Path

from scrapy import Spider, Request


class DownloadSpider(Spider):
    """Crawl one HTML page and one direct PDF download through Playwright."""

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        yield Request(url="https://example.org", meta={"playwright": True})
        yield Request(
            url="https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
            meta={"playwright": True},
        )

    def parse(self, response):
        # "playwright_suggested_filename" is only set for download responses;
        # for regular pages ``filename`` is None and nothing is written.
        if filename := response.meta.get("playwright_suggested_filename"):
            (Path(__file__).parent / filename).write_bytes(response.body)
        yield {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": filename,
        }

pylintrc

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ disable=
1313
duplicate-code,
1414
import-outside-toplevel,
1515
protected-access,
16+
too-many-public-methods,
1617
unnecessary-dunder-call,
1718

1819

scrapy_playwright/handler.py

+73-21
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
from contextlib import suppress
44
from dataclasses import dataclass
55
from ipaddress import ip_address
6+
from tempfile import NamedTemporaryFile
67
from time import time
7-
from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union
8+
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
89

910
from playwright.async_api import (
1011
BrowserContext,
1112
BrowserType,
13+
Download,
1214
Error as PlaywrightError,
1315
Page,
1416
PlaywrightContextManager,
@@ -319,7 +321,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
319321
)
320322

321323
try:
322-
result = await self._download_request_with_page(request, page, spider)
324+
return await self._download_request_with_page(request, page, spider)
323325
except Exception as ex:
324326
if not request.meta.get("playwright_include_page") and not page.is_closed():
325327
logger.warning(
@@ -339,8 +341,6 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
339341
await page.close()
340342
self.stats.inc_value("playwright/page_count/closed")
341343
raise
342-
else:
343-
return result
344344

345345
async def _download_request_with_page(
346346
self, request: Request, page: Page, spider: Spider
@@ -349,51 +349,61 @@ async def _download_request_with_page(
349349
if request.meta.get("playwright_include_page"):
350350
request.meta["playwright_page"] = page
351351

352-
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
353-
354352
start_time = time()
355-
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
356-
page_goto_kwargs.pop("url", None)
357-
response = await page.goto(url=request.url, **page_goto_kwargs)
358-
if response is None:
353+
response, download = await self._get_response_and_download(request=request, page=page)
354+
if isinstance(response, PlaywrightResponse):
355+
await _set_redirect_meta(request=request, response=response)
356+
headers = Headers(await response.all_headers())
357+
headers.pop("Content-Encoding", None)
358+
else:
359359
logger.warning(
360360
"Navigating to %s returned None, the response"
361361
" will have empty headers and status 200",
362362
request,
363363
extra={
364364
"spider": spider,
365-
"context_name": context_name,
365+
"context_name": request.meta.get("playwright_context"),
366366
"scrapy_request_url": request.url,
367367
"scrapy_request_method": request.method,
368368
},
369369
)
370370
headers = Headers()
371-
else:
372-
await _set_redirect_meta(request=request, response=response)
373-
headers = Headers(await response.all_headers())
374-
headers.pop("Content-Encoding", None)
371+
375372
await self._apply_page_methods(page, request, spider)
376373
body_str = await _get_page_content(
377374
page=page,
378375
spider=spider,
379-
context_name=context_name,
376+
context_name=request.meta.get("playwright_context"),
380377
scrapy_request_url=request.url,
381378
scrapy_request_method=request.method,
382379
)
383380
request.meta["download_latency"] = time() - start_time
384381

385382
server_ip_address = None
386-
with suppress(AttributeError, KeyError, TypeError, ValueError):
387-
server_addr = await response.server_addr()
388-
server_ip_address = ip_address(server_addr["ipAddress"])
389-
390-
with suppress(AttributeError):
383+
if response is not None:
391384
request.meta["playwright_security_details"] = await response.security_details()
385+
with suppress(KeyError, TypeError, ValueError):
386+
server_addr = await response.server_addr()
387+
server_ip_address = ip_address(server_addr["ipAddress"])
388+
389+
if download.get("exception"):
390+
raise download["exception"]
392391

393392
if not request.meta.get("playwright_include_page"):
394393
await page.close()
395394
self.stats.inc_value("playwright/page_count/closed")
396395

396+
if download:
397+
request.meta["playwright_suggested_filename"] = download.get("suggested_filename")
398+
respcls = responsetypes.from_args(url=download["url"], body=download["bytes"])
399+
return respcls(
400+
url=download["url"],
401+
status=200,
402+
body=download["bytes"],
403+
request=request,
404+
flags=["playwright"],
405+
)
406+
397407
body, encoding = _encode_body(headers=headers, text=body_str)
398408
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
399409
return respcls(
@@ -407,6 +417,48 @@ async def _download_request_with_page(
407417
ip_address=server_ip_address,
408418
)
409419

420+
async def _get_response_and_download(
    self, request: Request, page: Page
) -> Tuple[Optional[PlaywrightResponse], dict]:
    """Navigate to ``request.url``, capturing a download if one is triggered.

    Returns a ``(response, download)`` tuple. ``response`` is the Playwright
    navigation response, or ``None`` when navigation was aborted because it
    turned into a download. ``download`` is empty for regular navigations;
    otherwise it holds the downloaded "bytes", the download "url" and the
    "suggested_filename", or an "exception" key if retrieval failed.
    """
    response: Optional[PlaywrightResponse] = None
    download: dict = {}  # updated in-place in _handle_download
    download_ready = asyncio.Event()

    async def _handle_download(dwnld: Download) -> None:
        self.stats.inc_value("playwright/download_count")
        try:
            if failure := await dwnld.failure():
                raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
            # NOTE(review): this relies on Download.save_as writing through
            # the already-open temp file path; verify on platforms where the
            # target may be replaced (e.g. Windows) rather than overwritten.
            with NamedTemporaryFile() as temp_file:
                await dwnld.save_as(temp_file.name)
                temp_file.seek(0)
                download["bytes"] = temp_file.read()
            download["url"] = dwnld.url
            download["suggested_filename"] = dwnld.suggested_filename
        except Exception as ex:  # pylint: disable=broad-except
            # Store instead of raising: this runs as a page event callback,
            # so the caller inspects download["exception"] and re-raises.
            download["exception"] = ex
        finally:
            download_ready.set()

    # Copy before popping "url" so the dict the user stored in request.meta
    # is not mutated as a side effect of this navigation.
    page_goto_kwargs = dict(request.meta.get("playwright_page_goto_kwargs") or {})
    page_goto_kwargs.pop("url", None)
    page.on("download", _handle_download)
    try:
        response = await page.goto(url=request.url, **page_goto_kwargs)
    except PlaywrightError as err:
        # Expected errors when navigation becomes a download: Firefox and
        # WebKit raise "Download is starting"; Chromium aborts the request.
        if not (
            self.browser_type_name in ("firefox", "webkit")
            and "Download is starting" in err.message
            or self.browser_type_name == "chromium"
            and "net::ERR_ABORTED" in err.message
        ):
            raise
        await download_ready.wait()
    finally:
        page.remove_listener("download", _handle_download)

    return response, download
461+
410462
async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
411463
context_name = request.meta.get("playwright_context")
412464
page_methods = request.meta.get("playwright_page_methods") or ()

tests/conftest.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """Install the asyncio Twisted reactor before test collection starts.

    Called after the Session object has been created and before performing
    collection and entering the run test loop. If a reactor was already
    installed it must be the AsyncioSelectorReactor; otherwise the session
    is aborted, since scrapy-playwright requires the asyncio reactor.
    """
    from twisted.internet.asyncioreactor import install, AsyncioSelectorReactor
    from twisted.internet.error import ReactorAlreadyInstalledError

    try:
        install()
    except ReactorAlreadyInstalledError as exc:
        from twisted.internet import reactor

        if not isinstance(reactor, AsyncioSelectorReactor):
            raise RuntimeError(f"Wrong reactor installed: {type(reactor)}") from exc

tests/mockserver.py

+29-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from subprocess import Popen, PIPE
88
from threading import Thread
99
from typing import Optional
10-
from urllib.parse import urljoin
10+
from urllib.parse import urljoin, urlparse, parse_qs
1111

1212

1313
class StaticMockServer:
@@ -42,39 +42,51 @@ def urljoin(self, url):
4242
class _RequestHandler(BaseHTTPRequestHandler):
4343
def do_POST(self) -> None:
    """Echo back the request body, prefixed with ``b"Request body: "``.

    Sends an explicit Content-Length so clients need not rely on
    connection close to detect the end of the response.
    """
    # Treat a missing Content-Length header as an empty body.
    content_length = int(self.headers.get("Content-Length") or 0)
    body_bytes = b"Request body: " + self.rfile.read(content_length)
    self.send_response(200)
    self.send_header("Content-Length", str(len(body_bytes)))
    self.end_headers()
    self.wfile.write(body_bytes)
5151

5252
def do_GET(self) -> None:
    """Serve the mock endpoints used by the test suite.

    Paths: /headers echoes the request headers as JSON; /redirect2 302s to
    /redirect, which 301s to /headers; /mancha.pdf serves a PDF attachment
    with an optionally inflated Content-Length; anything else gets a 404
    JSON error. An optional ``delay`` query parameter sleeps before replying.
    """
    parsed_path = urlparse(self.path)
    query_string = {key: values[0] for key, values in parse_qs(parsed_path.query).items()}

    if delay := int(query_string.get("delay") or 0):
        print(f"Sleeping {delay} seconds on path {parsed_path.path}...")
        time.sleep(delay)

    if parsed_path.path == "/headers":
        self._send_json(dict(self.headers))
    elif parsed_path.path == "/redirect2":
        self.send_response(302)
        self.send_header("Content-Length", "0")
        self.send_header("Location", "/redirect")
        self.end_headers()
    elif parsed_path.path == "/redirect":
        self.send_response(301)
        self.send_header("Content-Length", "0")
        self.send_header("Location", "/headers")
        self.end_headers()
    elif parsed_path.path == "/mancha.pdf":
        body_bytes = (Path(__file__).absolute().parent / "site/files/mancha.pdf").read_bytes()
        # content_length_multiplier lets tests advertise a bogus (larger)
        # Content-Length to exercise download-timeout handling.
        content_length_multiplier = int(query_string.get("content_length_multiplier") or 1)
        self.send_response(200)
        self.send_header("Content-Type", "application/pdf")
        self.send_header("Content-Disposition", 'attachment; filename="mancha.pdf"')
        self.send_header("Content-Length", str(len(body_bytes) * content_length_multiplier))
        self.end_headers()
        self.wfile.write(body_bytes)
    else:
        self._send_json({"error": "unknown path"}, status=404)
7283

7384
def _send_json(self, body: dict, status: int = 200) -> None:
    """Serialize ``body`` as indented JSON and send it with ``status``.

    Content-Length is computed from the encoded bytes before the headers
    are flushed, so clients get a well-framed response.
    """
    body_bytes = json.dumps(body, indent=2).encode("utf8")
    self.send_response(status)
    self.send_header("Content-Length", str(len(body_bytes)))
    self.send_header("Content-Type", "application/json")
    self.end_headers()
    self.wfile.write(body_bytes)
7991

8092

tests/site/files/mancha.pdf

7.63 KB
Binary file not shown.

0 commit comments

Comments
 (0)