3
3
from contextlib import suppress
4
4
from dataclasses import dataclass
5
5
from ipaddress import ip_address
6
+ from tempfile import NamedTemporaryFile
6
7
from time import time
7
- from typing import Awaitable , Callable , Dict , Optional , Type , TypeVar , Union
8
+ from typing import Awaitable , Callable , Dict , Optional , Tuple , Type , TypeVar , Union
8
9
9
10
from playwright .async_api import (
10
11
BrowserContext ,
11
12
BrowserType ,
13
+ Download ,
12
14
Error as PlaywrightError ,
13
15
Page ,
14
16
PlaywrightContextManager ,
@@ -319,7 +321,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
319
321
)
320
322
321
323
try :
322
- result = await self ._download_request_with_page (request , page , spider )
324
+ return await self ._download_request_with_page (request , page , spider )
323
325
except Exception as ex :
324
326
if not request .meta .get ("playwright_include_page" ) and not page .is_closed ():
325
327
logger .warning (
@@ -339,8 +341,6 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
339
341
await page .close ()
340
342
self .stats .inc_value ("playwright/page_count/closed" )
341
343
raise
342
- else :
343
- return result
344
344
345
345
async def _download_request_with_page (
346
346
self , request : Request , page : Page , spider : Spider
@@ -349,51 +349,61 @@ async def _download_request_with_page(
349
349
if request .meta .get ("playwright_include_page" ):
350
350
request .meta ["playwright_page" ] = page
351
351
352
- context_name = request .meta .setdefault ("playwright_context" , DEFAULT_CONTEXT_NAME )
353
-
354
352
start_time = time ()
355
- page_goto_kwargs = request .meta .get ("playwright_page_goto_kwargs" ) or {}
356
- page_goto_kwargs .pop ("url" , None )
357
- response = await page .goto (url = request .url , ** page_goto_kwargs )
358
- if response is None :
353
+ response , download = await self ._get_response_and_download (request = request , page = page )
354
+ if isinstance (response , PlaywrightResponse ):
355
+ await _set_redirect_meta (request = request , response = response )
356
+ headers = Headers (await response .all_headers ())
357
+ headers .pop ("Content-Encoding" , None )
358
+ else :
359
359
logger .warning (
360
360
"Navigating to %s returned None, the response"
361
361
" will have empty headers and status 200" ,
362
362
request ,
363
363
extra = {
364
364
"spider" : spider ,
365
- "context_name" : context_name ,
365
+ "context_name" : request . meta . get ( "playwright_context" ) ,
366
366
"scrapy_request_url" : request .url ,
367
367
"scrapy_request_method" : request .method ,
368
368
},
369
369
)
370
370
headers = Headers ()
371
- else :
372
- await _set_redirect_meta (request = request , response = response )
373
- headers = Headers (await response .all_headers ())
374
- headers .pop ("Content-Encoding" , None )
371
+
375
372
await self ._apply_page_methods (page , request , spider )
376
373
body_str = await _get_page_content (
377
374
page = page ,
378
375
spider = spider ,
379
- context_name = context_name ,
376
+ context_name = request . meta . get ( "playwright_context" ) ,
380
377
scrapy_request_url = request .url ,
381
378
scrapy_request_method = request .method ,
382
379
)
383
380
request .meta ["download_latency" ] = time () - start_time
384
381
385
382
server_ip_address = None
386
- with suppress (AttributeError , KeyError , TypeError , ValueError ):
387
- server_addr = await response .server_addr ()
388
- server_ip_address = ip_address (server_addr ["ipAddress" ])
389
-
390
- with suppress (AttributeError ):
383
+ if response is not None :
391
384
request .meta ["playwright_security_details" ] = await response .security_details ()
385
+ with suppress (KeyError , TypeError , ValueError ):
386
+ server_addr = await response .server_addr ()
387
+ server_ip_address = ip_address (server_addr ["ipAddress" ])
388
+
389
+ if download .get ("exception" ):
390
+ raise download ["exception" ]
392
391
393
392
if not request .meta .get ("playwright_include_page" ):
394
393
await page .close ()
395
394
self .stats .inc_value ("playwright/page_count/closed" )
396
395
396
+ if download :
397
+ request .meta ["playwright_suggested_filename" ] = download .get ("suggested_filename" )
398
+ respcls = responsetypes .from_args (url = download ["url" ], body = download ["bytes" ])
399
+ return respcls (
400
+ url = download ["url" ],
401
+ status = 200 ,
402
+ body = download ["bytes" ],
403
+ request = request ,
404
+ flags = ["playwright" ],
405
+ )
406
+
397
407
body , encoding = _encode_body (headers = headers , text = body_str )
398
408
respcls = responsetypes .from_args (headers = headers , url = page .url , body = body )
399
409
return respcls (
@@ -407,6 +417,48 @@ async def _download_request_with_page(
407
417
ip_address = server_ip_address ,
408
418
)
409
419
420
async def _get_response_and_download(
    self, request: Request, page: Page
) -> Tuple[Optional[PlaywrightResponse], dict]:
    """Navigate ``page`` to ``request.url`` and capture a download if one starts.

    A temporary ``download`` listener is attached to the page for the
    duration of the ``page.goto`` call and is always removed afterwards.

    Returns a ``(response, download)`` tuple:

    * ``response`` is whatever ``page.goto`` returned, or ``None`` when the
      navigation raised one of the errors treated below as "a download
      started instead of a navigation".
    * ``download`` is a dict filled in by the listener. It is empty when no
      download event fired. Otherwise it holds ``bytes``, ``url`` and
      ``suggested_filename`` on success, or ``exception`` when handling the
      download failed (the exception is stored, not raised, because it
      happens inside an event callback).

    Raises:
        PlaywrightError: if ``page.goto`` fails with an error that does not
            match the per-browser "download started" messages.
    """
    # Local imports keep this block self-contained; only stdlib is used.
    from pathlib import Path
    from tempfile import TemporaryDirectory

    response: Optional[PlaywrightResponse] = None
    download: dict = {}  # updated in-place in _handle_download
    download_ready = asyncio.Event()

    async def _handle_download(dwnld: Download) -> None:
        """Read the downloaded payload into ``download`` and signal completion."""
        self.stats.inc_value("playwright/download_count")
        try:
            if failure := await dwnld.failure():
                raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
            # Save into a private temporary directory instead of a
            # NamedTemporaryFile: a named temporary file that is still open
            # cannot be opened a second time by name on Windows, which is
            # what writing to its path from another process requires.
            # A fixed file name is used on purpose; the suggested file name
            # comes from the remote side and is only recorded, never used
            # as a filesystem path.
            with TemporaryDirectory() as temp_dir:
                target = Path(temp_dir) / "download"
                await dwnld.save_as(str(target))
                download["bytes"] = target.read_bytes()
            download["url"] = dwnld.url
            download["suggested_filename"] = dwnld.suggested_filename
        except Exception as ex:
            # Stored for the caller to inspect; raising here would be lost
            # inside the event callback.
            download["exception"] = ex
        finally:
            download_ready.set()

    # Build a filtered copy rather than popping "url" from the dict stored
    # in request.meta, so the caller's data is not mutated.
    page_goto_kwargs = {
        key: value
        for key, value in (request.meta.get("playwright_page_goto_kwargs") or {}).items()
        if key != "url"
    }

    page.on("download", _handle_download)
    try:
        response = await page.goto(url=request.url, **page_goto_kwargs)
    except PlaywrightError as err:
        # These messages are treated as "navigation was replaced by a
        # download"; anything else is a genuine navigation failure.
        download_started = (
            self.browser_type_name in ("firefox", "webkit")
            and "Download is starting" in err.message
        ) or (
            self.browser_type_name == "chromium"
            and "net::ERR_ABORTED" in err.message
        )
        if not download_started:
            raise
        # NOTE(review): this wait has no timeout. If the error message
        # matches but no "download" event is ever emitted, the coroutine
        # blocks here indefinitely -- consider bounding it.
        await download_ready.wait()
    finally:
        page.remove_listener("download", _handle_download)

    return response, download
461
+
410
462
async def _apply_page_methods (self , page : Page , request : Request , spider : Spider ) -> None :
411
463
context_name = request .meta .get ("playwright_context" )
412
464
page_methods = request .meta .get ("playwright_page_methods" ) or ()
0 commit comments