@@ -106,6 +106,9 @@ class Download:
106
106
exception : Optional [Exception ] = None
107
107
response_status : int = 200
108
108
109
+ def __bool__ (self ) -> bool :
110
+ return bool (self .body ) or bool (self .exception )
111
+
109
112
110
113
class ScrapyPlaywrightDownloadHandler (HTTPDownloadHandler ):
111
114
playwright_context_manager : Optional [PlaywrightContextManager ] = None
@@ -380,15 +383,24 @@ async def _download_request_with_page(
380
383
if request .meta .get ("playwright_include_page" ):
381
384
request .meta ["playwright_page" ] = page
382
385
386
+ # default response values
387
+ server_ip_address = None
388
+ headers = Headers ()
389
+
383
390
start_time = time ()
384
- response , download = await self ._get_response_and_download (
385
- request = request , page = page , spider = spider
386
- )
387
- if isinstance (response , PlaywrightResponse ):
391
+
392
+ response , download = await self ._get_response_and_download (request , page , spider )
393
+ if response :
388
394
await _set_redirect_meta (request = request , response = response )
389
395
headers = Headers (await response .all_headers ())
390
396
headers .pop ("Content-Encoding" , None )
391
- elif not download .url :
397
+ request .meta ["playwright_security_details" ] = await response .security_details ()
398
+ with suppress (KeyError , TypeError , ValueError ):
399
+ server_addr = await response .server_addr ()
400
+ server_ip_address = ip_address (server_addr ["ipAddress" ])
401
+ elif download :
402
+ request .meta ["playwright_suggested_filename" ] = download .suggested_filename
403
+ else :
392
404
logger .warning (
393
405
"Navigating to %s returned None, the response"
394
406
" will have empty headers and status 200" ,
@@ -400,7 +412,6 @@ async def _download_request_with_page(
400
412
"scrapy_request_method" : request .method ,
401
413
},
402
414
)
403
- headers = Headers ()
404
415
405
416
await self ._apply_page_methods (page , request , spider )
406
417
body_str = await _get_page_content (
@@ -412,22 +423,14 @@ async def _download_request_with_page(
412
423
)
413
424
request .meta ["download_latency" ] = time () - start_time
414
425
415
- server_ip_address = None
416
- if response is not None :
417
- request .meta ["playwright_security_details" ] = await response .security_details ()
418
- with suppress (KeyError , TypeError , ValueError ):
419
- server_addr = await response .server_addr ()
420
- server_ip_address = ip_address (server_addr ["ipAddress" ])
421
-
422
- if download .exception :
426
+ if download and download .exception :
423
427
raise download .exception
424
428
425
429
if not request .meta .get ("playwright_include_page" ):
426
430
await page .close ()
427
431
self .stats .inc_value ("playwright/page_count/closed" )
428
432
429
- if download .url :
430
- request .meta ["playwright_suggested_filename" ] = download .suggested_filename
433
+ if download and download .url :
431
434
respcls = responsetypes .from_args (url = download .url , body = download .body )
432
435
return respcls (
433
436
url = download .url ,
@@ -452,7 +455,7 @@ async def _download_request_with_page(
452
455
453
456
async def _get_response_and_download (
454
457
self , request : Request , page : Page , spider : Spider
455
- ) -> Tuple [Optional [PlaywrightResponse ], Download ]:
458
+ ) -> Tuple [Optional [PlaywrightResponse ], Optional [ Download ] ]:
456
459
response : Optional [PlaywrightResponse ] = None
457
460
download : Download = Download () # updated in-place in _handle_download
458
461
download_started = asyncio .Event ()
@@ -521,7 +524,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
521
524
page .remove_listener ("download" , _handle_download )
522
525
page .remove_listener ("response" , _handle_response )
523
526
524
- return response , download
527
+ return response , download if download else None
525
528
526
529
async def _apply_page_methods (self , page : Page , request : Request , spider : Spider ) -> None :
527
530
context_name = request .meta .get ("playwright_context" )
0 commit comments