Skip to content

Commit 164ff9a

Browse files
authored
Improve concurrency on Windows (#286)
* Get future result in a callback
* POC with asyncio queue
* Clean up
* Update windows test wrapper
* WindowsProactorEventLoopPolicy on Windows
* Test setting future exception
* Update docstring
* Different errors depending on the platform
1 parent 5926a20 commit 164ff9a

File tree

4 files changed

+85
-32
lines changed

4 files changed

+85
-32
lines changed

scrapy_playwright/_utils.py

+53-22
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import asyncio
2-
import concurrent
32
import logging
43
import platform
54
import threading
@@ -98,34 +97,66 @@ async def _get_header_value(
9897

9998
if platform.system() == "Windows":

    class _ThreadedLoopAdapter:
        """Utility class to start an asyncio event loop in a new thread and redirect coroutines.
        This allows to run Playwright in a different loop than the Scrapy crawler, allowing to
        use ProactorEventLoop which is supported by Playwright on Windows.
        """

        # Event loop and thread are created lazily by start(); the queue and stop
        # event are shared class-level state used to hand coroutines across threads.
        _loop: asyncio.AbstractEventLoop
        _thread: threading.Thread
        _coro_queue: asyncio.Queue = asyncio.Queue()
        _stop_event: asyncio.Event = asyncio.Event()

        @classmethod
        async def _handle_coro(cls, coro, future) -> None:
            # Run the coroutine on the dedicated loop and mirror its outcome
            # (result or exception) onto the future the caller is awaiting.
            try:
                future.set_result(await coro)
            except Exception as exc:
                future.set_exception(exc)

        @classmethod
        async def _process_queue(cls) -> None:
            # Consume (coroutine, future) pairs until stop() is requested,
            # scheduling each coroutine as an independent task.
            while not cls._stop_event.is_set():
                coro, future = await cls._coro_queue.get()
                asyncio.create_task(cls._handle_coro(coro, future))
                cls._coro_queue.task_done()

        @classmethod
        def _deferred_from_coro(cls, coro) -> Deferred:
            # Enqueue the coroutine for execution on the threaded loop and wrap
            # the future tracking it into a Twisted Deferred for Scrapy.
            future: asyncio.Future = asyncio.Future()
            asyncio.run_coroutine_threadsafe(cls._coro_queue.put((coro, future)), cls._loop)
            return scrapy.utils.defer.deferred_from_coro(future)

        @classmethod
        def start(cls) -> None:
            # ProactorEventLoop is required by Playwright on Windows.
            policy = asyncio.WindowsProactorEventLoopPolicy()  # type: ignore[attr-defined]
            cls._loop = policy.new_event_loop()
            asyncio.set_event_loop(cls._loop)

            cls._thread = threading.Thread(target=cls._loop.run_forever, daemon=True)
            cls._thread.start()
            logger.info("Started loop on separate thread: %s", cls._loop)

            asyncio.run_coroutine_threadsafe(cls._process_queue(), cls._loop)

        @classmethod
        def stop(cls) -> None:
            # Signal the consumer, drain pending work, then stop the loop and
            # wait for its thread to finish.
            cls._stop_event.set()
            asyncio.run_coroutine_threadsafe(cls._coro_queue.join(), cls._loop)
            cls._loop.call_soon_threadsafe(cls._loop.stop)
            cls._thread.join()

    _deferred_from_coro = _ThreadedLoopAdapter._deferred_from_coro
else:

    class _ThreadedLoopAdapter:  # type: ignore[no-redef]
        """No-op stand-in on non-Windows platforms, where no separate loop is needed."""

        @classmethod
        def start(cls) -> None:
            pass

        @classmethod
        def stop(cls) -> None:
            pass

    _deferred_from_coro = scrapy.utils.defer.deferred_from_coro

scrapy_playwright/handler.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,13 @@
3333
from scrapy_playwright.headers import use_scrapy_headers
3434
from scrapy_playwright.page import PageMethod
3535
from scrapy_playwright._utils import (
36+
_ThreadedLoopAdapter,
37+
_deferred_from_coro,
3638
_encode_body,
3739
_get_header_value,
3840
_get_page_content,
3941
_is_safe_close_error,
4042
_maybe_await,
41-
_deferred_from_coro,
4243
)
4344

4445

@@ -102,6 +103,7 @@ class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
102103

103104
def __init__(self, crawler: Crawler) -> None:
104105
super().__init__(settings=crawler.settings, crawler=crawler)
106+
_ThreadedLoopAdapter.start()
105107
if platform.system() != "Windows":
106108
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
107109
crawler.signals.connect(self._engine_started, signals.engine_started)
@@ -293,6 +295,7 @@ def close(self) -> Deferred:
293295
logger.info("Closing download handler")
294296
yield super().close()
295297
yield _deferred_from_coro(self._close())
298+
_ThreadedLoopAdapter.stop()
296299

297300
async def _close(self) -> None:
298301
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])

tests/__init__.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import inspect
23
import logging
34
import platform
@@ -13,17 +14,19 @@
1314

1415

1516
if platform.system() == "Windows":
    from scrapy_playwright._utils import _ThreadedLoopAdapter

    def allow_windows(test_method):
        """Wrap tests with the _ThreadedLoopAdapter class on Windows."""
        if not inspect.iscoroutinefunction(test_method):
            raise RuntimeError(f"{test_method} must be an async def method")

        @wraps(test_method)
        async def wrapped(self, *args, **kwargs):
            # Start the threaded Proactor loop, run the test coroutine on it,
            # block until it finishes, then tear the loop down again.
            _ThreadedLoopAdapter.start()
            coro = test_method(self, *args, **kwargs)
            asyncio.run_coroutine_threadsafe(coro=coro, loop=_ThreadedLoopAdapter._loop).result()
            _ThreadedLoopAdapter.stop()

        return wrapped
2932

tests/tests_twisted/test_mixed_requests.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@ class MixedRequestsTestCase(TestCase):
1515
'_download_request', which is a coroutine ('download_request' returns a Deferred).
1616
"""
1717

18+
timeout_ms = 500
19+
1820
@defer.inlineCallbacks
1921
def setUp(self):
2022
self.server = StaticMockServer()
2123
self.server.__enter__()
22-
self.handler = ScrapyPlaywrightDownloadHandler.from_crawler(get_crawler())
24+
self.handler = ScrapyPlaywrightDownloadHandler.from_crawler(
25+
get_crawler(settings_dict={"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": self.timeout_ms})
26+
)
2327
yield self.handler._engine_started()
2428

2529
@defer.inlineCallbacks
@@ -29,26 +33,38 @@ def tearDown(self):
2933

3034
@defer.inlineCallbacks
3135
def test_download_request(self):
32-
def _test_regular(response, request):
36+
def _check_regular(response, request):
3337
self.assertIsInstance(response, Response)
3438
self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
3539
self.assertEqual(response.url, request.url)
3640
self.assertEqual(response.status, 200)
3741
self.assertNotIn("playwright", response.flags)
3842

39-
def _test_playwright(response, request):
43+
def _check_playwright_ok(response, request):
4044
self.assertIsInstance(response, Response)
4145
self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
4246
self.assertEqual(response.url, request.url)
4347
self.assertEqual(response.status, 200)
4448
self.assertIn("playwright", response.flags)
4549

50+
def _check_playwright_error(failure, url):
51+
# different errors depending on the platform
52+
self.assertTrue(
53+
f"Page.goto: net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value)
54+
or f"Page.goto: Timeout {self.timeout_ms}ms exceeded" in str(failure.value)
55+
)
56+
4657
req1 = Request(self.server.urljoin("/index.html"))
4758
yield self.handler.download_request(req1, Spider("foo")).addCallback(
48-
_test_regular, request=req1
59+
_check_regular, request=req1
4960
)
5061

5162
req2 = Request(self.server.urljoin("/index.html"), meta={"playwright": True})
5263
yield self.handler.download_request(req2, Spider("foo")).addCallback(
53-
_test_playwright, request=req2
64+
_check_playwright_ok, request=req2
65+
)
66+
67+
req3 = Request("http://localhost:12345/asdf", meta={"playwright": True})
68+
yield self.handler.download_request(req3, Spider("foo")).addErrback(
69+
_check_playwright_error, url=req3.url
5470
)

0 commit comments

Comments
 (0)