
Commit 77b6721

Merge remote-tracking branch 'origin/main' into close-inactive-contexts

2 parents 1b22c81 + 62ddc0c

File tree: 11 files changed, +257 −10 lines

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.33
+current_version = 0.0.35
 commit = True
 tag = True
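For context on how this file is consumed, a sketch of the release flow, assuming the `bumpversion` CLI is installed (the version numbers are illustrative). With `commit = True` and `tag = True`, a single invocation rewrites the tracked version strings, commits the change, and tags it:

```
$ bumpversion patch   # e.g. 0.0.35 -> 0.0.36, then git commit + git tag
```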

README.md

Lines changed: 88 additions & 0 deletions

@@ -845,6 +845,32 @@ for a list of the accepted events and the arguments passed to their handlers.
 images, scripts, stylesheets, etc are not seen by Scrapy.
 
 
+## Memory usage extension
+
+The default Scrapy memory usage extension
+(`scrapy.extensions.memusage.MemoryUsage`) does not include the memory used by
+Playwright because the browser is launched as a separate process. The
+scrapy-playwright package provides a replacement extension which also considers
+the memory used by Playwright. This extension needs the
+[`psutil`](https://pypi.org/project/psutil/) package to work.
+
+Update the [EXTENSIONS](https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-EXTENSIONS)
+setting to disable the built-in Scrapy extension and replace it with the one
+from the scrapy-playwright package:
+
+```python
+# settings.py
+EXTENSIONS = {
+    "scrapy.extensions.memusage.MemoryUsage": None,
+    "scrapy_playwright.memusage.ScrapyPlaywrightMemoryUsageExtension": 0,
+}
+```
+
+Refer to the
+[upstream docs](https://docs.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.memusage)
+for more information about supported settings.
+
+
 ## Examples
 
 **Click on a link, save the resulting page as PDF**
@@ -975,6 +1001,68 @@ async def main():
 asyncio.run(main())
 ```
 
+### Software versions
+
+Be sure to include which versions of Scrapy and scrapy-playwright you are using:
+
+```
+$ python -c "import scrapy_playwright; print(scrapy_playwright.__version__)"
+0.0.34
+```
+
+```
+$ scrapy version -v
+Scrapy       : 2.11.1
+lxml         : 5.1.0.0
+libxml2      : 2.12.3
+cssselect    : 1.2.0
+parsel       : 1.8.1
+w3lib        : 2.1.2
+Twisted      : 23.10.0
+Python       : 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
+pyOpenSSL    : 24.0.0 (OpenSSL 3.2.1 30 Jan 2024)
+cryptography : 42.0.5
+Platform     : Linux-6.5.0-35-generic-x86_64-with-glibc2.35
+```
+
+### Reproducible code example
+
+When opening an issue please include a
+[Minimal, Reproducible Example](https://stackoverflow.com/help/minimal-reproducible-example)
+that shows the reported behavior. In addition, please make the code as self-contained as possible
+so an active Scrapy project is not required and the spider can be executed directly from a file with
+[`scrapy runspider`](https://docs.scrapy.org/en/latest/topics/commands.html#std-command-runspider).
+This usually means including the relevant settings in the spider's
+[`custom_settings`](https://docs.scrapy.org/en/latest/topics/settings.html#settings-per-spider)
+attribute:
+
+```python
+import scrapy
+
+class ExampleSpider(scrapy.Spider):
+    name = "example"
+    custom_settings = {
+        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+        "DOWNLOAD_HANDLERS": {
+            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+        },
+    }
+
+    def start_requests(self):
+        yield scrapy.Request(
+            url="https://example.org",
+            meta={"playwright": True},
+        )
+```
+
+### Logs and stats
+
+Logs for spider jobs displaying the issue in detail are extremely useful
+for understanding possible bugs. Include lines before and after the problem,
+not just isolated tracebacks. Job stats displayed at the end of the job
+are also important.
+
 
 ## Frequently Asked Questions
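As an aside on the "Reproducible code example" guidance added above: because the settings live in `custom_settings`, the spider needs no surrounding Scrapy project. Assuming the snippet is saved as `example.py` (a hypothetical file name), it can be executed directly:

```
$ scrapy runspider example.py
```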

docs/changelog.md

Lines changed: 6 additions & 0 deletions

@@ -1,5 +1,11 @@
 # scrapy-playwright changelog
 
+### [v0.0.34](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.34) (2024-01-01)
+
+* Update dev status classifier to 4 - beta
+* Official Python 3.12 support (#254)
+* Custom memusage extension (#257)
+
 
 ### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)

scrapy_playwright/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.0.33"
+__version__ = "0.0.35"

scrapy_playwright/_utils.py

Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ async def _get_page_content(
     try:
         return await page.content()
     except Error as err:
-        if err.message == _NAVIGATION_ERROR_MSG:
+        if _NAVIGATION_ERROR_MSG in err.message:
            logger.debug(
                "Retrying to get content from page '%s', error: '%s'",
                page.url,
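A minimal sketch of why the switch from `==` to `in` matters. The message text and the constant's value here are illustrative assumptions, not taken from the real module:

```python
# Hypothetical value; the real constant lives elsewhere in scrapy_playwright/_utils.py.
_NAVIGATION_ERROR_MSG = "Unable to retrieve content because the page is navigating"

# Playwright can append extra context to the base error text, so an exact
# comparison misses messages that a substring check still catches.
err_message = _NAVIGATION_ERROR_MSG + " and changing the content."

assert err_message != _NAVIGATION_ERROR_MSG  # old check: retry never triggered
assert _NAVIGATION_ERROR_MSG in err_message  # new check: retry triggered
```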

scrapy_playwright/handler.py

Lines changed: 8 additions & 2 deletions

@@ -12,6 +12,7 @@
     Download,
     Error as PlaywrightError,
     Page,
+    Playwright as AsyncPlaywright,
     PlaywrightContextManager,
     Request as PlaywrightRequest,
     Response as PlaywrightResponse,
@@ -102,6 +103,9 @@ def from_settings(cls, settings: Settings) -> "Config":
 
 
 class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
+    playwright_context_manager: Optional[PlaywrightContextManager] = None
+    playwright: Optional[AsyncPlaywright] = None
+
     def __init__(self, crawler: Crawler) -> None:
         super().__init__(settings=crawler.settings, crawler=crawler)
         verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
@@ -326,8 +330,10 @@ async def _close(self) -> None:
         if hasattr(self, "browser"):
             logger.info("Closing browser")
             await self.browser.close()
-        await self.playwright_context_manager.__aexit__()
-        await self.playwright.stop()
+        if self.playwright_context_manager:
+            await self.playwright_context_manager.__aexit__()
+        if self.playwright:
+            await self.playwright.stop()
 
     def download_request(self, request: Request, spider: Spider) -> Deferred:
         if request.meta.get("playwright"):
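The `_close` hunk above applies a defensive-cleanup pattern: declaring the attributes as class-level `Optional[...] = None` guarantees they exist even when startup failed before assigning them, and the `if` guards keep cleanup from awaiting on `None`. A simplified self-contained sketch of the same idea, with stand-in classes rather than the real handler:

```python
import asyncio
from typing import Optional


class FakeResource:
    """Stands in for the Playwright context manager / driver."""

    async def stop(self) -> None:
        print("resource stopped")


class Handler:
    # Class-level default: the attribute exists even if start() never ran.
    resource: Optional[FakeResource] = None

    async def start(self) -> None:
        self.resource = FakeResource()

    async def close(self) -> None:
        if self.resource:  # guard avoids awaiting on None
            await self.resource.stop()


asyncio.run(Handler().close())  # safe even though start() was never called
```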

scrapy_playwright/headers.py

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 This module includes functions to process request headers.
 Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
 """
+
 from urllib.parse import urlparse
 
 from playwright.async_api import Request as PlaywrightRequest

scrapy_playwright/memusage.py

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+from contextlib import suppress
+from importlib import import_module
+from typing import List
+
+from scrapy.exceptions import NotConfigured
+from scrapy.extensions.memusage import MemoryUsage
+
+from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler, logger
+
+
+_MIB_FACTOR = 1024**2
+
+
+class ScrapyPlaywrightMemoryUsageExtension(MemoryUsage):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        try:
+            self.psutil = import_module("psutil")
+        except ImportError as exc:
+            raise NotConfigured("The psutil module is not available") from exc
+
+    def _get_main_process_ids(self) -> List[int]:
+        try:
+            return [
+                handler.playwright_context_manager._connection._transport._proc.pid
+                for handler in self.crawler.engine.downloader.handlers._handlers.values()
+                if isinstance(handler, ScrapyPlaywrightDownloadHandler)
+                and handler.playwright_context_manager
+            ]
+        except Exception:
+            return []
+
+    def _get_descendant_processes(self, process) -> list:
+        children = process.children()
+        result = children.copy()
+        for child in children:
+            result.extend(self._get_descendant_processes(child))
+        return result
+
+    def _get_total_playwright_process_memory(self) -> int:
+        process_list = [self.psutil.Process(pid) for pid in self._get_main_process_ids()]
+        for proc in process_list.copy():
+            process_list.extend(self._get_descendant_processes(proc))
+        total_process_size = 0
+        for proc in process_list:
+            with suppress(Exception):  # might fail if the process exited in the meantime
+                total_process_size += proc.memory_info().rss
+        logger.debug(
+            "Total Playwright process memory: %i Bytes (%i MiB)",
+            total_process_size,
+            total_process_size / _MIB_FACTOR,
+        )
+        return total_process_size
+
+    def get_virtual_size(self) -> int:
+        return super().get_virtual_size() + self._get_total_playwright_process_memory()
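The measurement above can be reproduced standalone with psutil; here is a sketch run against the current process instead of the Playwright driver processes. Note that psutil's own `children(recursive=True)` performs the same descendant walk that `_get_descendant_processes` implements by hand:

```python
import psutil

main = psutil.Process()  # current process; the extension starts from driver PIDs
procs = [main] + main.children(recursive=True)

total_rss = 0
for proc in procs:
    try:
        total_rss += proc.memory_info().rss
    except psutil.Error:  # a process may have exited in the meantime
        pass

print(f"{total_rss} bytes ({total_rss / 1024 ** 2:.1f} MiB) across {len(procs)} processes")
```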

setup.py

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@
     url="https://github.com/scrapy-plugins/scrapy-playwright",
     packages=["scrapy_playwright"],
     classifiers=[
-        "Development Status :: 3 - Alpha",
+        "Development Status :: 4 - Beta",
         "License :: OSI Approved :: BSD License",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3.8",
Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+from asyncio.subprocess import Process as AsyncioProcess
+from unittest import IsolatedAsyncioTestCase
+from unittest.mock import MagicMock, patch
+
+import pytest
+from playwright.async_api import PlaywrightContextManager
+from scrapy.exceptions import NotConfigured
+from scrapy.extensions.memusage import MemoryUsage
+
+from scrapy_playwright.memusage import ScrapyPlaywrightMemoryUsageExtension
+from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
+
+
+SCHEMA_PID_MAP = {"http": 123, "https": 456}
+
+
+def mock_crawler_with_handlers() -> dict:
+    handlers = {}
+    for schema, pid in SCHEMA_PID_MAP.items():
+        process = MagicMock()
+        process.pid = pid
+        handlers[schema] = MagicMock(spec=ScrapyPlaywrightDownloadHandler)
+        handlers[schema].playwright_context_manager._connection._transport._proc = process
+    crawler = MagicMock()
+    crawler.engine.downloader.handlers._handlers = handlers
+    return crawler
+
+
+def raise_import_error(*args, **kwargs):
+    raise ImportError
+
+
+class MockMemoryInfo:
+    rss = 999
+
+
+@patch("scrapy.extensions.memusage.MailSender")
+class TestMemoryUsageExtension(IsolatedAsyncioTestCase):
+    async def test_process_availability(self, _MailSender):
+        """The main node process should be accessible from the context manager"""
+        ctx_manager = PlaywrightContextManager()
+        await ctx_manager.start()
+        assert isinstance(ctx_manager._connection._transport._proc, AsyncioProcess)
+        await ctx_manager.__aexit__()
+
+    @patch("scrapy_playwright.memusage.import_module", side_effect=raise_import_error)
+    async def test_psutil_not_available_extension_disabled(self, _import_module, _MailSender):
+        crawler = MagicMock()
+        with pytest.raises(NotConfigured):
+            ScrapyPlaywrightMemoryUsageExtension(crawler)
+
+    async def test_get_process_ids_ok(self, _MailSender):
+        crawler = mock_crawler_with_handlers()
+        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
+        assert extension._get_main_process_ids() == list(SCHEMA_PID_MAP.values())
+
+    async def test_get_process_ids_error(self, _MailSender):
+        crawler = mock_crawler_with_handlers()
+        crawler.engine.downloader.handlers._handlers = MagicMock()
+        crawler.engine.downloader.handlers._handlers.values.side_effect = raise_import_error
+        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
+        assert extension._get_main_process_ids() == []
+
+    async def test_get_descendant_processes(self, _MailSender):
+        p1 = MagicMock()
+        p2 = MagicMock()
+        p3 = MagicMock()
+        p4 = MagicMock()
+        p2.children.return_value = [p3, p4]
+        p1.children.return_value = [p2]
+        crawler = MagicMock()
+        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
+        assert extension._get_descendant_processes(p1) == [p2, p3, p4]
+
+    async def test_get_total_process_size(self, _MailSender):
+        crawler = MagicMock()
+        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
+        extension.psutil = MagicMock()
+        extension.psutil.Process.return_value.memory_info.return_value = MockMemoryInfo()
+        extension._get_main_process_ids = MagicMock(return_value=[1, 2, 3])
+        expected_size = MockMemoryInfo().rss * len(extension._get_main_process_ids())
+        assert extension._get_total_playwright_process_memory() == expected_size
+
+    async def test_get_virtual_size_sum(self, _MailSender):
+        crawler = MagicMock()
+        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
+        parent_cls_extension = MemoryUsage(crawler)
+        extension._get_total_playwright_process_memory = MagicMock(return_value=123)
+        assert extension.get_virtual_size() == parent_cls_extension.get_virtual_size() + 123
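The new test module's file name is not shown in this view. Assuming it is saved as e.g. `tests/test_memusage.py` (a hypothetical path), the cases run under plain pytest, which collects `IsolatedAsyncioTestCase` subclasses like any unittest class; note that `test_process_availability` starts a real Playwright driver, so Playwright must be installed:

```
$ pytest tests/test_memusage.py -v
```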
