import asyncio
from contextlib import contextmanager
import tempfile
import os
import signal
from typing import List, Optional, Iterable
import logging

import distributed
from distributed.diagnostics import SchedulerPlugin

logger = logging.getLogger(__name__)


class PySpyScheduler(SchedulerPlugin):
    _HANDLER_NAME = "get_py_spy_profile"

    def __init__(
        self,
        output: Optional[str] = None,
        format: str = "speedscope",
        rate: int = 100,
        subprocesses: bool = True,
        function: bool = False,
        gil: bool = False,
        threads: bool = False,
        idle: bool = True,
        nonblocking: bool = False,
        native: bool = False,
        extra_pyspy_args: Iterable[str] = (),
    ) -> None:
        self.output = output
        self.pyspy_args: List[str] = ["--format", format, "--rate", str(rate)] + [
            flag
            for flag, active in {
                "--subprocesses": subprocesses,
                "--function": function,
                "--gil": gil,
                "--threads": threads,
                "--idle": idle,
                "--nonblocking": nonblocking,
                "--native": native,
            }.items()
            if active
        ]
        self.pyspy_args.extend(extra_pyspy_args)
        self.proc = None
        self._tempfile = None

    def __repr__(self) -> str:
        return f"<{type(self).__name__} {self.pyspy_args}>"

    async def start(self, scheduler):
        if self.output is None:
            self._tempfile = tempfile.NamedTemporaryFile(suffix="pyspy.json")
            self.output = self._tempfile.name

        # HACK: inject a `get_py_spy_profile` handler into the scheduler,
        # so we can retrieve the data more easily. Until we can stream back files,
        # there's probably not any advantage to this over an async
        # `run_on_scheduler` to retrieve the data.
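        # Once the handler is registered, a client can invoke it over the
        # scheduler's RPC, e.g. ``await client.scheduler.get_py_spy_profile()``
        # (this is what `get_profile_from_scheduler` below does).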
        self.scheduler = scheduler
        if self._HANDLER_NAME in scheduler.handlers:
            raise RuntimeError(
                "A py-spy plugin is already registered: "
                f"{scheduler.handlers[self._HANDLER_NAME]} vs {self._get_py_spy_profile}!"
            )
        scheduler.handlers[self._HANDLER_NAME] = self._get_py_spy_profile

        pid = os.getpid()
        self.proc = await asyncio.create_subprocess_exec(
            "py-spy",
            "record",
            "--pid",
            str(pid),
            "--output",
            self.output,
            *self.pyspy_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

    async def _stop(self) -> Optional[int]:
        if self.proc is None:
            return None

        try:
            self.proc.send_signal(signal.SIGINT)
        except ProcessLookupError:
            logger.warning(
                f"py-spy subprocess {self.proc.pid} already terminated (it probably never ran?)."
            )

        stdout, stderr = await self.proc.communicate()  # TODO timeout
        retcode = self.proc.returncode
        if retcode != 0:
            logger.warning(f"py-spy exited with code {retcode}")
            logger.warning(f"py-spy stderr:\n{stderr.decode()}")
            logger.warning(f"py-spy stdout:\n{stdout.decode()}")

        self.proc = None
        # Remove our injected handler
        del self.scheduler.handlers[self._HANDLER_NAME]
        # TODO should we remove the plugin as well?
        # At this point, there's not much reason to be using a plugin...
        return retcode
    def _maybe_close_tempfile(self):
        if self._tempfile is not None:
            self._tempfile.close()
            self._tempfile = None

    # This handler gets injected into the scheduler
    async def _get_py_spy_profile(self, comm=None) -> Optional[bytes]:
        retcode = await self._stop()
        if retcode == 0:
            with open(self.output, "rb") as f:
                data = f.read()  # TODO streaming!
        else:
            data = None

        self._maybe_close_tempfile()
        return data

    async def close(self):
        await self._stop()
        self._maybe_close_tempfile()


def start_pyspy_on_scheduler(
    output: Optional[str] = None,
    format: str = "speedscope",
    rate: int = 100,
    subprocesses: bool = True,
    function: bool = False,
    gil: bool = False,
    threads: bool = False,
    idle: bool = True,
    nonblocking: bool = False,
    native: bool = False,
    extra_pyspy_args: Iterable[str] = (),
    client: Optional[distributed.Client] = None,
) -> None:
    """
    Add a `PySpyScheduler` plugin to the Scheduler, and start it.
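
    Profiling begins as soon as the plugin starts; call `get_profile_from_scheduler`
    later to stop py-spy and retrieve the results. A hypothetical sketch (the
    scheduler address is made up)::

        import distributed

        client = distributed.Client("tcp://scheduler-address:8786")
        start_pyspy_on_scheduler(rate=50, client=client)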
    """
    client = client or distributed.worker.get_client()

    async def _inject(dask_scheduler: distributed.Scheduler):
        plugin = PySpyScheduler(
            output=output,
            format=format,
            rate=rate,
            subprocesses=subprocesses,
            function=function,
            gil=gil,
            threads=threads,
            idle=idle,
            nonblocking=nonblocking,
            native=native,
            extra_pyspy_args=extra_pyspy_args,
        )
        await plugin.start(dask_scheduler)
        dask_scheduler.add_plugin(plugin)

    client.run_on_scheduler(_inject)


def get_profile_from_scheduler(
    path: str, client: Optional[distributed.Client] = None
) -> None:
    """
    Stop the current `PySpyScheduler` plugin, send back its profile data, and write it to ``path``.
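
    A hypothetical continuation of the `start_pyspy_on_scheduler` sketch above,
    writing the profile to a local file::

        # ... run the workload you want profiled ...
        get_profile_from_scheduler("profile.json", client=client)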
    """
    client = client or distributed.worker.get_client()

    async def _get_profile():
        return await getattr(client.scheduler, PySpyScheduler._HANDLER_NAME)()

    data = client.sync(_get_profile)
    if data:
        with open(path, "wb") as f:
            f.write(data)
    else:
        logger.warning("No data from py-spy profile!")


@contextmanager
def pyspy_on_scheduler(
    output: str,
    format: str = "speedscope",
    rate: int = 100,
    subprocesses: bool = True,
    function: bool = False,
    gil: bool = False,
    threads: bool = False,
    idle: bool = True,
    nonblocking: bool = False,
    native: bool = False,
    extra_pyspy_args: Iterable[str] = (),
    client: Optional[distributed.Client] = None,
):
    """
    Spy on the Scheduler with py-spy.

    Use as a context manager (similar to `distributed.performance_report`) to record a py-spy
    profile of the scheduler.

    When the context manager exits, the profile is sent back to the client and saved to
    the ``output`` path.

    Parameters
    ----------
    output:
        *Local* path to save the profile to, once it's sent back from the scheduler.
    format:
        Output file format [default: speedscope] [possible values: flamegraph, raw, speedscope]
    rate:
        The number of samples to collect per second [default: 100]
    subprocesses:
        Profile subprocesses of the original process
    function:
        Aggregate samples by function name instead of by line number
    gil:
        Only include traces that are holding on to the GIL
    threads:
        Show thread ids in the output
    idle:
        Include stack traces for idle threads
    nonblocking:
        Don't pause the python process when collecting samples. Setting this option
        will reduce the performance impact of sampling, but may lead to inaccurate results
    native:
        Collect stack traces from native extensions written in Cython, C or C++
    extra_pyspy_args:
        Iterable of any extra arguments to pass to ``py-spy``.
    client:
        The distributed Client to use. If None (default), the default client is used.
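
    Examples
    --------
    A hypothetical sketch (the scheduler address is made up)::

        import distributed

        client = distributed.Client("tcp://scheduler-address:8786")
        with pyspy_on_scheduler("profile.json", client=client):
            ...  # run the workload you want profiled

    Since the default format is speedscope, the resulting ``profile.json``
    can then be loaded at https://www.speedscope.app.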
    """
    client = client or distributed.worker.get_client()

    start_pyspy_on_scheduler(
        output=None,
        format=format,
        rate=rate,
        subprocesses=subprocesses,
        function=function,
        gil=gil,
        threads=threads,
        idle=idle,
        nonblocking=nonblocking,
        native=native,
        extra_pyspy_args=extra_pyspy_args,
        client=client,
    )
    try:
        yield
    finally:
        get_profile_from_scheduler(output, client=client)