Skip to content

Commit cb16c92

Browse files
authored
Merge pull request #36 from radix-ai/ls-highlevelgraph
Add support for `HighLevelGraph`s
2 parents 882d355 + 8a1b484 commit cb16c92

File tree

8 files changed

+132
-22
lines changed

8 files changed

+132
-22
lines changed

.circleci/config.yml

+4-4
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ jobs:
2828
name: Run linters
2929
command: |
3030
source activate graphchain-circleci-env
31-
flake8 graphchain --max-complexity=10 --ignore=W504
32-
pydocstyle graphchain --convention=numpy
33-
mypy graphchain --ignore-missing-imports --strict
31+
flake8 graphchain
32+
pydocstyle graphchain
33+
mypy graphchain
3434
- run:
3535
name: Run tests
3636
command: |
3737
source activate graphchain-circleci-env
38-
pytest -vx --cov=graphchain graphchain
38+
pytest
3939
4040
workflows:
4141
version: 2

docs/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
# The short X.Y version
2828
version = ''
2929
# The full version, including alpha/beta/rc tags
30-
release = '1.0.0'
30+
release = '1.1.0'
3131

3232

3333
# -- General configuration ---------------------------------------------------

environment.circleci.yml

+10-10
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@ channels:
33
- defaults
44
- conda-forge
55
dependencies:
6-
- cloudpickle=0.6
7-
- dask=1.0
6+
- cloudpickle=0.8
7+
- dask=1.2
88
- fs-s3fs=0.1
99
- joblib=0.13
10-
- mypy<0.700
10+
- mypy<0.800
1111
- pydocstyle=3.0
12-
- pytest=4.0
12+
- pytest=4.4
1313
- pytest-cov=2.6
14-
- pytest-xdist=1.25
14+
- pytest-xdist=1.28
1515
- pip:
16-
- flake8~=3.6.0
17-
- flake8-comprehensions~=1.4.1
18-
- flake8-bandit~=2.0.0
19-
- flake8-bugbear~=18.8.0
16+
- flake8~=3.7.7
17+
- flake8-comprehensions~=2.1.0
18+
- flake8-bandit~=2.1.0
19+
- flake8-bugbear~=19.3.0
2020
- flake8-mutable~=1.2.0
21-
- flake8-rst-docstrings~=0.0.8
21+
- flake8-rst-docstrings~=0.0.9
2222
- lz4~=2.1.6

graphchain/core.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
import datetime as dt
33
import functools
44
import logging
5-
import pickle
65
import time
6+
from copy import deepcopy
7+
from pickle import HIGHEST_PROTOCOL # noqa: S403
78
from typing import (Any, Callable, Container, Dict, Hashable, Iterable,
89
Optional, Union)
910

@@ -12,9 +13,24 @@
1213
import fs
1314
import fs.base
1415
import joblib
16+
from dask.highlevelgraph import HighLevelGraph
1517

1618
from .utils import get_size, str_to_posix_fully_portable_filename
1719

20+
21+
def hlg_setitem(self: HighLevelGraph, key: Hashable, value: Any) -> None:
22+
"""Set a HighLevelGraph computation."""
23+
for d in self.layers.values():
24+
if key in d:
25+
d[key] = value
26+
break
27+
28+
29+
# Monkey patch HighLevelGraph to add a missing `__setitem__` method.
30+
if not hasattr(HighLevelGraph, '__setitem__'):
31+
HighLevelGraph.__setitem__ = hlg_setitem
32+
33+
1834
logger = logging.getLogger(__name__)
1935

2036

@@ -166,7 +182,7 @@ def time_to_result(self, memoize: bool = True) -> float:
166182
load_time = self.read_time('store') / 2
167183
self._time_to_result = load_time
168184
return load_time
169-
except Exception:
185+
except Exception: # noqa: S110
170186
pass
171187
compute_time = self.read_time('compute')
172188
dependency_time = 0
@@ -232,7 +248,7 @@ def store(self, result: Any) -> None:
232248
start_time = time.perf_counter()
233249
with self.cache_fs.open( # type: ignore
234250
self.cache_filename, 'wb') as fid:
235-
joblib.dump(result, fid, protocol=pickle.HIGHEST_PROTOCOL)
251+
joblib.dump(result, fid, protocol=HIGHEST_PROTOCOL)
236252
store_time = time.perf_counter() - start_time
237253
# Write store time and log operation
238254
self.write_time('store', store_time)
@@ -243,7 +259,7 @@ def store(self, result: Any) -> None:
243259
# Try to delete leftovers if they were created by accident.
244260
try:
245261
self.cache_fs.remove(self.cache_filename) # type: ignore
246-
except Exception:
262+
except Exception: # noqa: S110
247263
pass
248264

249265
def patch_computation_in_graph(self) -> None:
@@ -343,7 +359,7 @@ def optimize(
343359
.. [2] http://dask.pydata.org/en/latest/optimize.html
344360
"""
345361
# Verify that the graph is a DAG.
346-
dsk = dsk.copy()
362+
dsk = deepcopy(dsk)
347363
assert dask.core.isdag(dsk, list(dsk.keys()))
348364
# Open or create the cache FS.
349365
# TODO(lsorber): lazily evaluate this for compatibility with `distributed`?

graphchain/tests/test_graphchain.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@ def test_first_run(
167167
storage.close()
168168

169169

170-
def NO_test_single_run_s3(
170+
@pytest.mark.skip(reason='Need AWS credentials to test') # type: ignore
171+
def test_single_run_s3(
171172
dask_graph: Dict[Hashable, Any],
172173
optimizer_s3: Tuple[
173174
str,
graphchain/tests/test_highlevelgraph.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""Test module for the dask HighLevelGraphs."""
2+
import dask
3+
import pandas as pd
4+
import pytest
5+
from dask.highlevelgraph import HighLevelGraph
6+
7+
from ..core import optimize
8+
9+
10+
@pytest.fixture(scope="function") # type: ignore
11+
def dask_highlevelgraph() -> HighLevelGraph:
12+
"""Generate an example dask HighLevelGraph."""
13+
@dask.delayed(pure=True) # type: ignore
14+
def create_dataframe(num_rows: int, num_cols: int) -> pd.DataFrame:
15+
print('Creating DataFrame...')
16+
return pd.DataFrame(data=[range(num_cols)] * num_rows)
17+
18+
@dask.delayed(pure=True) # type: ignore
19+
def create_dataframe2(num_rows: int, num_cols: int) -> pd.DataFrame:
20+
print('Creating DataFrame...')
21+
return pd.DataFrame(data=[range(num_cols)] * num_rows)
22+
23+
@dask.delayed(pure=True) # type: ignore
24+
def complicated_computation(df: pd.DataFrame, num_quantiles: int) \
25+
-> pd.DataFrame:
26+
print('Running complicated computation on DataFrame...')
27+
return df.quantile(q=[i / num_quantiles for i in range(num_quantiles)])
28+
29+
@dask.delayed(pure=True) # type: ignore
30+
def summarise_dataframes(*dfs: pd.DataFrame) -> float:
31+
print('Summing DataFrames...')
32+
return sum(df.sum().sum() for df in dfs)
33+
34+
df_a = create_dataframe(1000, 1000)
35+
df_b = create_dataframe2(1000, 1000)
36+
df_c = complicated_computation(df_a, 2048)
37+
df_d = complicated_computation(df_b, 2048)
38+
result = summarise_dataframes(df_c, df_d)
39+
return result
40+
41+
42+
def test_highleveldag(dask_highlevelgraph: HighLevelGraph) -> None:
43+
"""Test that the graph can be traversed and its result is correct."""
44+
with dask.config.set(scheduler='sync'):
45+
result = dask_highlevelgraph.compute()
46+
assert result == 2045952000.0
47+
48+
49+
def test_highlevelgraph(dask_highlevelgraph: HighLevelGraph) -> None:
50+
"""Test that the graph can be traversed and its result is correct."""
51+
with dask.config.set(scheduler='sync', delayed_optimize=optimize):
52+
result = dask_highlevelgraph.compute()
53+
assert result == 2045952000.0

setup.cfg

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
; http://flake8.pycqa.org/en/latest/user/configuration.html#project-configuration
2+
[flake8]
3+
max_complexity = 10
4+
doctests = True
5+
ignore =
6+
# S101 Use of assert detected.
7+
# Motivation: asserts are useful to test invariants.
8+
S101,
9+
# Line breaks before (W503) or after (W504) binary operator.
10+
# Motivation: At least one must be ignored. This project enforces W503 [1].
11+
# [1] https://github.com/PyCQA/pycodestyle/issues/498.
12+
W504,
13+
# Failed to parse __all__ entry.
14+
# Motivation: flake8-rst-docstrings cannot parse dynamically generated
15+
# __all__ variables.
16+
RST902
17+
18+
; https://mypy.readthedocs.io/en/latest/config_file.html
19+
[mypy]
20+
ignore_missing_imports = True
21+
warn_unused_configs = True
22+
disallow_subclassing_any = True
23+
disallow_untyped_calls = True
24+
disallow_untyped_defs = True
25+
disallow_incomplete_defs = True
26+
check_untyped_defs = True
27+
disallow_untyped_decorators = True
28+
no_implicit_optional = True
29+
warn_redundant_casts = True
30+
warn_unused_ignores = True
31+
warn_return_any = True
32+
disallow_any_generics = True
33+
34+
; http://www.pydocstyle.org/en/latest/usage.html#configuration-files
35+
[pydocstyle]
36+
convention = numpy
37+
38+
; https://docs.pytest.org/en/latest/customize.html#adding-default-options
39+
[tool:pytest]
40+
addopts = --verbose --exitfirst --doctest-modules --log-level DEBUG --cov=graphchain

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
setup(
1212
name='graphchain',
13-
version='1.0.0',
13+
version='1.1.0',
1414
description='An efficient cache for the execution of dask graphs',
1515
long_description=long_description,
1616
long_description_content_type='text/markdown',

0 commit comments

Comments
 (0)