 import logging
 import pickle
 import time
-from typing import (Any, Callable, Container, Hashable, Iterable, Optional,
-                    Union)
+from typing import (Any, Callable, Container, Dict, Hashable, Iterable,
+                    Optional, Union)
 
 import cloudpickle
 import dask
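
Note: the switch from a bare ``dict`` to ``Dict[Hashable, Any]`` matches the
dask graph specification, in which keys are hashable and values are literals
or task tuples. A minimal sketch of a graph that satisfies the new annotation
(the task function is illustrative):

    from typing import Any, Dict, Hashable

    def inc(x: int) -> int:
        """Illustrative task function."""
        return x + 1

    # A dask graph maps hashable keys to literals or task tuples.
    dsk: Dict[Hashable, Any] = {
        'x': 1,           # a literal value
        'y': (inc, 'x'),  # a task: (callable, *arguments)
    }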
@@ -23,11 +23,11 @@ class CachedComputation:
 
     def __init__(
             self,
-            dsk: dict,
+            dsk: Dict[Hashable, Any],
             key: Hashable,
             computation: Any,
             location: Union[str, fs.base.FS],
-            write_to_cache: Union[bool, str]='auto') -> None:
+            write_to_cache: Union[bool, str] = 'auto') -> None:
         """Cache a dask graph computation.
 
         Parameters
@@ -51,9 +51,9 @@ def __init__(
 
         Returns
         -------
-            CachedComputation
-                A wrapper for the computation object to replace the original
-                computation with in the dask graph.
+        CachedComputation
+            A wrapper for the computation object to replace the original
+            computation with in the dask graph.
         """
         self.dsk = dsk
         self.key = key
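
For illustration, instantiating the wrapper directly per the signature above
(a sketch only; in normal use ``optimize`` builds these wrappers, and the
graph and key below are made up):

    # Hypothetical graph and key; mirrors the constructor signature above.
    dsk = {'x': 1, 'y': (sum, [1, 2])}
    cached = CachedComputation(
        dsk, 'y', dsk['y'],
        location='./__graphchain_cache__',  # local cache directory
        write_to_cache='auto')              # let graphchain decide per task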
@@ -62,7 +62,7 @@ def __init__(
         self.write_to_cache = write_to_cache
 
     @property  # type: ignore
-    @functools.lru_cache()  # type: ignore
+    @functools.lru_cache()
     def cache_fs(self) -> fs.base.FS:
         """Open a PyFilesystem FS to the cache directory."""
         # create=True does not yet work for S3FS [1]. This should probably be
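
Stacking ``@property`` on top of ``@functools.lru_cache()`` memoizes the
getter, so the filesystem is opened at most once per instance; the
``# type: ignore`` survives only on the ``@property`` line because mypy
cannot type the combination. A standalone sketch of the same pattern:

    import functools

    class Example:
        @property  # type: ignore
        @functools.lru_cache()
        def expensive(self) -> int:
            # Runs once per (hashable) instance; later accesses hit the cache.
            print('computing...')
            return 42

    e = Example()
    e.expensive  # prints 'computing...' and returns 42
    e.expensive  # served from the cache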
@@ -133,7 +133,7 @@ def estimate_load_time(self, result: Any) -> float:
             500e6 if isinstance(self.cache_fs, fs.osfs.OSFS) else 50e6))
         return read_latency + size / read_throughput
 
-    @functools.lru_cache()  # type: ignore
+    @functools.lru_cache()
     def read_time(self, timing_type: str) -> float:
         """Read the time to load, compute, or store from file."""
         time_filename = f'{self.hash}.time.{timing_type}'
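
The f-string implies one small timing file per computation and per phase,
named ``<hash>.time.<timing_type>``, inside the cache filesystem. A hedged
sketch of reading one with PyFilesystem (the hash value and the float
payload are assumptions based on the return type):

    import fs

    cache_fs = fs.open_fs('./__graphchain_cache__')
    time_filename = 'deadbeef.time.load'  # hypothetical hash
    if cache_fs.exists(time_filename):
        with cache_fs.open(time_filename) as fid:
            load_time = float(fid.read())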
@@ -154,7 +154,7 @@ def write_log(self, log_type: str) -> None:
         with self.cache_fs.open(log_filename, 'w') as fid:  # type: ignore
             fid.write(self.hash)
 
-    def time_to_result(self, memoize: bool=True) -> float:
+    def time_to_result(self, memoize: bool = True) -> float:
         """Estimate the time to load or compute this computation."""
         if hasattr(self, '_time_to_result'):
             return self._time_to_result  # type: ignore
@@ -286,10 +286,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
 
 
 def optimize(
-        dsk: dict,
-        keys: Optional[Union[Hashable, Iterable[Hashable]]]=None,
-        skip_keys: Optional[Container[Hashable]]=None,
-        location: Union[str, fs.base.FS]="./__graphchain_cache__") -> dict:
+        dsk: Dict[Hashable, Any],
+        keys: Optional[Union[Hashable, Iterable[Hashable]]] = None,
+        skip_keys: Optional[Container[Hashable]] = None,
+        location: Union[str, fs.base.FS] = "./__graphchain_cache__") \
+        -> Dict[Hashable, Any]:
     """Optimize a dask graph with cached computations.
 
     According to the dask graph specification [1]_, a dask graph is a
@@ -318,23 +319,23 @@ def optimize(
 
     Parameters
     ----------
-        dsk
-            The dask graph to optimize with caching computations.
-        keys
-            Not used. Is present for compatibility with dask optimizers [2]_.
-        skip_keys
-            A container of keys not to cache.
-        location
-            A PyFilesystem FS URL to store the cached computations in. Can be a
-            local directory such as ``'./__graphchain_cache__'`` or a remote
-            directory such as ``'s3://bucket/__graphchain_cache__'``. You can
-            also pass a PyFilesystem itself instead.
+    dsk
+        The dask graph to optimize with caching computations.
+    keys
+        Not used. Is present for compatibility with dask optimizers [2]_.
+    skip_keys
+        A container of keys not to cache.
+    location
+        A PyFilesystem FS URL to store the cached computations in. Can be a
+        local directory such as ``'./__graphchain_cache__'`` or a remote
+        directory such as ``'s3://bucket/__graphchain_cache__'``. You can
+        also pass a PyFilesystem itself instead.
 
     Returns
     -------
-        dict
-            A copy of the dask graph where the computations have been replaced
-            by ``CachedComputation``'s.
+    dict
+        A copy of the dask graph where the computations have been replaced by
+        ``CachedComputation``'s.
 
     References
     ----------
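
Since ``optimize`` is a drop-in dask graph optimizer, it can be registered
through dask's configuration. A sketch of typical usage (the delayed function
is illustrative; ``delayed_optimize`` is dask's config key for optimizing
``Delayed`` objects):

    import dask
    import graphchain

    @dask.delayed
    def add(x: int, y: int) -> int:
        return x + y

    result = add(1, 2)
    # Route graph optimization through graphchain so repeated runs
    # are served from the cache.
    with dask.config.set(scheduler='sync',
                         delayed_optimize=graphchain.optimize):
        print(result.compute())  # 3; loaded from cache on later runs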
@@ -361,11 +362,14 @@ def optimize(
 
 
 def get(
-        dsk: dict,
+        dsk: Dict[Hashable, Any],
         keys: Union[Hashable, Iterable[Hashable]],
-        skip_keys: Optional[Container[Hashable]]=None,
-        location: Union[str, fs.base.FS]="./__graphchain_cache__",
-        scheduler: Optional[Callable]=None) -> Any:
+        skip_keys: Optional[Container[Hashable]] = None,
+        location: Union[str, fs.base.FS] = "./__graphchain_cache__",
+        scheduler: Optional[Callable[
+            [Dict[Hashable, Any], Union[Hashable, Iterable[Hashable]]],
+            Any
+        ]] = None) -> Any:
     """Get one or more keys from a dask graph with caching.
 
     Optimizes a dask graph with ``graphchain.optimize`` and then computes the
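
The narrowed ``Callable`` annotation spells out the ``(dsk, keys)`` signature
that dask's ``get`` functions share, so any of them can be passed as the
scheduler. For example (a sketch):

    import dask
    import graphchain

    def double(x: int) -> int:
        return 2 * x

    dsk = {'a': 1, 'b': (double, 'a')}
    # dask.get (synchronous) and dask.threaded.get both match the
    # (dsk, keys) -> Any shape the annotation now requires.
    print(graphchain.get(dsk, 'b', scheduler=dask.get))  # 2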
@@ -377,24 +381,24 @@ def get(
 
     Parameters
     ----------
-        dsk
-            The dask graph to query.
-        keys
-            The keys to compute.
-        skip_keys
-            A container of keys not to cache.
-        location
-            A PyFilesystem FS URL to store the cached computations in. Can be a
-            local directory such as ``'./__graphchain_cache__'`` or a remote
-            directory such as ``'s3://bucket/__graphchain_cache__'``. You can
-            also pass a PyFilesystem itself instead.
-        scheduler
-            The dask scheduler to use to retrieve the keys from the graph.
+    dsk
+        The dask graph to query.
+    keys
+        The keys to compute.
+    skip_keys
+        A container of keys not to cache.
+    location
+        A PyFilesystem FS URL to store the cached computations in. Can be a
+        local directory such as ``'./__graphchain_cache__'`` or a remote
+        directory such as ``'s3://bucket/__graphchain_cache__'``. You can also
+        pass a PyFilesystem itself instead.
+    scheduler
+        The dask scheduler to use to retrieve the keys from the graph.
 
     Returns
     -------
-        Any
-            The computed values corresponding to the given keys.
+    Any
+        The computed values corresponding to the given keys.
     """
     cached_dsk = optimize(dsk, keys, skip_keys=skip_keys, location=location)
     scheduler = \