Skip to content

Commit 3d6f160

Browse files
committed
Move to minimalkv
1 parent d027ca2 commit 3d6f160

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+144
-152
lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
Changelog
33
=========
44

5+
Kartothek 4.1.0 (2021-04-xx)
6+
============================
7+
8+
* Switch from ``simplekv`` and ``storefact`` to their successor ``minimalkv`` as the library providing the store implementations.
59

610
Kartothek 4.0.2 (2021-04-xx)
711
============================

asv_bench/benchmarks/index.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
import uuid
1010
from functools import lru_cache
1111

12+
import minimalkv
1213
import numpy as np
1314
import pandas as pd
1415
import pyarrow as pa
15-
import storefact
1616

1717
from kartothek.core.index import ExplicitSecondaryIndex
1818
from kartothek.io_components.metapartition import MetaPartition
@@ -44,7 +44,7 @@ def setup(self, number_values, number_partitions, dtype):
4444
column=self.column_name, index_dct=index_dct, dtype=arrow_type
4545
)
4646
self.tmp_dir = tempfile.mkdtemp()
47-
self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
47+
self.store = minimalkv.get_store_from_url("hfs://{}".format(self.tmp_dir))
4848
self.dataset_uuid = "some_uuid"
4949
self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)
5050

asv_bench/benchmarks/predicate_pushdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from storefact import get_store_from_url
1+
from minimalkv import get_store_from_url
22

33
from kartothek.serialization import ParquetSerializer
44
from kartothek.serialization.testing import get_dataframe_not_nested

asv_bench/benchmarks/write.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import tempfile
55
import uuid
66

7-
from storefact import get_store_from_url
7+
from minimalkv import get_store_from_url
88

99
from kartothek.core.common_metadata import make_meta
1010
from kartothek.core.testing import get_dataframe_alltypes

conda-requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
dask[dataframe]
22
decorator
3+
minimalkv
34
msgpack-python>=0.5.2
45
# Currently dask and numpy==1.16.0 clash
56
numpy!=1.15.0,!=1.16.0
67
pandas>=0.23.0, !=1.0.0
78
pyarrow>=0.17.1,!=1.0.0, <4
89
simplejson
9-
simplekv
10-
storefact
1110
toolz
1211
typing_extensions # Some backports of the py3.8 typing module
1312
urlquote>=1.1.3

docs/conf.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,11 @@
105105

106106
intersphinx_mapping = {
107107
"pandas": ("https://pandas.pydata.org/docs/", None),
108-
"simplekv": ("https://simplekv.readthedocs.io/en/stable/", None),
108+
"minimalkv": ("https://minimalkv.readthedocs.io/en/stable/", None),
109109
"pyarrow": ("https://arrow.apache.org/docs/", None),
110110
"numpy": ("https://numpy.org/doc/stable/", None),
111111
"python": ("https://docs.python.org/3", None),
112112
"dask": ("https://docs.dask.org/en/stable/", None),
113-
# Storefact isn't exposing any sphinx refs
114-
# "storefact": ("https://storefact.readthedocs.io/en/stable", None),
115113
}
116114

117115
# In particular type annotations are rendered as its full path to the class but

docs/environment-docs.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@ channels:
44
dependencies:
55
- dask[dataframe]
66
- decorator
7+
- minimalkv
78
- msgpack-python>=0.5.2
89
# Currently dask and numpy==1.16.0 clash
910
- numpy!=1.15.0,!=1.16.0
1011
- pandas>=0.23.0, !=1.0.0
1112
- pyarrow>=0.17.1,!=1.0.0, <4
1213
- simplejson
13-
- simplekv
14-
- storefact
1514
- toolz
1615
- typing_extensions # Some backports of the py3.8 typing module
1716
- urlquote>=1.1.3

docs/guide/cube/command_line_features.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Command Line Features
1717
</style>
1818

1919
Kartothek Cube also features a command line interface (CLI) for some cube operations. To use it, create a ``skv.yml`` file that
20-
describes `storefact`_ stores:
20+
describes `minimalkv`_ stores:
2121

2222
.. code-block:: yaml
2323
@@ -147,5 +147,5 @@ Some information is not available when reading the schema information and requir
147147

148148
Use ``kartothek_cube --help`` to get a list of all commands, or see :mod:`~kartothek.cli`.
149149

150-
.. _storefact: https://github.com/blue-yonder/storefact
150+
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
151151

docs/guide/cube/examples.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ First, we want to create a cube for geodata:
1313
... partition_columns=["country"],
1414
... )
1515

16-
Apart from an abstract cube definition, we need a `simplekv`_-based storage backend:
16+
Apart from an abstract cube definition, we need a `minimalkv`_-based storage backend:
1717

1818
>>> from functools import partial
1919
>>> import tempfile
20-
>>> import storefact
20+
>>> import minimalkv
2121
>>> store_location = tempfile.mkdtemp()
2222
>>> store_factory = partial(
23-
... storefact.get_store_from_url,
23+
... minimalkv.get_store_from_url,
2424
... "hfs://" + store_location,
2525
... )
2626
>>> store = store_factory()
@@ -424,4 +424,4 @@ geodata++time/table/_common_metadata
424424
.. _Dask: https://docs.dask.org/
425425
.. _Dask.Bag: https://docs.dask.org/en/latest/bag.html
426426
.. _Dask.DataFrame: https://docs.dask.org/en/latest/dataframe.html
427-
.. _simplekv: https://simplekv.readthedocs.io/
427+
.. _minimalkv: https://minimalkv.readthedocs.io/

docs/guide/cube/glossary.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ Glossary
5555
Dataset that provides the groundtruth about which :term:`Cell` are in a :term:`Cube`.
5656

5757
Store Factory
58-
A callable that does not take any arguments and creates a new `simplekv`_ store when being called. Its type is
59-
``Callable[[], simplekv.KeyValueStore]``.
58+
A callable that does not take any arguments and creates a new `minimalkv`_ store when being called. Its type is
59+
``Callable[[], minimalkv.KeyValueStore]``.
6060

6161
Query
6262
A request for data from the cube, including things like "payload columns", "conditions", and more.
@@ -76,4 +76,4 @@ Glossary
7676

7777
.. _Data Cubes: https://en.wikipedia.org/wiki/Data_cube
7878
.. _Parquet: https://parquet.apache.org/
79-
.. _simplekv: https://simplekv.readthedocs.io/
79+
.. _minimalkv: https://minimalkv.readthedocs.io/

docs/guide/getting_started.rst

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,24 +47,24 @@ We want to store this DataFrame now as a dataset. Therefore, we first need
4747
to connect to a storage location.
4848

4949
We define a store factory as a callable which contains the storage information.
50-
We will use `storefact`_ in this example to construct such a store factory
50+
We will use `minimalkv`_ in this example to construct such a store factory
5151
for the local filesystem (``hfs://`` indicates we are using the local filesystem and
5252
what follows is the filepath).
5353

5454
.. ipython:: python
5555
5656
from functools import partial
5757
from tempfile import TemporaryDirectory
58-
from storefact import get_store_from_url
58+
from minimalkv import get_store_from_url
5959
6060
dataset_dir = TemporaryDirectory()
6161
6262
store_url = f"hfs://{dataset_dir.name}"
6363
6464
.. admonition:: Storage locations
6565

66-
`storefact`_ offers support for several stores in Kartothek, these can be created using the
67-
function `storefact.get_store_from_url` with one of the following prefixes:
66+
`minimalkv`_ offers support for several stores in Kartothek; these can be created using the
67+
function `minimalkv.get_store_from_url` with one of the following prefixes:
6868

6969
- ``hfs``: Local filesystem
7070
- ``hazure``: AzureBlockBlobStorage
@@ -74,15 +74,11 @@ Interface
7474
---------
7575

7676
Kartothek can write to any location that
77-
fulfills the `simplekv.KeyValueStore interface
78-
<https://simplekv.readthedocs.io/en/latest/#simplekv.KeyValueStore>`_ as long as they
79-
support `ExtendedKeyspaceMixin
80-
<https://github.com/mbr/simplekv/search?q=%22class+ExtendedKeyspaceMixin%22&unscoped_q=%22class+ExtendedKeyspaceMixin%22>`_
77+
fulfills the `minimalkv.KeyValueStore interface
78+
<https://minimalkv.readthedocs.io/en/latest/#minimalkv.KeyValueStore>`_ as long as they
79+
support ``ExtendedKeyspaceMixin``
8180
(this is necessary so that ``/`` can be used in the storage key name).
8281

83-
For more information, take a look out at the `storefact documentation
84-
<https://storefact.readthedocs.io/en/latest/reference/storefact.html>`_.
85-
8682

8783
Writing data to storage
8884
=======================
@@ -232,5 +228,5 @@ function but returns a collection of ``dask.delayed`` objects.
232228
233229
read_table("a_unique_dataset_identifier", store_url, predicates=[[("A", "<", 2.5)]])
234230
235-
.. _storefact: https://github.com/blue-yonder/storefact
231+
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
236232
.. _dask: https://docs.dask.org/en/latest/

docs/guide/mutating_datasets.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ some data there with Kartothek.
1919
import pandas as pd
2020
from functools import partial
2121
from tempfile import TemporaryDirectory
22-
from storefact import get_store_from_url
22+
from minimalkv import get_store_from_url
2323
2424
from kartothek.api.dataset import store_dataframes_as_dataset
2525
@@ -236,7 +236,7 @@ When garbage collection is called, the files are removed.
236236
.. ipython:: python
237237
238238
from kartothek.api.dataset import garbage_collect_dataset
239-
from storefact import get_store_from_url
239+
from minimalkv import get_store_from_url
240240
241241
store = get_store_from_url(store_url)
242242
@@ -246,7 +246,7 @@ When garbage collection is called, the files are removed.
246246
247247
files_before.difference(store.keys()) # Show files removed
248248
249-
.. _storefact: https://github.com/blue-yonder/storefact
249+
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
250250

251251

252252
Mutating indexed datasets

docs/guide/partitioning.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ first and store the data there with Kartothek:
3030
import pandas as pd
3131
from functools import partial
3232
from tempfile import TemporaryDirectory
33-
from storefact import get_store_from_url
33+
from minimalkv import get_store_from_url
3434
3535
from kartothek.api.dataset import store_dataframes_as_dataset
3636

docs/spec/store_interface.rst

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
KeyValueStore Interface
55
=======================
66

7-
All storage interaction use ``simplekv.KeyValueStore`` as an storage layer
7+
All storage interactions use ``minimalkv.KeyValueStore`` as a storage layer
88
abstraction. This allows convenient access to many different common Key-Value
99
stores (ABS, S3, GCS, local filesystem, etc.) and allows an easy switch between
1010
the storage backends to facilitate a simpler test setup.
@@ -13,7 +13,7 @@ Generally, all of our public functions accepting a ``store`` argument accept a
1313
multitude of different input types and we generally accept all kinds of stores
1414
inheriting from ``KeyValueStore``, assuming they implement the pickle protocol.
1515
However, there are storages which simply cannot be distributed across processes
16-
or network nodes sensibly. A prime Example is the ``simplekv.memory.DictStore``
16+
or network nodes sensibly. A prime example is the ``minimalkv.memory.DictStore``
1717
which uses a simple python dictionary as a backend store. It is technically
1818
possible to (de-)serialize the store but once it is deserialized in another
1919
process, or another node, the store loses its meaning since the stores are
@@ -25,9 +25,8 @@ protocol, or some more complex logic is required to initialize it, kartothek
2525
also accepts *factories* which must be a callable returning a ``KeyValueStore``
2626
(see also ``kartothek.core.typing.StoreFactory``).
2727

28-
For convenience we also offer a `storefact`_ integration and accept store urls
28+
For convenience we also offer a `minimalkv`_ integration that accepts store urls
2929
which provides another easy level of access and is well suited for ad-hoc
3030
investigations.
3131

32-
.. _simplekv: https://simplekv.readthedocs.io/
33-
.. _storefact: https://storefact.readthedocs.io/
32+
.. _minimalkv: https://minimalkv.readthedocs.io/

kartothek/cli/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
.. important::
55
This module does not contain any public APIs.
66
7-
Kartothek comes with a CLI tool named ``kartothek_cube``. To use it, create an YAML file that contains a dictionary of `storefact`_
7+
Kartothek comes with a CLI tool named ``kartothek_cube``. To use it, create a YAML file that contains a dictionary of `minimalkv`_
88
stores (keys are names of the store and the values are dicts that contain the store config). ``Kartothek`` uses a `YAML`_
99
file called ``skv.yml`` and a store called ``dataset`` by default, but you may pass ``--skv`` and ``--store`` to change
1010
these. An example file could look like:
@@ -30,7 +30,7 @@
3030
3131
3232
.. _Dask: https://docs.dask.org/
33-
.. _storefact: https://github.com/blue-yonder/storefact
33+
.. _minimalkv: https://github.com/data-engineering-collective/minimalkv
3434
.. _YAML: https://yaml.org/
3535
"""
3636
import logging

kartothek/cli/_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from functools import partial
33

44
import click
5-
import storefact
5+
import minimalkv
66
import yaml
77

88
from kartothek.api.discover import discover_cube
@@ -18,7 +18,7 @@ def get_cube(store, uuid_prefix):
1818
----------
1919
uuid_prefix: str
2020
Dataset UUID prefix.
21-
store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
21+
store: Union[Callable[[], minimalkv.KeyValueStore], minimalkv.KeyValueStore]
2222
KV store.
2323
2424
Returns
@@ -41,18 +41,18 @@ def get_cube(store, uuid_prefix):
4141

4242
def get_store(skv, store):
4343
"""
44-
Get simplekv store from storefact config file.
44+
Get minimalkv store from minimalkv config file.
4545
4646
Parameters
4747
----------
4848
skv: str
49-
Name of the storefact yaml. Normally ``'skv.yml'``.
49+
Name of the minimalkv yaml. Normally ``'skv.yml'``.
5050
store: str
5151
ID of the store.
5252
5353
Returns
5454
-------
55-
store_factory: Callable[[], simplekv.KeyValueStore]
55+
store_factory: Callable[[], minimalkv.KeyValueStore]
5656
Store object.
5757
5858
Raises
@@ -73,7 +73,7 @@ def get_store(skv, store):
7373
"Could not find store {store} in {skv}".format(store=store, skv=skv)
7474
)
7575

76-
return partial(storefact.get_store, **store_cfg[store])
76+
return partial(minimalkv.get_store, **store_cfg[store])
7777

7878

7979
def _match_pattern(what, items, pattern):

kartothek/core/common_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import pyarrow as pa
1313
import pyarrow.parquet as pq
1414
import simplejson
15-
from simplekv import KeyValueStore
15+
from minimalkv import KeyValueStore
1616

1717
from kartothek.core import naming
1818
from kartothek.core._compat import load_json

kartothek/core/docs.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77

88
_PARAMETER_MAPPING = {
99
"store": """
10-
store: Callable or str or simplekv.KeyValueStore
10+
store: Callable or str or minimalkv.KeyValueStore
1111
The store where we can find or store the dataset.
1212
13-
Can be either ``simplekv.KeyValueStore``, a storefact store url or a
14-
generic Callable producing a ``simplekv.KeyValueStore``""",
13+
Can be either ``minimalkv.KeyValueStore``, a minimalkv store url or a
14+
generic Callable producing a ``minimalkv.KeyValueStore``""",
1515
"overwrite": """
1616
overwrite: Optional[bool]
1717
If True, allow overwrite of an existing dataset.""",
@@ -70,12 +70,12 @@
7070
`merge_datasets__pipeline` key that contains the source dataset uuids for
7171
the merge.""",
7272
"output_store": """
73-
output_store : Union[Callable, str, simplekv.KeyValueStore]
73+
output_store : Union[Callable, str, minimalkv.KeyValueStore]
7474
If given, the resulting dataset is written to this store. By default
7575
the input store.
7676
77-
Can be either `simplekv.KeyValueStore`, a storefact store url or a
78-
generic Callable producing a ``simplekv.KeyValueStore``""",
77+
Can be either `minimalkv.KeyValueStore`, a minimalkv store url or a
78+
generic Callable producing a ``minimalkv.KeyValueStore``""",
7979
"metadata": """
8080
metadata : Optional[Dict]
8181
A dictionary used to update the dataset metadata.""",

kartothek/core/factory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from kartothek.core.utils import lazy_store
99

1010
if TYPE_CHECKING:
11-
from simplekv import KeyValueStore
11+
from minimalkv import KeyValueStore
1212

1313
__all__ = ("DatasetFactory",)
1414

@@ -38,7 +38,7 @@ def __init__(
3838
.. code::
3939
4040
from functools import partial
41-
from storefact import get_store_from_url
41+
from minimalkv import get_store_from_url
4242
from kartothek.io.eager import read_table
4343
4444
ds_factory = DatasetFactory(

0 commit comments

Comments
 (0)