Skip to content

Commit ce0cd8c

Browse files
committed
Fixed #1058 Added support for Polars
1 parent 4416873 commit ce0cd8c

File tree

10 files changed

+56
-24
lines changed

10 files changed

+56
-24
lines changed

environment.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ dependencies:
2121
- pydata-sphinx-theme>=0.5.2
2222
- scikit-learn>=0.21.3
2323
- numpydoc>=1.1.0
24-
- build>=0.7.0
24+
- python-build>=0.7.0
2525
- pytest-check-links>=0.7.1
2626
- isort>=5.11.0
2727
- jupyterlab-myst>=2.0.0
2828
- myst-nb>=1.0.0
29+
- polars>=1.14.0

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ ci = [
5252
"black >= 22.1.0",
5353
"pytest >= 4.4.1",
5454
"isort >= 5.11.0",
55-
'tbb >= 2019.5 ; platform_system == "Linux"'
55+
'tbb >= 2019.5 ; platform_system == "Linux"',
56+
"polars >= 1.14.0"
5657
]
5758

5859
[project.urls]

stumpy/core.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -445,25 +445,26 @@ def check_dtype(a, dtype=np.float64): # pragma: no cover
445445

446446
def transpose_dataframe(df): # pragma: no cover
447447
"""
448-
Check if the input is a column-wise Pandas `DataFrame`. If `True`, return a
448+
Check if the input is a column-wise pandas/polars `DataFrame`. If `True`, return a
449449
transpose dataframe since stumpy assumes that each row represents data from a
450450
different dimension while each column represents data from the same dimension.
451-
If `False`, return `a` unchanged. Pandas `Series` do not need to be transposed.
451+
If `False`, return `df` unchanged. Pandas/polars `Series` do not need to be
452+
transposed.
452453
453454
Note that this function has zero dependency on pandas or polars (not even soft).
454455
455456
Parameters
456457
----------
457-
df : numpy.ndarray
458-
Pandas dataframe
458+
df : DataFrame
459+
pandas/polars dataframe
459460
460461
Returns
461462
-------
462463
output : df
463464
The transpose of `df` if it is a pandas/polars `DataFrame`; otherwise `df`
464465
"""
465466
if type(df).__name__ == "DataFrame":
466-
return df.T
467+
return df.transpose()
467468

468469
return df
469470

@@ -2062,8 +2063,16 @@ def _preprocess(T, copy=True):
20622063
Modified time series
20632064
"""
20642065
if copy:
2065-
T = T.copy()
2066+
try:
2067+
T = T.copy()
2068+
except AttributeError: # Polars copy
2069+
T = T.clone()
2070+
20662071
T = transpose_dataframe(T)
2072+
2073+
if "polars" in str(type(T)):
2074+
T = T.to_numpy(writable=True)
2075+
20672076
T = np.asarray(T)
20682077
check_dtype(T)
20692078

stumpy/maamp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -879,8 +879,8 @@ def maamp(T, m, include=None, discords=False, p=2.0):
879879
----------
880880
T : numpy.ndarray
881881
The time series or sequence for which to compute the multi-dimensional
882-
matrix profile. Each row in `T` represents data from a different
883-
dimension while each column in `T` represents data from the same
882+
matrix profile. Each row in `T` represents data from the same
883+
dimension while each column in `T` represents data from a different
884884
dimension.
885885
886886
m : int

stumpy/maamped.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ def _dask_maamped(
3939
4040
T_A : numpy.ndarray
4141
The time series or sequence for which to compute the multi-dimensional
42-
matrix profile. Each row in `T_A` represents data from a different
43-
dimension while each column in `T_A` represents data from the same
42+
matrix profile. Each row in `T_A` represents data from the same
43+
dimension while each column in `T_A` represents data from a different
4444
dimension.
4545
4646
T_B : numpy.ndarray
@@ -194,8 +194,8 @@ def _ray_maamped(
194194
195195
T_A : numpy.ndarray
196196
The time series or sequence for which to compute the multi-dimensional
197-
matrix profile. Each row in `T_A` represents data from a different
198-
dimension while each column in `T_A` represents data from the same
197+
matrix profile. Each row in `T_A` represents data from the same
198+
dimension while each column in `T_A` represents data from a different
199199
dimension.
200200
201201
T_B : numpy.ndarray
@@ -335,8 +335,8 @@ def maamped(client, T, m, include=None, discords=False, p=2.0):
335335
336336
T : numpy.ndarray
337337
The time series or sequence for which to compute the multi-dimensional
338-
matrix profile. Each row in `T` represents data from a different
339-
dimension while each column in `T` represents data from the same
338+
matrix profile. Each row in `T` represents data from the same
339+
dimension while each column in `T` represents data from a different
340340
dimension.
341341
342342
m : int

stumpy/mstump.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,8 +1126,8 @@ def mstump(
11261126
----------
11271127
T : numpy.ndarray
11281128
The time series or sequence for which to compute the multi-dimensional
1129-
matrix profile. Each row in ``T`` represents data from a different
1130-
dimension while each column in ``T`` represents data from the same
1129+
matrix profile. Each row in ``T`` represents data from the same
1130+
dimension while each column in ``T`` represents data from a different
11311131
dimension.
11321132
11331133
m : int

stumpy/mstumped.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ def _dask_mstumped(
4343
4444
T_A : numpy.ndarray
4545
The time series or sequence for which to compute the multi-dimensional
46-
matrix profile. Each row in `T_A` represents data from a different
47-
dimension while each column in `T_A` represents data from the same
46+
matrix profile. Each row in `T_A` represents data from the same
47+
dimension while each column in `T_A` represents data from a different
4848
dimension.
4949
5050
T_B : numpy.ndarray
@@ -216,8 +216,8 @@ def _ray_mstumped(
216216
217217
T_A : numpy.ndarray
218218
The time series or sequence for which to compute the multi-dimensional
219-
matrix profile. Each row in `T_A` represents data from a different
220-
dimension while each column in `T_A` represents data from the same
219+
matrix profile. Each row in `T_A` represents data from the same
220+
dimension while each column in `T_A` represents data from a different
221221
dimension.
222222
223223
T_B : numpy.ndarray
@@ -387,8 +387,8 @@ def mstumped(
387387
388388
T : numpy.ndarray
389389
The time series or sequence for which to compute the multi-dimensional
390-
matrix profile. Each row in ``T`` represents data from a different
391-
dimension while each column in ``T`` represents data from the same
390+
matrix profile. Each row in ``T`` represents data from the same
391+
dimension while each column in ``T`` represents data from a different
392392
dimension.
393393
394394
m : int

tests/test_mstump.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import numpy.testing as npt
66
import pandas as pd
7+
import polars as pl
78
import pytest
89

910
from stumpy import config, core, mdl, mstump, subspace
@@ -305,6 +306,12 @@ def test_mstump_wrapper(T, m):
305306
npt.assert_almost_equal(ref_P, comp_P)
306307
npt.assert_almost_equal(ref_I, comp_I)
307308

309+
df = pl.DataFrame(T.T)
310+
comp_P, comp_I = mstump(df, m)
311+
312+
npt.assert_almost_equal(ref_P, comp_P)
313+
npt.assert_almost_equal(ref_I, comp_I)
314+
308315

309316
@pytest.mark.parametrize("T, m", test_data)
310317
def test_mstump_wrapper_include(T, m):

tests/test_stump.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import numpy.testing as npt
66
import pandas as pd
7+
import polars as pl
78
import pytest
89

910
from stumpy import config, stump
@@ -42,6 +43,10 @@ def test_stump_self_join(T_A, T_B):
4243
naive.replace_inf(comp_mp)
4344
npt.assert_almost_equal(ref_mp, comp_mp)
4445

46+
comp_mp = stump(pl.Series(T_B), m, ignore_trivial=True)
47+
naive.replace_inf(comp_mp)
48+
npt.assert_almost_equal(ref_mp, comp_mp)
49+
4550

4651
@pytest.mark.parametrize("T_A, T_B", test_data)
4752
def test_stump_A_B_join(T_A, T_B):
@@ -56,6 +61,10 @@ def test_stump_A_B_join(T_A, T_B):
5661
naive.replace_inf(comp_mp)
5762
npt.assert_almost_equal(ref_mp, comp_mp)
5863

64+
comp_mp = stump(pl.Series(T_A), m, pl.Series(T_B), ignore_trivial=False)
65+
naive.replace_inf(comp_mp)
66+
npt.assert_almost_equal(ref_mp, comp_mp)
67+
5968

6069
def test_stump_constant_subsequence_self_join():
6170
T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64)))

tests/test_stumped.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import numpy.testing as npt
66
import pandas as pd
7+
import polars as pl
78
import pytest
89
from dask.distributed import Client, LocalCluster
910

@@ -75,6 +76,10 @@ def test_stumped_self_join_df(T_A, T_B, dask_cluster):
7576
naive.replace_inf(comp_mp)
7677
npt.assert_almost_equal(ref_mp, comp_mp)
7778

79+
comp_mp = stumped(dask_client, pl.Series(T_B), m, ignore_trivial=True)
80+
naive.replace_inf(comp_mp)
81+
npt.assert_almost_equal(ref_mp, comp_mp)
82+
7883

7984
@pytest.mark.filterwarnings("ignore:numpy.dtype size changed")
8085
@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")

0 commit comments

Comments
 (0)