Skip to content

Commit fc49895

Browse files
feat: polars implementation of table (#744)
Closes #638 Closes #641 Closes #649 Closes #712 ### Summary of Changes Implement our table using polars as backend. --------- Co-authored-by: megalinter-bot <[email protected]>
1 parent 0564b52 commit fc49895

File tree

63 files changed

+5011
-465
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+5011
-465
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from timeit import timeit
2+
3+
from safeds.data.tabular.containers import ExperimentalTable
4+
5+
from benchmarks.table.utils import create_synthetic_table_polars
6+
7+
REPETITIONS = 10
8+
9+
10+
def _run_remove_columns_with_missing_values() -> None:
    """Benchmark `remove_columns_with_missing_values` on the module-level `table` and collect the result."""
    result = table.remove_columns_with_missing_values()
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
12+
13+
14+
def _run_remove_non_numeric_columns() -> None:
    """Benchmark `remove_non_numeric_columns` on the module-level `table` and collect the result."""
    result = table.remove_non_numeric_columns()
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
16+
17+
18+
def _run_summarize_statistics() -> None:
    """Benchmark `summarize_statistics` on the module-level `table` and collect the result."""
    result = table.summarize_statistics()
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
20+
21+
22+
if __name__ == "__main__":
    # Create the synthetic table once; the benchmark functions read it as a module-level global
    table = create_synthetic_table_polars(100, 5000)

    # Map each reported method name to the function that exercises it (insertion order = run order)
    benchmarks = {
        "remove_columns_with_missing_values": _run_remove_columns_with_missing_values,
        "remove_non_numeric_columns": _run_remove_non_numeric_columns,
        "summarize_statistics": _run_summarize_statistics,
    }

    # Run each benchmark REPETITIONS times; timeit returns the total time for all repetitions
    timings: dict[str, float] = {
        name: timeit(benchmark, number=REPETITIONS) for name, benchmark in benchmarks.items()
    }

    # Print the timings as a table with one row per benchmarked method
    results = ExperimentalTable(
        {
            "method": list(timings.keys()),
            "timing": list(timings.values()),
        },
    )
    print(results)

benchmarks/table/row_operations_polars.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from timeit import timeit
22

3-
from safeds.data.tabular.containers import Table
3+
import polars as pl
4+
5+
from safeds.data.tabular.containers import ExperimentalTable
46

57
from benchmarks.table.utils import create_synthetic_table_polars
68

@@ -15,14 +17,18 @@ def _run_remove_rows_with_missing_values() -> None:
1517
table.remove_rows_with_missing_values()._lazy_frame.collect()
1618

1719

18-
# def _run_remove_rows_with_outliers() -> None:
19-
# table.remove_rows_with_outliers()
20+
def _run_remove_rows_with_outliers() -> None:
    """
    Benchmark `remove_rows_with_outliers` on the shared `table`.

    The lazy frame of the returned table is collected, as in the other benchmarks of this file, so that any
    deferred work is part of the measured time instead of being skipped.
    """
    table.remove_rows_with_outliers()._lazy_frame.collect()
2022

2123

2224
def _run_remove_rows() -> None:
    """Benchmark `remove_rows` with a row predicate that is true when the "column_0" value is even."""
    result = table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
2426

2527

28+
def _run_remove_rows_by_column() -> None:
    """Benchmark `remove_rows_by_column` on "column_0" with a cell predicate that is true for even values."""
    result = table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
30+
31+
2632
def _run_shuffle_rows() -> None:
    """Benchmark `shuffle_rows` on the shared `table` and collect the result."""
    result = table.shuffle_rows()
    # Collect the lazy frame so the timing covers the evaluation, not just building the query
    result._lazy_frame.collect()
2834

@@ -63,14 +69,18 @@ def _run_transform_column() -> None:
6369
_run_remove_rows_with_missing_values,
6470
number=REPETITIONS,
6571
),
66-
# "remove_rows_with_outliers": timeit(
67-
# _run_remove_rows_with_outliers,
68-
# number=REPETITIONS,
69-
# ),
72+
"remove_rows_with_outliers": timeit(
73+
_run_remove_rows_with_outliers,
74+
number=REPETITIONS,
75+
),
7076
"remove_rows": timeit(
7177
_run_remove_rows,
7278
number=REPETITIONS,
7379
),
80+
"remove_rows_by_column": timeit(
81+
_run_remove_rows_by_column,
82+
number=REPETITIONS,
83+
),
7484
"shuffle_rows": timeit(
7585
_run_shuffle_rows,
7686
number=REPETITIONS,
@@ -98,11 +108,14 @@ def _run_transform_column() -> None:
98108
}
99109

100110
# Print the timings
101-
print(
102-
Table(
103-
{
104-
"method": list(timings.keys()),
105-
"timing": list(timings.values()),
106-
}
111+
with pl.Config(
112+
tbl_rows=-1,
113+
):
114+
print(
115+
ExperimentalTable(
116+
{
117+
"method": list(timings.keys()),
118+
"timing": list(timings.values()),
119+
}
120+
)
107121
)
108-
)

benchmarks/table/utils/create_synthetic_table.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ def create_synthetic_table(
1010
min_value: int = 0,
1111
max_value: int = 1000,
1212
) -> Table:
13-
"""Create a synthetic Table with random numerical data.
13+
"""
14+
Create a synthetic Table with random numerical data.
1415
1516
Parameters
1617
----------

benchmarks/table/utils/create_synthetic_table_polars.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ def create_synthetic_table_polars(
1010
min_value: int = 0,
1111
max_value: int = 1000,
1212
) -> ExperimentalTable:
13-
"""Create a synthetic Table with random numerical data.
13+
"""
14+
Create a synthetic Table with random numerical data.
1415
1516
Parameters
1617
----------

poetry.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ matplotlib = "^3.6.3"
2020
openpyxl = "^3.1.2"
2121
pandas = "^2.0.0"
2222
pillow = ">=9.5,<11.0"
23-
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"}
23+
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.25"}
2424
scikit-learn = "^1.2.0"
2525
seaborn = "^0.13.0"
2626
statsmodels = "^0.14.1"

src/resources/from_json_file_2.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"columns": [
3+
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
4+
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
5+
]
6+
}
1.65 KB
Binary file not shown.

src/resources/to_excel_file.xlsx

1 Byte
Binary file not shown.

src/resources/to_json_file_2.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"columns": [
3+
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
4+
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
5+
]
6+
}

src/resources/to_parquet_file.parquet

1.65 KB
Binary file not shown.

src/safeds/_config/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,19 @@
55
import apipkg
66

77
if TYPE_CHECKING:
8-
from ._device import _get_device, _init_default_device
8+
from ._torch import _get_device, _init_default_device, _set_default_device
99

1010
apipkg.initpkg(
1111
__name__,
1212
{
13-
"_get_device": "._device:_get_device",
14-
"_init_default_device": "._device:_init_default_device",
13+
"_get_device": "._torch:_get_device",
14+
"_init_default_device": "._torch:_init_default_device",
15+
"_set_default_device": "._torch:_set_default_device",
1516
},
1617
)
1718

1819
__all__ = [
1920
"_get_device",
2021
"_init_default_device",
22+
"_set_default_device",
2123
]

src/safeds/_config/_polars.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
if TYPE_CHECKING:
6+
import polars as pl
7+
8+
9+
def _get_polars_config() -> pl.Config:
    """
    Create the polars configuration used by this package.

    The configuration sets a float precision of 5, right-aligns numeric cells, selects the "ASCII_FULL_CONDENSED"
    table formatting, and hides the data frame shape. NOTE(review): presumably callers use it as a context manager
    when rendering tables as text - confirm against the call sites.

    Returns
    -------
    config:
        A new `polars.Config` object with the settings listed above.
    """
    # Imported locally; the module-level import of polars only exists for type checking
    import polars as pl

    config = pl.Config(
        float_precision=5,
        tbl_cell_numeric_alignment="RIGHT",
        tbl_formatting="ASCII_FULL_CONDENSED",
        tbl_hide_dataframe_shape=True,
    )
    return config

src/safeds/_config/_device.py renamed to src/safeds/_config/_torch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def _get_device() -> Device:
1818
def _init_default_device() -> None:
1919
import torch
2020

21-
global _default_device
21+
global _default_device # noqa: PLW0603
2222

2323
if _default_device is None:
2424
_default_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
@@ -28,7 +28,7 @@ def _init_default_device() -> None:
2828

2929
def _set_default_device(device: Device) -> None:
3030
# This changes all future tensors, but not any tensor that already exists
31-
global _default_device
31+
global _default_device # noqa: PLW0603
3232

3333
_default_device = device
3434
_init_default_device()

src/safeds/_utils/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,22 @@
77
if TYPE_CHECKING:
88
from ._file_io import _check_and_normalize_file_path
99
from ._hashing import _structural_hash
10+
from ._plotting import _figure_to_image
11+
from ._random import _get_random_seed
1012

1113
apipkg.initpkg(
1214
__name__,
1315
{
1416
"_check_and_normalize_file_path": "._file_io:_check_and_normalize_file_path",
1517
"_structural_hash": "._hashing:_structural_hash",
18+
"_figure_to_image": "._plotting:_figure_to_image",
19+
"_get_random_seed": "._random:_get_random_seed",
1620
},
1721
)
1822

1923
__all__ = [
2024
"_check_and_normalize_file_path",
2125
"_structural_hash",
26+
"_figure_to_image",
27+
"_get_random_seed",
2228
]

src/safeds/_utils/_plotting.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from __future__ import annotations
2+
3+
import io
4+
from typing import TYPE_CHECKING
5+
6+
from safeds.data.image.containers import Image
7+
8+
if TYPE_CHECKING:
9+
import matplotlib.pyplot as plt
10+
11+
12+
def _figure_to_image(figure: plt.Figure) -> Image:
    """
    Store the figure as an image and close it.

    The figure is rendered as a PNG into an in-memory buffer. It is closed afterward, even if rendering fails, so
    it is neither kept open by pyplot nor displayed directly.

    Parameters
    ----------
    figure:
        The figure to store.

    Returns
    -------
    image:
        The figure as an image.
    """
    # Imported locally; the module-level import of pyplot only exists for type checking
    import matplotlib.pyplot as plt

    buffer = io.BytesIO()
    try:
        figure.savefig(buffer, format="png")
    finally:
        # Prevents the figure from being displayed directly and releases it even when savefig raises
        plt.close(figure)

    # getvalue returns the whole buffer regardless of the current stream position
    return Image.from_bytes(buffer.getvalue())

src/safeds/data/labeled/containers/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,23 @@
55
import apipkg
66

77
if TYPE_CHECKING:
8+
from ._experimental_tabular_dataset import ExperimentalTabularDataset
89
from ._image_dataset import ImageDataset
910
from ._tabular_dataset import TabularDataset
1011
from ._time_series_dataset import TimeSeriesDataset
1112

1213
apipkg.initpkg(
1314
__name__,
1415
{
16+
"ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset",
1517
"ImageDataset": "._image_dataset:ImageDataset",
1618
"TabularDataset": "._tabular_dataset:TabularDataset",
1719
"TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset",
1820
},
1921
)
2022

2123
__all__ = [
24+
"ExperimentalTabularDataset",
2225
"ImageDataset",
2326
"TabularDataset",
2427
"TimeSeriesDataset",

0 commit comments

Comments
 (0)