Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Feb 1, 2025
2 parents 044db09 + 562b213 commit 35c2a2e
Show file tree
Hide file tree
Showing 48 changed files with 1,323 additions and 734 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ jobs:
- name: griffe
# hopefully temporary until https://github.com/mkdocstrings/mkdocstrings/issues/716
run: pip install git+https://github.com/MarcoGorelli/griffe.git@no-overloads
- run: pip install -e .[docs,pyspark,dask,duckdb]
- run: pip install -e .[docs,dask,duckdb]

- run: mkdocs gh-deploy --force
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ Join the party!
- [marimo](https://github.com/marimo-team/marimo)
- [panel-graphic-walker](https://github.com/panel-extensions/panel-graphic-walker)
- [plotly](https://plotly.com)
- [pointblank](https://github.com/posit-dev/pointblank)
- [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects)
- [py-shiny](https://github.com/posit-dev/py-shiny)
- [rio](https://github.com/rio-labs/rio)
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/selectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ set operations are supported:
- boolean
- by_dtype
- categorical
- matches
- numeric
- string
show_root_heading: false
Expand Down
1 change: 1 addition & 0 deletions docs/ecosystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ for their dataframe interoperability needs:
* [marimo](https://github.com/marimo-team/marimo)
* [panel-graphic-walker](https://github.com/panel-extensions/panel-graphic-walker)
* [plotly](https://github.com/plotly/plotly.py)
* [pointblank](https://github.com/posit-dev/pointblank)
* [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects)
* [py-shiny](https://github.com/posit-dev/py-shiny)
* [rio](https://github.com/rio-labs/rio)
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
'1.24.0'
'1.24.1'
```

If you see the version number, then the installation was successful!
Expand Down
2 changes: 1 addition & 1 deletion narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.24.0"
__version__ = "1.24.1"

__all__ = [
"Array",
Expand Down
4 changes: 2 additions & 2 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def simple_select(self, *column_names: str) -> Self:
return self._from_native_frame(self._native_frame.select(list(column_names)))

def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self:
new_series: list[ArrowSeries] = evaluate_into_exprs(self)(*exprs, **named_exprs)
new_series: list[ArrowSeries] = evaluate_into_exprs(self, *exprs, **named_exprs)
if not new_series:
# return empty dataframe, like Polars does
return self._from_native_frame(self._native_frame.__class__.from_arrays([]))
Expand All @@ -306,7 +306,7 @@ def with_columns(
self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr
) -> Self:
native_frame = self._native_frame
new_columns: list[ArrowSeries] = evaluate_into_exprs(self)(*exprs, **named_exprs)
new_columns: list[ArrowSeries] = evaluate_into_exprs(self, *exprs, **named_exprs)

length = len(self)
columns = self.columns
Expand Down
19 changes: 19 additions & 0 deletions narwhals/_arrow/selectors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Sequence
Expand Down Expand Up @@ -43,6 +44,24 @@ def evalute_output_names(df: ArrowDataFrame) -> Sequence[str]:
kwargs={"dtypes": dtypes},
)

def matches(self: Self, pattern: str) -> ArrowSelector:
    """Build a selector for the columns whose name matches ``pattern``.

    A column is kept when ``re.search(pattern, name)`` finds a match, so the
    pattern may match anywhere in the name unless it is anchored.

    Arguments:
        pattern: Regular expression tested against every column name.

    Returns:
        An ``ArrowSelector`` yielding the matching columns.
    """

    def matching_names(df: ArrowDataFrame) -> Sequence[str]:
        # Iterating `df.columns` keeps the frame's own column order.
        return [name for name in df.columns if re.search(pattern, name)]

    def select_matching(df: ArrowDataFrame) -> list[ArrowSeries]:
        return [df[name] for name in matching_names(df)]

    return ArrowSelector(
        select_matching,
        depth=0,
        function_name="selector",
        evaluate_output_names=matching_names,
        alias_output_names=None,
        backend_version=self._backend_version,
        version=self._version,
        kwargs={"pattern": pattern},
    )

def numeric(self: Self) -> ArrowSelector:
dtypes = import_dtypes_module(self._version)
return self.by_dtype(
Expand Down
4 changes: 2 additions & 2 deletions narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _from_native_frame(self: Self, df: Any) -> Self:

def with_columns(self: Self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self:
df = self._native_frame
new_series = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
df = df.assign(**new_series)
return self._from_native_frame(df)

Expand Down Expand Up @@ -115,7 +115,7 @@ def simple_select(self: Self, *column_names: str) -> Self:
)

def select(self: Self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self:
new_series = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)

if not new_series:
# return empty dataframe, like Polars does
Expand Down
35 changes: 29 additions & 6 deletions narwhals/_dask/selectors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Sequence

from narwhals._dask.expr import DaskExpr
Expand All @@ -27,7 +29,7 @@ def __init__(
self._backend_version = backend_version
self._version = version

def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector:
def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> DaskSelector:
def func(df: DaskLazyFrame) -> list[dx.Series]:
return [
df._native_frame[col] for col in df.columns if df.schema[col] in dtypes
Expand All @@ -48,10 +50,31 @@ def evalute_output_names(df: DaskLazyFrame) -> Sequence[str]:
kwargs={},
)

def matches(self: Self, pattern: str) -> DaskSelector:
    """Build a selector for the columns whose name matches ``pattern``.

    A column is kept when ``re.search(pattern, name)`` finds a match, so the
    pattern may match anywhere in the name unless it is anchored.

    Arguments:
        pattern: Regular expression tested against every column name.

    Returns:
        A ``DaskSelector`` yielding the matching native columns.
    """

    def matching_names(df: DaskLazyFrame) -> Sequence[str]:
        # Iterating `df.columns` keeps the frame's own column order.
        return [name for name in df.columns if re.search(pattern, name)]

    def select_matching(df: DaskLazyFrame) -> list[dx.Series]:
        native = df._native_frame
        return [native[name] for name in matching_names(df)]

    return DaskSelector(
        select_matching,
        depth=0,
        function_name="selector",
        evaluate_output_names=matching_names,
        alias_output_names=None,
        backend_version=self._backend_version,
        returns_scalar=False,
        version=self._version,
        kwargs={},
    )

def numeric(self: Self) -> DaskSelector:
dtypes = import_dtypes_module(self._version)
return self.by_dtype(
[
{
dtypes.Int128,
dtypes.Int64,
dtypes.Int32,
Expand All @@ -64,20 +87,20 @@ def numeric(self: Self) -> DaskSelector:
dtypes.UInt8,
dtypes.Float64,
dtypes.Float32,
],
},
)

def categorical(self: Self) -> DaskSelector:
dtypes = import_dtypes_module(self._version)
return self.by_dtype([dtypes.Categorical])
return self.by_dtype({dtypes.Categorical})

def string(self: Self) -> DaskSelector:
dtypes = import_dtypes_module(self._version)
return self.by_dtype([dtypes.String])
return self.by_dtype({dtypes.String})

def boolean(self: Self) -> DaskSelector:
dtypes = import_dtypes_module(self._version)
return self.by_dtype([dtypes.Boolean])
return self.by_dtype({dtypes.Boolean})

def all(self: Self) -> DaskSelector:
def func(df: DaskLazyFrame) -> list[dx.Series]:
Expand Down
50 changes: 23 additions & 27 deletions narwhals/_dask/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Callable

from narwhals._expression_parsing import evaluate_output_names_and_aliases
from narwhals._pandas_like.utils import select_columns_by_name
Expand Down Expand Up @@ -45,32 +44,29 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any:
return obj


def parse_exprs_and_named_exprs(df: DaskLazyFrame) -> Callable[..., dict[str, dx.Series]]:
def func(*exprs: DaskExpr, **named_exprs: DaskExpr) -> dict[str, dx.Series]:
native_results: dict[str, dx.Series] = {}
for expr in exprs:
native_series_list = expr._call(df)
return_scalar = getattr(expr, "_returns_scalar", False)
_, aliases = evaluate_output_names_and_aliases(expr, df, [])
if len(aliases) != len(native_series_list): # pragma: no cover
msg = f"Internal error: got aliases {aliases}, but only got {len(native_series_list)} results"
raise AssertionError(msg)
for native_series, alias in zip(native_series_list, aliases):
native_results[alias] = (
native_series[0] if return_scalar else native_series
)
for name, value in named_exprs.items():
native_series_list = value._call(df)
if len(native_series_list) != 1: # pragma: no cover
msg = "Named expressions must return a single column"
raise AssertionError(msg)
return_scalar = getattr(value, "_returns_scalar", False)
native_results[name] = (
native_series_list[0][0] if return_scalar else native_series_list[0]
)
return native_results

return func
def parse_exprs_and_named_exprs(
    df: DaskLazyFrame, /, *exprs: DaskExpr, **named_exprs: DaskExpr
) -> dict[str, dx.Series]:
    """Evaluate expressions against ``df`` and collect the results by name.

    Positional expressions may each produce several columns; every result is
    stored under the alias reported by ``evaluate_output_names_and_aliases``.
    Keyword expressions must produce exactly one column, which is stored
    under the keyword. When an expression has a truthy ``_returns_scalar``
    attribute, element ``[0]`` of each result is stored instead of the
    result itself.

    Arguments:
        df: Frame the expressions are evaluated against (positional-only).
        *exprs: Expressions whose output names come from their aliases.
        **named_exprs: Expressions whose output name is the keyword.

    Returns:
        Mapping from output column name to evaluated result. Later entries
        overwrite earlier ones that share a name.

    Raises:
        AssertionError: If the number of aliases differs from the number of
            results, or a keyword expression does not yield one column.
    """
    collected: dict[str, dx.Series] = {}

    for expr in exprs:
        native_series_list = expr._call(df)
        is_scalar = getattr(expr, "_returns_scalar", False)
        _, aliases = evaluate_output_names_and_aliases(expr, df, [])
        if len(aliases) != len(native_series_list):  # pragma: no cover
            msg = f"Internal error: got aliases {aliases}, but only got {len(native_series_list)} results"
            raise AssertionError(msg)
        for series, alias in zip(native_series_list, aliases):
            if is_scalar:
                collected[alias] = series[0]
            else:
                collected[alias] = series

    for out_name, named_expr in named_exprs.items():
        evaluated = named_expr._call(df)
        if len(evaluated) != 1:  # pragma: no cover
            msg = "Named expressions must return a single column"
            raise AssertionError(msg)
        only_series = evaluated[0]
        if getattr(named_expr, "_returns_scalar", False):
            collected[out_name] = only_series[0]
        else:
            collected[out_name] = only_series

    return collected


def add_row_index(
Expand Down
31 changes: 27 additions & 4 deletions narwhals/_duckdb/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import duckdb
from duckdb import ColumnExpression

from narwhals._duckdb.utils import ExprKind
from narwhals._duckdb.utils import native_to_narwhals_dtype
from narwhals._duckdb.utils import parse_exprs_and_named_exprs
from narwhals.dependencies import get_duckdb
Expand Down Expand Up @@ -104,19 +105,29 @@ def select(
*exprs: DuckDBExpr,
**named_exprs: DuckDBExpr,
) -> Self:
new_columns_map = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
if not new_columns_map:
# TODO(marco): return empty relation with 0 columns?
return self._from_native_frame(self._native_frame.limit(0))

if all(getattr(x, "_returns_scalar", False) for x in exprs) and all(
getattr(x, "_returns_scalar", False) for x in named_exprs.values()
if not any(expr._expr_kind is ExprKind.TRANSFORM for expr in exprs) and not any(
expr._expr_kind is ExprKind.TRANSFORM for expr in named_exprs.values()
):
return self._from_native_frame(
self._native_frame.aggregate(
[val.alias(col) for col, val in new_columns_map.items()]
)
)
if any(expr._expr_kind is ExprKind.AGGREGATION for expr in exprs) or any(
expr._expr_kind is ExprKind.AGGREGATION for expr in named_exprs.values()
):
msg = (
"Mixing expressions which aggregate and expressions which don't\n"
"is not yet supported by the DuckDB backend. Once they introduce\n"
"duckdb.WindowExpression to their Python API, we'll be able to\n"
"support this."
)
raise NotImplementedError(msg)

return self._from_native_frame(
self._native_frame.select(
Expand All @@ -139,7 +150,19 @@ def with_columns(
*exprs: DuckDBExpr,
**named_exprs: DuckDBExpr,
) -> Self:
new_columns_map = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)

if any(expr._expr_kind is ExprKind.AGGREGATION for expr in exprs) or any(
expr._expr_kind is ExprKind.AGGREGATION for expr in named_exprs.values()
):
msg = (
"Mixing expressions which aggregate and expressions which don't\n"
"is not yet supported by the DuckDB backend. Once they introduce\n"
"duckdb.WindowExpression to their Python API, we'll be able to\n"
"support this."
)
raise NotImplementedError(msg)

result = []
for col in self._native_frame.columns:
if col in new_columns_map:
Expand Down
Loading

0 comments on commit 35c2a2e

Please sign in to comment.