merge main

narwhals-dev · Feb 1, 2025 · 35c2a2e · 35c2a2e
2 parents 044db09 + 562b213
commit 35c2a2e
Show file tree

Hide file tree

Showing 48 changed files with 1,323 additions and 734 deletions.
diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml
@@ -29,6 +29,6 @@ jobs:
       - name: griffe
         # hopefully temporary, until https://github.com/mkdocstrings/mkdocstrings/issues/716 is resolved
         run: pip install git+https://github.com/MarcoGorelli/griffe.git@no-overloads
-      - run: pip install -e .[docs,pyspark,dask,duckdb]
+      - run: pip install -e .[docs,dask,duckdb]
 
       - run: mkdocs gh-deploy --force
diff --git a/README.md b/README.md
@@ -113,6 +113,7 @@ Join the party!
 - [marimo](https://github.com/marimo-team/marimo)
 - [panel-graphic-walker](https://github.com/panel-extensions/panel-graphic-walker)
 - [plotly](https://plotly.com)
+- [pointblank](https://github.com/posit-dev/pointblank)
 - [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects)
 - [py-shiny](https://github.com/posit-dev/py-shiny)
 - [rio](https://github.com/rio-labs/rio)

diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md
@@ -15,6 +15,7 @@ set operations are supported:
         - boolean
         - by_dtype
         - categorical
+        - matches
         - numeric
         - string
       show_root_heading: false

diff --git a/docs/ecosystem.md b/docs/ecosystem.md
@@ -10,6 +10,7 @@ for their dataframe interoperability needs:
 * [marimo](https://github.com/marimo-team/marimo)
 * [panel-graphic-walker](https://github.com/panel-extensions/panel-graphic-walker)
 * [plotly](https://github.com/plotly/plotly.py)
+* [pointblank](https://github.com/posit-dev/pointblank)
 * [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects)
 * [py-shiny](https://github.com/posit-dev/py-shiny)
 * [rio](https://github.com/rio-labs/rio)

diff --git a/docs/installation.md b/docs/installation.md
@@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute:
 ```python
 >>> import narwhals
 >>> narwhals.__version__
-'1.24.0'
+'1.24.1'
 ```
 
 If you see the version number, then the installation was successful!

diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -79,7 +79,7 @@
 from narwhals.utils import maybe_reset_index
 from narwhals.utils import maybe_set_index
 
-__version__ = "1.24.0"
+__version__ = "1.24.1"
 
 __all__ = [
     "Array",

diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -294,7 +294,7 @@ def simple_select(self, *column_names: str) -> Self:
         return self._from_native_frame(self._native_frame.select(list(column_names)))
 
     def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self:
-        new_series: list[ArrowSeries] = evaluate_into_exprs(self)(*exprs, **named_exprs)
+        new_series: list[ArrowSeries] = evaluate_into_exprs(self, *exprs, **named_exprs)
         if not new_series:
             # return empty dataframe, like Polars does
             return self._from_native_frame(self._native_frame.__class__.from_arrays([]))
@@ -306,7 +306,7 @@ def with_columns(
         self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr
     ) -> Self:
         native_frame = self._native_frame
-        new_columns: list[ArrowSeries] = evaluate_into_exprs(self)(*exprs, **named_exprs)
+        new_columns: list[ArrowSeries] = evaluate_into_exprs(self, *exprs, **named_exprs)
 
         length = len(self)
         columns = self.columns

diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Sequence
@@ -43,6 +44,24 @@ def evalute_output_names(df: ArrowDataFrame) -> Sequence[str]:
             kwargs={"dtypes": dtypes},
         )
 
+    def matches(self: Self, pattern: str) -> ArrowSelector:
+        def func(df: ArrowDataFrame) -> list[ArrowSeries]:
+            return [df[col] for col in df.columns if re.search(pattern, col)]
+
+        def evalute_output_names(df: ArrowDataFrame) -> Sequence[str]:
+            return [col for col in df.columns if re.search(pattern, col)]
+
+        return ArrowSelector(
+            func,
+            depth=0,
+            function_name="selector",
+            evaluate_output_names=evalute_output_names,
+            alias_output_names=None,
+            backend_version=self._backend_version,
+            version=self._version,
+            kwargs={"pattern": pattern},
+        )
+
     def numeric(self: Self) -> ArrowSelector:
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype(

diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -75,7 +75,7 @@ def _from_native_frame(self: Self, df: Any) -> Self:
 
     def with_columns(self: Self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self:
         df = self._native_frame
-        new_series = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
+        new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
         df = df.assign(**new_series)
         return self._from_native_frame(df)
 
@@ -115,7 +115,7 @@ def simple_select(self: Self, *column_names: str) -> Self:
         )
 
     def select(self: Self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self:
-        new_series = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
+        new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
 
         if not new_series:
             # return empty dataframe, like Polars does

diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import re
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Iterable
 from typing import Sequence
 
 from narwhals._dask.expr import DaskExpr
@@ -27,7 +29,7 @@ def __init__(
         self._backend_version = backend_version
         self._version = version
 
-    def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector:
+    def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> DaskSelector:
         def func(df: DaskLazyFrame) -> list[dx.Series]:
             return [
                 df._native_frame[col] for col in df.columns if df.schema[col] in dtypes
@@ -48,10 +50,31 @@ def evalute_output_names(df: DaskLazyFrame) -> Sequence[str]:
             kwargs={},
         )
 
+    def matches(self: Self, pattern: str) -> DaskSelector:
+        def func(df: DaskLazyFrame) -> list[dx.Series]:
+            return [
+                df._native_frame[col] for col in df.columns if re.search(pattern, col)
+            ]
+
+        def evalute_output_names(df: DaskLazyFrame) -> Sequence[str]:
+            return [col for col in df.columns if re.search(pattern, col)]
+
+        return DaskSelector(
+            func,
+            depth=0,
+            function_name="selector",
+            evaluate_output_names=evalute_output_names,
+            alias_output_names=None,
+            backend_version=self._backend_version,
+            returns_scalar=False,
+            version=self._version,
+            kwargs={},
+        )
+
     def numeric(self: Self) -> DaskSelector:
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype(
-            [
+            {
                 dtypes.Int128,
                 dtypes.Int64,
                 dtypes.Int32,
@@ -64,20 +87,20 @@ def numeric(self: Self) -> DaskSelector:
                 dtypes.UInt8,
                 dtypes.Float64,
                 dtypes.Float32,
-            ],
+            },
         )
 
     def categorical(self: Self) -> DaskSelector:
         dtypes = import_dtypes_module(self._version)
-        return self.by_dtype([dtypes.Categorical])
+        return self.by_dtype({dtypes.Categorical})
 
     def string(self: Self) -> DaskSelector:
         dtypes = import_dtypes_module(self._version)
-        return self.by_dtype([dtypes.String])
+        return self.by_dtype({dtypes.String})
 
     def boolean(self: Self) -> DaskSelector:
         dtypes = import_dtypes_module(self._version)
-        return self.by_dtype([dtypes.Boolean])
+        return self.by_dtype({dtypes.Boolean})
 
     def all(self: Self) -> DaskSelector:
         def func(df: DaskLazyFrame) -> list[dx.Series]:

diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py
@@ -2,7 +2,6 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
-from typing import Callable
 
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._pandas_like.utils import select_columns_by_name
@@ -45,32 +44,29 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any:
     return obj
 
 
-def parse_exprs_and_named_exprs(df: DaskLazyFrame) -> Callable[..., dict[str, dx.Series]]:
-    def func(*exprs: DaskExpr, **named_exprs: DaskExpr) -> dict[str, dx.Series]:
-        native_results: dict[str, dx.Series] = {}
-        for expr in exprs:
-            native_series_list = expr._call(df)
-            return_scalar = getattr(expr, "_returns_scalar", False)
-            _, aliases = evaluate_output_names_and_aliases(expr, df, [])
-            if len(aliases) != len(native_series_list):  # pragma: no cover
-                msg = f"Internal error: got aliases {aliases}, but only got {len(native_series_list)} results"
-                raise AssertionError(msg)
-            for native_series, alias in zip(native_series_list, aliases):
-                native_results[alias] = (
-                    native_series[0] if return_scalar else native_series
-                )
-        for name, value in named_exprs.items():
-            native_series_list = value._call(df)
-            if len(native_series_list) != 1:  # pragma: no cover
-                msg = "Named expressions must return a single column"
-                raise AssertionError(msg)
-            return_scalar = getattr(value, "_returns_scalar", False)
-            native_results[name] = (
-                native_series_list[0][0] if return_scalar else native_series_list[0]
-            )
-        return native_results
-
-    return func
+def parse_exprs_and_named_exprs(
+    df: DaskLazyFrame, /, *exprs: DaskExpr, **named_exprs: DaskExpr
+) -> dict[str, dx.Series]:
+    native_results: dict[str, dx.Series] = {}
+    for expr in exprs:
+        native_series_list = expr._call(df)
+        return_scalar = getattr(expr, "_returns_scalar", False)
+        _, aliases = evaluate_output_names_and_aliases(expr, df, [])
+        if len(aliases) != len(native_series_list):  # pragma: no cover
+            msg = f"Internal error: got aliases {aliases}, but only got {len(native_series_list)} results"
+            raise AssertionError(msg)
+        for native_series, alias in zip(native_series_list, aliases):
+            native_results[alias] = native_series[0] if return_scalar else native_series
+    for name, value in named_exprs.items():
+        native_series_list = value._call(df)
+        if len(native_series_list) != 1:  # pragma: no cover
+            msg = "Named expressions must return a single column"
+            raise AssertionError(msg)
+        return_scalar = getattr(value, "_returns_scalar", False)
+        native_results[name] = (
+            native_series_list[0][0] if return_scalar else native_series_list[0]
+        )
+    return native_results
 
 
 def add_row_index(

diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py
@@ -9,6 +9,7 @@
 import duckdb
 from duckdb import ColumnExpression
 
+from narwhals._duckdb.utils import ExprKind
 from narwhals._duckdb.utils import native_to_narwhals_dtype
 from narwhals._duckdb.utils import parse_exprs_and_named_exprs
 from narwhals.dependencies import get_duckdb
@@ -104,19 +105,29 @@ def select(
         *exprs: DuckDBExpr,
         **named_exprs: DuckDBExpr,
     ) -> Self:
-        new_columns_map = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
+        new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
         if not new_columns_map:
             # TODO(marco): return empty relation with 0 columns?
             return self._from_native_frame(self._native_frame.limit(0))
 
-        if all(getattr(x, "_returns_scalar", False) for x in exprs) and all(
-            getattr(x, "_returns_scalar", False) for x in named_exprs.values()
+        if not any(expr._expr_kind is ExprKind.TRANSFORM for expr in exprs) and not any(
+            expr._expr_kind is ExprKind.TRANSFORM for expr in named_exprs.values()
         ):
             return self._from_native_frame(
                 self._native_frame.aggregate(
                     [val.alias(col) for col, val in new_columns_map.items()]
                 )
             )
+        if any(expr._expr_kind is ExprKind.AGGREGATION for expr in exprs) or any(
+            expr._expr_kind is ExprKind.AGGREGATION for expr in named_exprs.values()
+        ):
+            msg = (
+                "Mixing expressions which aggregate and expressions which don't\n"
+                "is not yet supported by the DuckDB backend. Once they introduce\n"
+                "duckdb.WindowExpression to their Python API, we'll be able to\n"
+                "support this."
+            )
+            raise NotImplementedError(msg)
 
         return self._from_native_frame(
             self._native_frame.select(
@@ -139,7 +150,19 @@ def with_columns(
         *exprs: DuckDBExpr,
         **named_exprs: DuckDBExpr,
     ) -> Self:
-        new_columns_map = parse_exprs_and_named_exprs(self)(*exprs, **named_exprs)
+        new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)
+
+        if any(expr._expr_kind is ExprKind.AGGREGATION for expr in exprs) or any(
+            expr._expr_kind is ExprKind.AGGREGATION for expr in named_exprs.values()
+        ):
+            msg = (
+                "Mixing expressions which aggregate and expressions which don't\n"
+                "is not yet supported by the DuckDB backend. Once they introduce\n"
+                "duckdb.WindowExpression to their Python API, we'll be able to\n"
+                "support this."
+            )
+            raise NotImplementedError(msg)
+
         result = []
         for col in self._native_frame.columns:
             if col in new_columns_map:
diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md
@@ -15,6 +15,7 @@ set operations are supported:
             - boolean
             - by_dtype
             - categorical
+            - matches
             - numeric
             - string
           show_root_heading: false
@@ Expand Down @@