Skip to content

Commit 9d153a8

Browse files
authored
fix: PySpark backend was raising during `collect` when the frame contained no rows and a void dtype column (#2032)
1 parent d2d55da commit 9d153a8

File tree

4 files changed

+39
-9
lines changed

4 files changed

+39
-9
lines changed

.github/workflows/downstream_tests.yml

+11-2
Original file line numberDiff line numberDiff line change
@@ -461,18 +461,27 @@ jobs:
461461
- name: install-validoopsie-dev
462462
run: |
463463
cd validoopsie
464+
uv venv
465+
. .venv/bin/activate
464466
uv sync --dev
465467
uv pip install pytest-env
466468
which python
467469
- name: show-deps
468-
run: uv pip freeze
470+
run: |
471+
cd validoopsie
472+
. .venv/bin/activate
473+
uv pip freeze
469474
- name: install-narwhals-dev
470475
run: |
471476
cd validoopsie
477+
. .venv/bin/activate
472478
uv pip uninstall narwhals
473479
uv pip install -e ./..
474480
- name: Run tests
475481
run: |
476482
cd validoopsie
477-
uv run pytest
483+
. .venv/bin/activate
484+
touch tests/__init__.py
485+
touch tests/utils/__init__.py
486+
pytest tests
478487
timeout-minutes: 15

narwhals/_duckdb/expr.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -494,9 +494,7 @@ def is_finite(self: Self) -> Self:
494494

495495
def is_in(self: Self, other: Sequence[Any]) -> Self:
496496
return self._from_call(
497-
lambda _input: lit(False) # noqa: FBT003
498-
if not other
499-
else _input.isin(*[lit(x) for x in other]),
497+
lambda _input: FunctionExpression("contains", lit(other), _input),
500498
"is_in",
501499
expr_kind=self._expr_kind,
502500
)

narwhals/_spark_like/dataframe.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import warnings
34
from typing import TYPE_CHECKING
45
from typing import Any
56
from typing import Literal
@@ -13,6 +14,7 @@
1314
from narwhals.typing import CompliantLazyFrame
1415
from narwhals.utils import Implementation
1516
from narwhals.utils import check_column_exists
17+
from narwhals.utils import find_stacklevel
1618
from narwhals.utils import import_dtypes_module
1719
from narwhals.utils import parse_columns_to_drop
1820
from narwhals.utils import parse_version
@@ -124,13 +126,27 @@ def _collect_to_arrow(self) -> pa.Table:
124126
from narwhals._arrow.utils import narwhals_to_native_dtype
125127

126128
data: dict[str, list[Any]] = {}
127-
schema = []
129+
schema: list[tuple[str, pa.DataType]] = []
128130
current_schema = self.collect_schema()
129131
for key, value in current_schema.items():
130132
data[key] = []
131-
schema.append(
132-
(key, narwhals_to_native_dtype(value, self._version))
133-
)
133+
try:
134+
native_dtype = narwhals_to_native_dtype(value, self._version)
135+
except Exception as exc: # noqa: BLE001
136+
native_spark_dtype = self._native_frame.schema[key].dataType
137+
# If we can't convert the type, just set it to `pa.null`, and warn.
138+
# Avoid the warning if we're starting from PySpark's void type.
139+
# We can avoid the check when we introduce `nw.Null` dtype.
140+
if not isinstance(
141+
native_spark_dtype, self._native_dtypes.NullType
142+
):
143+
warnings.warn(
144+
f"Could not convert dtype {native_spark_dtype} to PyArrow dtype, {exc!r}",
145+
stacklevel=find_stacklevel(),
146+
)
147+
schema.append((key, pa.null()))
148+
else:
149+
schema.append((key, native_dtype))
134150
native_pyarrow_frame = pa.Table.from_pydict(
135151
data, schema=pa.schema(schema)
136152
)

tests/frame/collect_test.py

+7
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,10 @@ def test_collect_with_kwargs(constructor: Constructor) -> None:
105105

106106
expected = {"a": [3], "b": [7]}
107107
assert_equal_data(result, expected)
108+
109+
110+
def test_collect_empty_pyspark(constructor: Constructor) -> None:
111+
df = nw_v1.from_native(constructor({"a": [1, 2, 3]}))
112+
df = df.filter(nw.col("a").is_null()).with_columns(b=nw.lit(None)).lazy()
113+
result = df.collect()
114+
assert_equal_data(result, {"a": [], "b": []})

0 commit comments

Comments (0)