Skip to content

Commit 20539cb

Browse files
authored
chore: pyarrow do not combine chunks (#1662)
* fix: pyarrow do not combine chunks * duplicated no threads?
1 parent fcc5411 commit 20539cb

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

Diff for: narwhals/_arrow/dataframe.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,6 @@ def write_csv(self: Self, file: Any) -> Any:
597597
return pa_csv.write_csv(pa_table, file)
598598

599599
def is_duplicated(self: Self) -> ArrowSeries:
600-
import numpy as np # ignore-banned-import
601600
import pyarrow as pa
602601
import pyarrow.compute as pc
603602

@@ -608,14 +607,18 @@ def is_duplicated(self: Self) -> ArrowSeries:
608607
columns = self.columns
609608
col_token = generate_temporary_column_name(n_bytes=8, columns=columns)
610609
row_count = (
611-
df.append_column(col_token, pa.array(np.arange(len(self))))
610+
df.append_column(col_token, pa.repeat(pa.scalar(1), len(self)))
612611
.group_by(columns)
613-
.aggregate([(col_token, "count")])
612+
.aggregate([(col_token, "sum")])
614613
)
615614
is_duplicated = pc.greater(
616615
df.join(
617-
row_count, keys=columns, right_keys=columns, join_type="inner"
618-
).column(f"{col_token}_count"),
616+
row_count,
617+
keys=columns,
618+
right_keys=columns,
619+
join_type="inner",
620+
use_threads=False,
621+
).column(f"{col_token}_sum"),
619622
1,
620623
)
621624
return ArrowSeries(

Diff for: narwhals/_arrow/namespace.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -445,18 +445,18 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
445445
except TypeError:
446446
# `self._otherwise_value` is a scalar and can't be converted to an expression
447447
value_series = condition.__class__._from_iterable(
448-
[self._then_value] * len(condition),
448+
pa.repeat(pa.scalar(self._then_value), len(condition)),
449449
name="literal",
450450
backend_version=self._backend_version,
451451
version=self._version,
452452
)
453453

454454
value_series_native = value_series._native_series
455-
condition_native = condition._native_series.combine_chunks()
455+
condition_native = condition._native_series
456456

457457
if self._otherwise_value is None:
458-
otherwise_native = pa.array(
459-
[None] * len(condition_native), type=value_series_native.type
458+
otherwise_native = pa.repeat(
459+
pa.scalar(None, type=value_series_native.type), len(condition_native)
460460
)
461461
return [
462462
value_series._from_native_series(

Diff for: narwhals/_arrow/utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ def vertical_concat(dfs: list[pa.Table]) -> pa.Table:
261261

262262
import pyarrow as pa
263263

264-
return pa.concat_tables(dfs).combine_chunks()
264+
return pa.concat_tables(dfs)
265265

266266

267267
def diagonal_concat(dfs: list[pa.Table], backend_version: tuple[int, ...]) -> pa.Table:
@@ -276,7 +276,7 @@ def diagonal_concat(dfs: list[pa.Table], backend_version: tuple[int, ...]) -> pa
276276
if backend_version < (14, 0, 0)
277277
else {"promote_options": "default"} # type: ignore[dict-item]
278278
)
279-
return pa.concat_tables(dfs, **kwargs).combine_chunks()
279+
return pa.concat_tables(dfs, **kwargs)
280280

281281

282282
def floordiv_compat(left: Any, right: Any) -> Any:

0 commit comments

Comments
 (0)