Skip to content

Commit 33c9527

Browse files
authored
feat: expose generate_temporary_column_name publicly (#1264)
1 parent c0a26be commit 33c9527

File tree

10 files changed

+108
-24
lines changed

10 files changed

+108
-24
lines changed

docs/api-reference/narwhals.md

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Here are the top-level functions available in Narwhals.
1515
- from_dict
1616
- from_native
1717
- from_arrow
18+
- generate_temporary_column_name
1819
- get_level
1920
- get_native_namespace
2021
- is_ordered_categorical

narwhals/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
from narwhals.translate import narwhalify
6060
from narwhals.translate import to_native
6161
from narwhals.translate import to_py_scalar
62+
from narwhals.utils import generate_temporary_column_name
6263
from narwhals.utils import is_ordered_categorical
6364
from narwhals.utils import maybe_align_index
6465
from narwhals.utils import maybe_convert_dtypes
@@ -74,6 +75,7 @@
7475
"concat",
7576
"from_dict",
7677
"from_arrow",
78+
"generate_temporary_column_name",
7779
"get_level",
7880
"new_series",
7981
"to_native",

narwhals/_arrow/dataframe.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from narwhals.dependencies import is_numpy_array
1818
from narwhals.utils import Implementation
1919
from narwhals.utils import flatten
20-
from narwhals.utils import generate_unique_token
20+
from narwhals.utils import generate_temporary_column_name
2121
from narwhals.utils import is_sequence_but_not_str
2222
from narwhals.utils import parse_columns_to_drop
2323

@@ -358,7 +358,7 @@ def join(
358358

359359
if how == "cross":
360360
plx = self.__narwhals_namespace__()
361-
key_token = generate_unique_token(
361+
key_token = generate_temporary_column_name(
362362
n_bytes=8, columns=[*self.columns, *other.columns]
363363
)
364364

@@ -579,7 +579,7 @@ def is_duplicated(self: Self) -> ArrowSeries:
579579
df = self._native_frame
580580

581581
columns = self.columns
582-
col_token = generate_unique_token(n_bytes=8, columns=columns)
582+
col_token = generate_temporary_column_name(n_bytes=8, columns=columns)
583583
row_count = (
584584
df.append_column(col_token, pa.array(np.arange(len(self))))
585585
.group_by(columns)
@@ -638,7 +638,7 @@ def unique(
638638
agg_func_map = {"any": "min", "first": "min", "last": "max"}
639639

640640
agg_func = agg_func_map[keep]
641-
col_token = generate_unique_token(n_bytes=8, columns=self.columns)
641+
col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
642642
keep_idx = (
643643
df.append_column(col_token, pa.array(np.arange(len(self))))
644644
.group_by(subset)

narwhals/_arrow/series.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from narwhals._arrow.utils import native_to_narwhals_dtype
1515
from narwhals._arrow.utils import validate_column_comparand
1616
from narwhals.utils import Implementation
17-
from narwhals.utils import generate_unique_token
17+
from narwhals.utils import generate_temporary_column_name
1818

1919
if TYPE_CHECKING:
2020
from types import ModuleType
@@ -604,7 +604,7 @@ def is_first_distinct(self: Self) -> Self:
604604
import pyarrow.compute as pc # ignore-banned-import()
605605

606606
row_number = pa.array(np.arange(len(self)))
607-
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
607+
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
608608
first_distinct_index = (
609609
pa.Table.from_arrays([self._native_series], names=[self.name])
610610
.append_column(col_token, row_number)
@@ -621,7 +621,7 @@ def is_last_distinct(self: Self) -> Self:
621621
import pyarrow.compute as pc # ignore-banned-import()
622622

623623
row_number = pa.array(np.arange(len(self)))
624-
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
624+
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
625625
last_distinct_index = (
626626
pa.Table.from_arrays([self._native_series], names=[self.name])
627627
.append_column(col_token, row_number)
@@ -715,7 +715,7 @@ def to_arrow(self: Self) -> pa.Array:
715715

716716
def mode(self: Self) -> ArrowSeries:
717717
plx = self.__narwhals_namespace__()
718-
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
718+
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
719719
return self.value_counts(name=col_token, normalize=False).filter(
720720
plx.col(col_token) == plx.col(col_token).max()
721721
)[self.name]

narwhals/_dask/dataframe.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from narwhals._pandas_like.utils import native_to_narwhals_dtype
1212
from narwhals.utils import Implementation
1313
from narwhals.utils import flatten
14-
from narwhals.utils import generate_unique_token
14+
from narwhals.utils import generate_temporary_column_name
1515
from narwhals.utils import parse_columns_to_drop
1616
from narwhals.utils import parse_version
1717

@@ -194,7 +194,7 @@ def unique(
194194
native_frame = self._native_frame
195195
if keep == "none":
196196
subset = subset or self.columns
197-
token = generate_unique_token(n_bytes=8, columns=subset)
197+
token = generate_temporary_column_name(n_bytes=8, columns=subset)
198198
ser = native_frame.groupby(subset).size().rename(token)
199199
ser = ser.loc[ser == 1]
200200
unique = ser.reset_index().drop(columns=token)
@@ -236,7 +236,7 @@ def join(
236236
if isinstance(right_on, str):
237237
right_on = [right_on]
238238
if how == "cross":
239-
key_token = generate_unique_token(
239+
key_token = generate_temporary_column_name(
240240
n_bytes=8, columns=[*self.columns, *other.columns]
241241
)
242242

@@ -253,7 +253,7 @@ def join(
253253
)
254254

255255
if how == "anti":
256-
indicator_token = generate_unique_token(
256+
indicator_token = generate_temporary_column_name(
257257
n_bytes=8, columns=[*self.columns, *other.columns]
258258
)
259259

@@ -363,7 +363,7 @@ def tail(self: Self, n: int) -> Self:
363363
raise NotImplementedError(msg)
364364

365365
def gather_every(self: Self, n: int, offset: int) -> Self:
366-
row_index_token = generate_unique_token(n_bytes=8, columns=self.columns)
366+
row_index_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
367367
pln = self.__narwhals_namespace__()
368368
return (
369369
self.with_row_index(name=row_index_token)

narwhals/_dask/expr.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from narwhals._pandas_like.utils import calculate_timestamp_datetime
1515
from narwhals._pandas_like.utils import native_to_narwhals_dtype
1616
from narwhals.utils import Implementation
17-
from narwhals.utils import generate_unique_token
17+
from narwhals.utils import generate_temporary_column_name
1818

1919
if TYPE_CHECKING:
2020
import dask_expr
@@ -580,7 +580,7 @@ def func(_input: dask_expr.Series, _quantile: float) -> dask_expr.Series:
580580
def is_first_distinct(self: Self) -> Self:
581581
def func(_input: dask_expr.Series) -> dask_expr.Series:
582582
_name = _input.name
583-
col_token = generate_unique_token(n_bytes=8, columns=[_name])
583+
col_token = generate_temporary_column_name(n_bytes=8, columns=[_name])
584584
_input = add_row_index(_input.to_frame(), col_token)
585585
first_distinct_index = _input.groupby(_name).agg({col_token: "min"})[
586586
col_token
@@ -597,7 +597,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series:
597597
def is_last_distinct(self: Self) -> Self:
598598
def func(_input: dask_expr.Series) -> dask_expr.Series:
599599
_name = _input.name
600-
col_token = generate_unique_token(n_bytes=8, columns=[_name])
600+
col_token = generate_temporary_column_name(n_bytes=8, columns=[_name])
601601
_input = add_row_index(_input.to_frame(), col_token)
602602
last_distinct_index = _input.groupby(_name).agg({col_token: "max"})[col_token]
603603

narwhals/_pandas_like/dataframe.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from narwhals.dependencies import is_numpy_array
2020
from narwhals.utils import Implementation
2121
from narwhals.utils import flatten
22-
from narwhals.utils import generate_unique_token
22+
from narwhals.utils import generate_temporary_column_name
2323
from narwhals.utils import is_sequence_but_not_str
2424
from narwhals.utils import parse_columns_to_drop
2525

@@ -506,7 +506,7 @@ def join(
506506
self._implementation is Implementation.PANDAS
507507
and self._backend_version < (1, 4)
508508
):
509-
key_token = generate_unique_token(
509+
key_token = generate_temporary_column_name(
510510
n_bytes=8, columns=[*self.columns, *other.columns]
511511
)
512512

@@ -541,7 +541,7 @@ def join(
541541
)
542542
)
543543
else:
544-
indicator_token = generate_unique_token(
544+
indicator_token = generate_temporary_column_name(
545545
n_bytes=8, columns=[*self.columns, *other.columns]
546546
)
547547

narwhals/stable/v1/__init__.py

+30
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@
5555
from narwhals.typing import IntoDataFrameT
5656
from narwhals.typing import IntoFrameT
5757
from narwhals.typing import IntoSeriesT
58+
from narwhals.utils import (
59+
generate_temporary_column_name as nw_generate_temporary_column_name,
60+
)
5861
from narwhals.utils import is_ordered_categorical as nw_is_ordered_categorical
5962
from narwhals.utils import maybe_align_index as nw_maybe_align_index
6063
from narwhals.utils import maybe_convert_dtypes as nw_maybe_convert_dtypes
@@ -2149,6 +2152,32 @@ def maybe_reset_index(obj: T) -> T:
21492152
return nw_maybe_reset_index(obj)
21502153

21512154

2155+
def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
2156+
"""Generates a unique token of specified `n_bytes` that is not present in the given
2157+
list of columns.
2158+
2159+
It relies on [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
2160+
function to return a random hexadecimal string built from `n_bytes` random bytes.
2161+
2162+
Arguments:
2163+
n_bytes: The number of bytes to generate for the token.
2164+
columns: The list of columns to check for uniqueness.
2165+
2166+
Returns:
2167+
A unique token that is not present in the given list of columns.
2168+
2169+
Raises:
2170+
AssertionError: If a unique token cannot be generated after 100 attempts.
2171+
2172+
Examples:
2173+
>>> import narwhals.stable.v1 as nw
2174+
>>> columns = ["abc", "xyz"]
2175+
>>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
2176+
True
2177+
"""
2178+
return nw_generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
2179+
2180+
21522181
def get_native_namespace(obj: Any) -> Any:
21532182
"""
21542183
Get native namespace from object.
@@ -2447,6 +2476,7 @@ def from_dict(
24472476
"maybe_get_index",
24482477
"maybe_reset_index",
24492478
"maybe_set_index",
2479+
"generate_temporary_column_name",
24502480
"get_native_namespace",
24512481
"get_level",
24522482
"all",

narwhals/utils.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Sequence
1111
from typing import TypeVar
1212
from typing import cast
13+
from warnings import warn
1314

1415
from narwhals._exceptions import ColumnNotFoundError
1516
from narwhals.dependencies import get_cudf
@@ -481,17 +482,37 @@ def is_ordered_categorical(series: Series) -> bool:
481482

482483

483484
def generate_unique_token(n_bytes: int, columns: list[str]) -> str: # pragma: no cover
484-
"""Generates a unique token of specified n_bytes that is not present in the given list of columns.
485+
warn(
486+
"Use `generate_temporary_column_name` instead. `generate_unique_token` is "
487+
"deprecated and it will be removed in future versions",
488+
DeprecationWarning,
489+
stacklevel=2,
490+
)
491+
return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
492+
493+
494+
def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
495+
"""Generates a unique token of specified `n_bytes` that is not present in the given
496+
list of columns.
497+
498+
It relies on [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
499+
function to return a random hexadecimal string built from `n_bytes` random bytes.
485500
486501
Arguments:
487-
n_bytes : The number of bytes to generate for the token.
488-
columns : The list of columns to check for uniqueness.
502+
n_bytes: The number of bytes to generate for the token.
503+
columns: The list of columns to check for uniqueness.
489504
490505
Returns:
491506
A unique token that is not present in the given list of columns.
492507
493508
Raises:
494509
AssertionError: If a unique token cannot be generated after 100 attempts.
510+
511+
Examples:
512+
>>> import narwhals as nw
513+
>>> columns = ["abc", "xyz"]
514+
>>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
515+
True
495516
"""
496517
counter = 0
497518
while True:
@@ -502,8 +523,8 @@ def generate_unique_token(n_bytes: int, columns: list[str]) -> str: # pragma: n
502523
counter += 1
503524
if counter > 100:
504525
msg = (
505-
"Internal Error: Narwhals was not able to generate a column name to perform given "
506-
"join operation"
526+
"Internal Error: Narwhals was not able to generate a column name with "
527+
f"{n_bytes=} and not in {columns}"
507528
)
508529
raise AssertionError(msg)
509530

tests/utils_test.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
from __future__ import annotations
22

3+
import string
4+
5+
import hypothesis.strategies as st
36
import pandas as pd
47
import polars as pl
58
import pytest
9+
from hypothesis import given
610
from pandas.testing import assert_frame_equal
711
from pandas.testing import assert_index_equal
812
from pandas.testing import assert_series_equal
@@ -147,3 +151,29 @@ def test_maybe_convert_dtypes_polars() -> None:
147151
def test_get_trivial_version_with_uninstalled_module() -> None:
148152
result = get_module_version_as_tuple("non_existent_module")
149153
assert result == (0, 0, 0)
154+
155+
156+
@given(n_bytes=st.integers(1, 100)) # type: ignore[misc]
157+
def test_generate_temporary_column_name(n_bytes: int) -> None:
158+
columns = ["abc", "XYZ"]
159+
160+
temp_col_name = nw.generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
161+
assert temp_col_name not in columns
162+
163+
164+
def test_generate_temporary_column_name_raise() -> None:
165+
from itertools import product
166+
167+
columns = [
168+
"".join(t)
169+
for t in product(
170+
string.ascii_lowercase + string.digits,
171+
string.ascii_lowercase + string.digits,
172+
)
173+
]
174+
175+
with pytest.raises(
176+
AssertionError,
177+
match="Internal Error: Narwhals was not able to generate a column name with ",
178+
):
179+
nw.generate_temporary_column_name(n_bytes=1, columns=columns)

0 commit comments

Comments
 (0)