feat: expose generate_temporary_column_name publicly (#1264)

FBruzzesi · web-flow · commit 33c9527d8912 · 2024-10-28T08:33:12.000+01:00
diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md
@@ -15,6 +15,7 @@ Here are the top-level functions available in Narwhals.
         - from_dict
         - from_native
         - from_arrow
+        - generate_temporary_column_name
         - get_level
         - get_native_namespace
         - is_ordered_categorical
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -59,6 +59,7 @@
 from narwhals.translate import narwhalify
 from narwhals.translate import to_native
 from narwhals.translate import to_py_scalar
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_ordered_categorical
 from narwhals.utils import maybe_align_index
 from narwhals.utils import maybe_convert_dtypes
@@ -74,6 +75,7 @@
     "concat",
     "from_dict",
     "from_arrow",
+    "generate_temporary_column_name",
     "get_level",
     "new_series",
     "to_native",
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -17,7 +17,7 @@
 from narwhals.dependencies import is_numpy_array
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_columns_to_drop
 
@@ -358,7 +358,7 @@ def join(
 
         if how == "cross":
             plx = self.__narwhals_namespace__()
-            key_token = generate_unique_token(
+            key_token = generate_temporary_column_name(
                 n_bytes=8, columns=[*self.columns, *other.columns]
             )
 
@@ -579,7 +579,7 @@ def is_duplicated(self: Self) -> ArrowSeries:
         df = self._native_frame
 
         columns = self.columns
-        col_token = generate_unique_token(n_bytes=8, columns=columns)
+        col_token = generate_temporary_column_name(n_bytes=8, columns=columns)
         row_count = (
             df.append_column(col_token, pa.array(np.arange(len(self))))
             .group_by(columns)
@@ -638,7 +638,7 @@ def unique(
             agg_func_map = {"any": "min", "first": "min", "last": "max"}
 
             agg_func = agg_func_map[keep]
-            col_token = generate_unique_token(n_bytes=8, columns=self.columns)
+            col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
             keep_idx = (
                 df.append_column(col_token, pa.array(np.arange(len(self))))
                 .group_by(subset)
diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -14,7 +14,7 @@
 from narwhals._arrow.utils import native_to_narwhals_dtype
 from narwhals._arrow.utils import validate_column_comparand
 from narwhals.utils import Implementation
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 
 if TYPE_CHECKING:
     from types import ModuleType
@@ -604,7 +604,7 @@ def is_first_distinct(self: Self) -> Self:
         import pyarrow.compute as pc  # ignore-banned-import()
 
         row_number = pa.array(np.arange(len(self)))
-        col_token = generate_unique_token(n_bytes=8, columns=[self.name])
+        col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
         first_distinct_index = (
             pa.Table.from_arrays([self._native_series], names=[self.name])
             .append_column(col_token, row_number)
@@ -621,7 +621,7 @@ def is_last_distinct(self: Self) -> Self:
         import pyarrow.compute as pc  # ignore-banned-import()
 
         row_number = pa.array(np.arange(len(self)))
-        col_token = generate_unique_token(n_bytes=8, columns=[self.name])
+        col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
         last_distinct_index = (
             pa.Table.from_arrays([self._native_series], names=[self.name])
             .append_column(col_token, row_number)
@@ -715,7 +715,7 @@ def to_arrow(self: Self) -> pa.Array:
 
     def mode(self: Self) -> ArrowSeries:
         plx = self.__narwhals_namespace__()
-        col_token = generate_unique_token(n_bytes=8, columns=[self.name])
+        col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
         return self.value_counts(name=col_token, normalize=False).filter(
             plx.col(col_token) == plx.col(col_token).max()
         )[self.name]
diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -11,7 +11,7 @@
 from narwhals._pandas_like.utils import native_to_narwhals_dtype
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import parse_columns_to_drop
 from narwhals.utils import parse_version
 
@@ -194,7 +194,7 @@ def unique(
         native_frame = self._native_frame
         if keep == "none":
             subset = subset or self.columns
-            token = generate_unique_token(n_bytes=8, columns=subset)
+            token = generate_temporary_column_name(n_bytes=8, columns=subset)
             ser = native_frame.groupby(subset).size().rename(token)
             ser = ser.loc[ser == 1]
             unique = ser.reset_index().drop(columns=token)
@@ -236,7 +236,7 @@ def join(
         if isinstance(right_on, str):
             right_on = [right_on]
         if how == "cross":
-            key_token = generate_unique_token(
+            key_token = generate_temporary_column_name(
                 n_bytes=8, columns=[*self.columns, *other.columns]
             )
 
@@ -253,7 +253,7 @@ def join(
             )
 
         if how == "anti":
-            indicator_token = generate_unique_token(
+            indicator_token = generate_temporary_column_name(
                 n_bytes=8, columns=[*self.columns, *other.columns]
             )
 
@@ -363,7 +363,7 @@ def tail(self: Self, n: int) -> Self:
             raise NotImplementedError(msg)
 
     def gather_every(self: Self, n: int, offset: int) -> Self:
-        row_index_token = generate_unique_token(n_bytes=8, columns=self.columns)
+        row_index_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
         pln = self.__narwhals_namespace__()
         return (
             self.with_row_index(name=row_index_token)
diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -14,7 +14,7 @@
 from narwhals._pandas_like.utils import calculate_timestamp_datetime
 from narwhals._pandas_like.utils import native_to_narwhals_dtype
 from narwhals.utils import Implementation
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 
 if TYPE_CHECKING:
     import dask_expr
@@ -580,7 +580,7 @@ def func(_input: dask_expr.Series, _quantile: float) -> dask_expr.Series:
     def is_first_distinct(self: Self) -> Self:
         def func(_input: dask_expr.Series) -> dask_expr.Series:
             _name = _input.name
-            col_token = generate_unique_token(n_bytes=8, columns=[_name])
+            col_token = generate_temporary_column_name(n_bytes=8, columns=[_name])
             _input = add_row_index(_input.to_frame(), col_token)
             first_distinct_index = _input.groupby(_name).agg({col_token: "min"})[
                 col_token
@@ -597,7 +597,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series:
     def is_last_distinct(self: Self) -> Self:
         def func(_input: dask_expr.Series) -> dask_expr.Series:
             _name = _input.name
-            col_token = generate_unique_token(n_bytes=8, columns=[_name])
+            col_token = generate_temporary_column_name(n_bytes=8, columns=[_name])
             _input = add_row_index(_input.to_frame(), col_token)
             last_distinct_index = _input.groupby(_name).agg({col_token: "max"})[col_token]
 
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
@@ -19,7 +19,7 @@
 from narwhals.dependencies import is_numpy_array
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_columns_to_drop
 
@@ -506,7 +506,7 @@ def join(
                 self._implementation is Implementation.PANDAS
                 and self._backend_version < (1, 4)
             ):
-                key_token = generate_unique_token(
+                key_token = generate_temporary_column_name(
                     n_bytes=8, columns=[*self.columns, *other.columns]
                 )
 
@@ -541,7 +541,7 @@ def join(
                     )
                 )
             else:
-                indicator_token = generate_unique_token(
+                indicator_token = generate_temporary_column_name(
                     n_bytes=8, columns=[*self.columns, *other.columns]
                 )
 
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
@@ -55,6 +55,9 @@
 from narwhals.typing import IntoDataFrameT
 from narwhals.typing import IntoFrameT
 from narwhals.typing import IntoSeriesT
+from narwhals.utils import (
+    generate_temporary_column_name as nw_generate_temporary_column_name,
+)
 from narwhals.utils import is_ordered_categorical as nw_is_ordered_categorical
 from narwhals.utils import maybe_align_index as nw_maybe_align_index
 from narwhals.utils import maybe_convert_dtypes as nw_maybe_convert_dtypes
@@ -2149,6 +2152,32 @@ def maybe_reset_index(obj: T) -> T:
     return nw_maybe_reset_index(obj)
 
 
+def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
+    """Generates a unique token of specified `n_bytes` that is not present in the given
+    list of columns.
+
+    It relies on Python's [`secrets.token_hex`](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
+    function, which returns a random hexadecimal string built from `n_bytes` random bytes.
+
+    Arguments:
+        n_bytes: The number of bytes to generate for the token.
+        columns: The list of columns to check for uniqueness.
+
+    Returns:
+        A unique token that is not present in the given list of columns.
+
+    Raises:
+        AssertionError: If a unique token cannot be generated after 100 attempts.
+
+    Examples:
+        >>> import narwhals.stable.v1 as nw
+        >>> columns = ["abc", "xyz"]
+        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
+        True
+    """
+    return nw_generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
+
+
 def get_native_namespace(obj: Any) -> Any:
     """
     Get native namespace from object.
@@ -2447,6 +2476,7 @@ def from_dict(
     "maybe_get_index",
     "maybe_reset_index",
     "maybe_set_index",
+    "generate_temporary_column_name",
     "get_native_namespace",
     "get_level",
     "all",
diff --git a/narwhals/utils.py b/narwhals/utils.py
@@ -10,6 +10,7 @@
 from typing import Sequence
 from typing import TypeVar
 from typing import cast
+from warnings import warn
 
 from narwhals._exceptions import ColumnNotFoundError
 from narwhals.dependencies import get_cudf
@@ -481,17 +482,37 @@ def is_ordered_categorical(series: Series) -> bool:
 
 
 def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: no cover
-    """Generates a unique token of specified n_bytes that is not present in the given list of columns.
+    warn(
+        "Use `generate_temporary_column_name` instead. `generate_unique_token` is "
+        "deprecated and it will be removed in future versions",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
+
+
+def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
+    """Generates a unique token of specified `n_bytes` that is not present in the given
+    list of columns.
+
+    It relies on Python's [`secrets.token_hex`](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
+    function, which returns a random hexadecimal string built from `n_bytes` random bytes.
 
     Arguments:
-        n_bytes : The number of bytes to generate for the token.
-        columns : The list of columns to check for uniqueness.
+        n_bytes: The number of bytes to generate for the token.
+        columns: The list of columns to check for uniqueness.
 
     Returns:
         A unique token that is not present in the given list of columns.
 
     Raises:
         AssertionError: If a unique token cannot be generated after 100 attempts.
+
+    Examples:
+        >>> import narwhals as nw
+        >>> columns = ["abc", "xyz"]
+        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
+        True
     """
     counter = 0
     while True:
@@ -502,8 +523,8 @@ def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: n
         counter += 1
         if counter > 100:
             msg = (
-                "Internal Error: Narwhals was not able to generate a column name to perform given "
-                "join operation"
+                "Internal Error: Narwhals was not able to generate a column name with "
+                f"{n_bytes=} and not in {columns}"
             )
             raise AssertionError(msg)
 
diff --git a/tests/utils_test.py b/tests/utils_test.py
@@ -1,8 +1,12 @@
 from __future__ import annotations
 
+import string
+
+import hypothesis.strategies as st
 import pandas as pd
 import polars as pl
 import pytest
+from hypothesis import given
 from pandas.testing import assert_frame_equal
 from pandas.testing import assert_index_equal
 from pandas.testing import assert_series_equal
@@ -147,3 +151,29 @@ def test_maybe_convert_dtypes_polars() -> None:
 def test_get_trivial_version_with_uninstalled_module() -> None:
     result = get_module_version_as_tuple("non_existent_module")
     assert result == (0, 0, 0)
+
+
+@given(n_bytes=st.integers(1, 100))  # type: ignore[misc]
+def test_generate_temporary_column_name(n_bytes: int) -> None:
+    columns = ["abc", "XYZ"]
+
+    temp_col_name = nw.generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
+    assert temp_col_name not in columns
+
+
+def test_generate_temporary_column_name_raise() -> None:
+    from itertools import product
+
+    columns = [
+        "".join(t)
+        for t in product(
+            string.ascii_lowercase + string.digits,
+            string.ascii_lowercase + string.digits,
+        )
+    ]
+
+    with pytest.raises(
+        AssertionError,
+        match="Internal Error: Narwhals was not able to generate a column name with ",
+    ):
+        nw.generate_temporary_column_name(n_bytes=1, columns=columns)

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`from narwhals.dependencies import is_numpy_array`
`20`	`20`	`from narwhals.utils import Implementation`
`21`	`21`	`from narwhals.utils import flatten`
`22`		`-from narwhals.utils import generate_unique_token`
	`22`	`+from narwhals.utils import generate_temporary_column_name`
`23`	`23`	`from narwhals.utils import is_sequence_but_not_str`
`24`	`24`	`from narwhals.utils import parse_columns_to_drop`
`25`	`25`
`@@ -506,7 +506,7 @@ def join(`
`506`	`506`	`self._implementation is Implementation.PANDAS`
`507`	`507`	`and self._backend_version < (1, 4)`
`508`	`508`	`):`
`509`		`- key_token = generate_unique_token(`
	`509`	`+ key_token = generate_temporary_column_name(`
`510`	`510`	`n_bytes=8, columns=[*self.columns, *other.columns]`
`511`	`511`	`)`
`512`	`512`
`@@ -541,7 +541,7 @@ def join(`
`541`	`541`	`)`
`542`	`542`	`)`
`543`	`543`	`else:`
`544`		`- indicator_token = generate_unique_token(`
	`544`	`+ indicator_token = generate_temporary_column_name(`
`545`	`545`	`n_bytes=8, columns=[*self.columns, *other.columns]`
`546`	`546`	`)`
`547`	`547`