From cc6e7822ee88174d55cae7ab3e1d34f8e2de9735 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 10 Nov 2025 19:14:39 +0800 Subject: [PATCH 1/3] Support arrow-backed pandas categorical columns. --- python-package/xgboost/_data_utils.py | 18 +++++++++++ python-package/xgboost/data.py | 45 ++++++++++++++++----------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/python-package/xgboost/_data_utils.py b/python-package/xgboost/_data_utils.py index 4531848aa782..e9b6256c7e74 100644 --- a/python-package/xgboost/_data_utils.py +++ b/python-package/xgboost/_data_utils.py @@ -749,3 +749,21 @@ def array_interface(self) -> bytes: @abstractmethod def shape(self) -> Tuple[int, int]: """Return the shape of the dataframe.""" + + +class ArrowCatMixin: + """Mixin for handling arrow-backed dictionary array in a transformed dataframe.""" + + def _push_arrow_cat( + self, col: "pa.DictionaryArray", aitfs: AifType, temporary_buffers: List[Tuple] + ) -> None: + pa = import_pyarrow() + cats = col.dictionary + codes = col.indices + if not isinstance(cats, (pa.StringArray, pa.LargeStringArray)): + raise TypeError( + "Only string-based categorical index is supported for arrow." + ) + jnames, jcodes, buf = arrow_cat_inf(cats, codes) + temporary_buffers.append(buf) + aitfs.append((jnames, jcodes)) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index f0917c615d39..d8e13166ad12 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -25,6 +25,7 @@ from ._data_utils import ( AifType, + ArrowCatMixin, Categories, DfCatAccessor, TransformedDf, @@ -493,8 +494,7 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool: return is_sparse(dtype) -def pandas_pa_type(ser: Any) -> np.ndarray: - """Handle pandas pyarrow extention.""" +def pandas_pa_chunk(ser: Any) -> "pa.Array": pd = import_pandas() if TYPE_CHECKING: @@ -511,6 +511,14 @@ def pandas_pa_type(ser: Any) -> np.ndarray: aa: "pa.ChunkedArray" = d_array.__arrow_array__() # combine_chunks takes the most significant amount of time chunk: "pa.Array" = aa.combine_chunks() + return chunk + + +def pandas_pa_type(ser: Any) -> np.ndarray: + """Handle numerical pandas pyarrow extention.""" + pa = import_pyarrow() + + chunk = pandas_pa_chunk(ser) # When there's null value, we have to use copy zero_copy = chunk.null_count == 0 and not pa.types.is_boolean(chunk.type) # Alternately, we can use chunk.buffers(), which returns a list of buffers and @@ -524,6 +532,11 @@ def pandas_pa_type(ser: Any) -> np.ndarray: return arr +def pandas_pa_cat_type(ser: Any) -> np.ndarray: + """Handle categorical pandas pyarrow extention.""" + return pandas_pa_chunk(ser) + + @functools.cache def _lazy_has_npdtypes() -> bool: return np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0") @@ -582,10 +595,9 @@ def oth_type(ser: "PdSeries") -> np.ndarray: for col, dtype in zip(data.columns, data.dtypes): if is_pa_ext_categorical_dtype(dtype): - raise ValueError( - "pyarrow dictionary type is not supported. Use pandas category instead." - ) - if is_pd_cat_dtype(dtype): + arr_cat = pandas_pa_cat_type(data[col]) + result.append(arr_cat) + elif is_pd_cat_dtype(dtype): result.append(cat_codes(data[col])) elif is_pa_ext_dtype(dtype): result.append(pandas_pa_type(data[col])) @@ -606,7 +618,7 @@ def oth_type(ser: "PdSeries") -> np.ndarray: return result -class PandasTransformed(TransformedDf): +class PandasTransformed(TransformedDf, ArrowCatMixin): """A storage class for transformed pandas DataFrame.""" def __init__( @@ -620,8 +632,11 @@ def __init__( # Get the array interface representation for each column. for col in self.columns: - if _is_df_cat(col): - # Categorical column + if is_arrow_dict(col): + # Arrow categorical column + self._push_arrow_cat(col, aitfs, self.temporary_buffers) + elif _is_df_cat(col): + # Pandas categorical column jnames, jcodes, buf = pd_cat_inf(col.categories, col.codes) self.temporary_buffers.append(buf) aitfs.append((jnames, jcodes)) @@ -755,7 +770,7 @@ def _from_pandas_series( ) -class ArrowTransformed(TransformedDf): +class ArrowTransformed(TransformedDf, ArrowCatMixin): """A storage class for transformed arrow table.""" def __init__( @@ -776,15 +791,7 @@ def __init__( def push_series(col: Union["pa.NumericArray", "pa.DictionaryArray"]) -> None: if isinstance(col, pa.DictionaryArray): - cats = col.dictionary - codes = col.indices - if not isinstance(cats, (pa.StringArray, pa.LargeStringArray)): - raise TypeError( - "Only string-based categorical index is supported for arrow." - ) - jnames, jcodes, buf = arrow_cat_inf(cats, codes) - self.temporary_buffers.append(buf) - aitfs.append((jnames, jcodes)) + self._push_arrow_cat(col, aitfs, self.temporary_buffers) else: jdata = _arrow_array_inf(col) aitfs.append(jdata) From 160d0f07a2fff3dfbaa9065bc36c6604f1d7c1f1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 10 Nov 2025 19:27:58 +0800 Subject: [PATCH 2/3] simple tests. --- python-package/xgboost/testing/ordinal.py | 21 +++++++++++++++++++-- tests/python-gpu/test_gpu_ordinal.py | 2 +- tests/python/test_ordinal.py | 7 ++++++- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/testing/ordinal.py b/python-package/xgboost/testing/ordinal.py index 6629868778f4..e7ec726b5e9c 100644 --- a/python-package/xgboost/testing/ordinal.py +++ b/python-package/xgboost/testing/ordinal.py @@ -335,7 +335,7 @@ def run_basic_predict(DMatrixT: Type, device: Device, tdevice: Device) -> None: assert_allclose(device, predt0, predt2) -def run_cat_predict(device: Device) -> None: +def run_cat_predict(device: Device, use_arrow: bool) -> None: """Basic tests for re-coding during prediction.""" Df, _ = get_df_impl(device) @@ -343,7 +343,24 @@ def run_cat_predict(device: Device) -> None: run_basic_predict(dm, device, device) def run_mixed(DMatrixT: Type) -> None: - df = Df({"b": [2, 1, 3], "c": ["cdef", "abc", "def"]}, dtype="category") + b_list = [2, 1, 3] + c_list = ["cdef", "abc", "def"] + if use_arrow: + import pandas as pd + import pyarrow as pa + + c_typ = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2]), + pa.array(["cdef", "abc", "def"], type=pa.large_utf8()), + ) + c_ser = pd.Series(c_typ, dtype=pd.ArrowDtype(c_typ.type)) + b_typ = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2], pa.array(b_list)) + ) + b_ser = pd.Series(b_typ, dtype=pd.ArrowDtype(b_typ.type)) + df = Df({"b": b_ser, "c": c_ser}) + else: + df = Df({"b": b_list, "c": c_list}, dtype="category") y = np.array([0, 1, 2]) # used with the next df diff --git a/tests/python-gpu/test_gpu_ordinal.py b/tests/python-gpu/test_gpu_ordinal.py index e091ffc80773..58ae79cc517f 100644 --- a/tests/python-gpu/test_gpu_ordinal.py +++ b/tests/python-gpu/test_gpu_ordinal.py @@ -42,7 +42,7 @@ def test_cat_container_iter() -> None: def test_cat_predict() -> None: - run_cat_predict("cuda") + run_cat_predict("cuda", False) def test_cat_invalid() -> None: diff --git a/tests/python/test_ordinal.py b/tests/python/test_ordinal.py index 3e76a8b37eee..859528ee7232 100644 --- a/tests/python/test_ordinal.py +++ b/tests/python/test_ordinal.py @@ -34,7 +34,12 @@ def test_cat_container_iter() -> None: def test_cat_predict() -> None: - run_cat_predict("cpu") + run_cat_predict("cpu", False) + + +@pytest.mark.skipif(**tm.no_arrow()) +def test_cat_predict_arrow() -> None: + run_cat_predict("cpu", True) def test_cat_invalid() -> None: From fa6523a736595ca046c08ad86d0b134793793f57 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 11 Nov 2025 01:02:08 +0800 Subject: [PATCH 3/3] cleanup. --- python-package/xgboost/data.py | 1 - python-package/xgboost/testing/ordinal.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index d8e13166ad12..655ba1939626 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -35,7 +35,6 @@ array_hasobject, array_interface, array_interface_dict, - arrow_cat_inf, check_cudf_meta, cuda_array_interface, cuda_array_interface_dict, diff --git a/python-package/xgboost/testing/ordinal.py b/python-package/xgboost/testing/ordinal.py index e7ec726b5e9c..6c6ab20d698f 100644 --- a/python-package/xgboost/testing/ordinal.py +++ b/python-package/xgboost/testing/ordinal.py @@ -355,7 +355,7 @@ def run_mixed(DMatrixT: Type) -> None: ) c_ser = pd.Series(c_typ, dtype=pd.ArrowDtype(c_typ.type)) b_typ = pa.DictionaryArray.from_arrays( - pa.array([0, 1, 2], pa.array(b_list)) + pa.array([0, 1, 2]), pa.array(b_list) ) b_ser = pd.Series(b_typ, dtype=pd.ArrowDtype(b_typ.type)) df = Df({"b": b_ser, "c": c_ser})