From cc6e7822ee88174d55cae7ab3e1d34f8e2de9735 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 10 Nov 2025 19:14:39 +0800
Subject: [PATCH 1/3] Support arrow-backed pandas categorical columns.

---
 python-package/xgboost/_data_utils.py | 18 +++++++++++
 python-package/xgboost/data.py        | 45 ++++++++++++++++-----------
 2 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/python-package/xgboost/_data_utils.py b/python-package/xgboost/_data_utils.py
index 4531848aa782..e9b6256c7e74 100644
--- a/python-package/xgboost/_data_utils.py
+++ b/python-package/xgboost/_data_utils.py
@@ -749,3 +749,21 @@ def array_interface(self) -> bytes:
     @abstractmethod
     def shape(self) -> Tuple[int, int]:
         """Return the shape of the dataframe."""
+
+
+class ArrowCatMixin:
+    """Mixin for handling arrow-backed dictionary array in a transformed dataframe."""
+
+    def _push_arrow_cat(
+        self, col: "pa.DictionaryArray", aitfs: AifType, temporary_buffers: List[Tuple]
+    ) -> None:
+        pa = import_pyarrow()
+        cats = col.dictionary  # the category values
+        codes = col.indices  # per-row indices into `cats`
+        if not isinstance(cats, (pa.StringArray, pa.LargeStringArray)):
+            raise TypeError(
+                "Only string-based categorical index is supported for arrow."
+            )
+        jnames, jcodes, buf = arrow_cat_inf(cats, codes)
+        temporary_buffers.append(buf)  # presumably keeps `buf` alive; confirm
+        aitfs.append((jnames, jcodes))
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index f0917c615d39..d8e13166ad12 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -25,6 +25,7 @@
 
 from ._data_utils import (
     AifType,
+    ArrowCatMixin,
     Categories,
     DfCatAccessor,
     TransformedDf,
@@ -493,8 +494,7 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
     return is_sparse(dtype)
 
 
-def pandas_pa_type(ser: Any) -> np.ndarray:
-    """Handle pandas pyarrow extention."""
+def pandas_pa_chunk(ser: Any) -> "pa.Array":
     pd = import_pandas()
 
     if TYPE_CHECKING:
@@ -511,6 +511,14 @@ def pandas_pa_type(ser: Any) -> np.ndarray:
     aa: "pa.ChunkedArray" = d_array.__arrow_array__()
     # combine_chunks takes the most significant amount of time
     chunk: "pa.Array" = aa.combine_chunks()
+    return chunk
+
+
+def pandas_pa_type(ser: Any) -> np.ndarray:
+    """Handle numerical pandas pyarrow extension."""
+    pa = import_pyarrow()
+
+    chunk = pandas_pa_chunk(ser)
     # When there's null value, we have to use copy
     zero_copy = chunk.null_count == 0 and not pa.types.is_boolean(chunk.type)
     # Alternately, we can use chunk.buffers(), which returns a list of buffers and
@@ -524,6 +532,11 @@ def pandas_pa_type(ser: Any) -> np.ndarray:
     return arr
 
 
+def pandas_pa_cat_type(ser: Any) -> "pa.Array":
+    """Return the combined arrow chunk of a categorical pandas pyarrow extension."""
+    return pandas_pa_chunk(ser)
+
+
 @functools.cache
 def _lazy_has_npdtypes() -> bool:
     return np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
@@ -582,10 +595,9 @@ def oth_type(ser: "PdSeries") -> np.ndarray:
 
     for col, dtype in zip(data.columns, data.dtypes):
         if is_pa_ext_categorical_dtype(dtype):
-            raise ValueError(
-                "pyarrow dictionary type is not supported. Use pandas category instead."
-            )
-        if is_pd_cat_dtype(dtype):
+            arr_cat = pandas_pa_cat_type(data[col])
+            result.append(arr_cat)
+        elif is_pd_cat_dtype(dtype):
             result.append(cat_codes(data[col]))
         elif is_pa_ext_dtype(dtype):
             result.append(pandas_pa_type(data[col]))
@@ -606,7 +618,7 @@ def oth_type(ser: "PdSeries") -> np.ndarray:
     return result
 
 
-class PandasTransformed(TransformedDf):
+class PandasTransformed(TransformedDf, ArrowCatMixin):
     """A storage class for transformed pandas DataFrame."""
 
     def __init__(
@@ -620,8 +632,11 @@ def __init__(
 
         # Get the array interface representation for each column.
         for col in self.columns:
-            if _is_df_cat(col):
-                # Categorical column
+            if is_arrow_dict(col):
+                # Arrow categorical column
+                self._push_arrow_cat(col, aitfs, self.temporary_buffers)
+            elif _is_df_cat(col):
+                # Pandas categorical column
                 jnames, jcodes, buf = pd_cat_inf(col.categories, col.codes)
                 self.temporary_buffers.append(buf)
                 aitfs.append((jnames, jcodes))
@@ -755,7 +770,7 @@ def _from_pandas_series(
     )
 
 
-class ArrowTransformed(TransformedDf):
+class ArrowTransformed(TransformedDf, ArrowCatMixin):
     """A storage class for transformed arrow table."""
 
     def __init__(
@@ -776,15 +791,7 @@ def __init__(
 
         def push_series(col: Union["pa.NumericArray", "pa.DictionaryArray"]) -> None:
             if isinstance(col, pa.DictionaryArray):
-                cats = col.dictionary
-                codes = col.indices
-                if not isinstance(cats, (pa.StringArray, pa.LargeStringArray)):
-                    raise TypeError(
-                        "Only string-based categorical index is supported for arrow."
-                    )
-                jnames, jcodes, buf = arrow_cat_inf(cats, codes)
-                self.temporary_buffers.append(buf)
-                aitfs.append((jnames, jcodes))
+                self._push_arrow_cat(col, aitfs, self.temporary_buffers)
             else:
                 jdata = _arrow_array_inf(col)
                 aitfs.append(jdata)

From 160d0f07a2fff3dfbaa9065bc36c6604f1d7c1f1 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 10 Nov 2025 19:27:58 +0800
Subject: [PATCH 2/3] simple tests.

---
 python-package/xgboost/testing/ordinal.py | 21 +++++++++++++++++++--
 tests/python-gpu/test_gpu_ordinal.py      |  2 +-
 tests/python/test_ordinal.py              |  7 ++++++-
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/testing/ordinal.py b/python-package/xgboost/testing/ordinal.py
index 6629868778f4..e7ec726b5e9c 100644
--- a/python-package/xgboost/testing/ordinal.py
+++ b/python-package/xgboost/testing/ordinal.py
@@ -335,7 +335,7 @@ def run_basic_predict(DMatrixT: Type, device: Device, tdevice: Device) -> None:
     assert_allclose(device, predt0, predt2)
 
 
-def run_cat_predict(device: Device) -> None:
+def run_cat_predict(device: Device, use_arrow: bool) -> None:
     """Basic tests for re-coding during prediction."""
     Df, _ = get_df_impl(device)
 
@@ -343,7 +343,24 @@ def run_cat_predict(device: Device) -> None:
         run_basic_predict(dm, device, device)
 
     def run_mixed(DMatrixT: Type) -> None:
-        df = Df({"b": [2, 1, 3], "c": ["cdef", "abc", "def"]}, dtype="category")
+        b_list = [2, 1, 3]
+        c_list = ["cdef", "abc", "def"]
+        if use_arrow:
+            import pandas as pd
+            import pyarrow as pa
+
+            c_typ = pa.DictionaryArray.from_arrays(
+                pa.array([0, 1, 2]),
+                pa.array(["cdef", "abc", "def"], type=pa.large_utf8()),
+            )
+            c_ser = pd.Series(c_typ, dtype=pd.ArrowDtype(c_typ.type))
+            b_typ = pa.DictionaryArray.from_arrays(
+                pa.array([0, 1, 2], pa.array(b_list))
+            )
+            b_ser = pd.Series(b_typ, dtype=pd.ArrowDtype(b_typ.type))
+            df = Df({"b": b_ser, "c": c_ser})
+        else:
+            df = Df({"b": b_list, "c": c_list}, dtype="category")
         y = np.array([0, 1, 2])
 
         # used with the next df
diff --git a/tests/python-gpu/test_gpu_ordinal.py b/tests/python-gpu/test_gpu_ordinal.py
index e091ffc80773..58ae79cc517f 100644
--- a/tests/python-gpu/test_gpu_ordinal.py
+++ b/tests/python-gpu/test_gpu_ordinal.py
@@ -42,7 +42,7 @@ def test_cat_container_iter() -> None:
 
 
 def test_cat_predict() -> None:
-    run_cat_predict("cuda")
+    run_cat_predict("cuda", False)
 
 
 def test_cat_invalid() -> None:
diff --git a/tests/python/test_ordinal.py b/tests/python/test_ordinal.py
index 3e76a8b37eee..859528ee7232 100644
--- a/tests/python/test_ordinal.py
+++ b/tests/python/test_ordinal.py
@@ -34,7 +34,12 @@ def test_cat_container_iter() -> None:
 
 
 def test_cat_predict() -> None:
-    run_cat_predict("cpu")
+    run_cat_predict("cpu", False)
+
+
+@pytest.mark.skipif(**tm.no_arrow())
+def test_cat_predict_arrow() -> None:
+    run_cat_predict("cpu", True)
 
 
 def test_cat_invalid() -> None:

From fa6523a736595ca046c08ad86d0b134793793f57 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 11 Nov 2025 01:02:08 +0800
Subject: [PATCH 3/3] cleanup.

---
 python-package/xgboost/data.py            | 1 -
 python-package/xgboost/testing/ordinal.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index d8e13166ad12..655ba1939626 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -35,7 +35,6 @@
     array_hasobject,
     array_interface,
     array_interface_dict,
-    arrow_cat_inf,
     check_cudf_meta,
     cuda_array_interface,
     cuda_array_interface_dict,
diff --git a/python-package/xgboost/testing/ordinal.py b/python-package/xgboost/testing/ordinal.py
index e7ec726b5e9c..6c6ab20d698f 100644
--- a/python-package/xgboost/testing/ordinal.py
+++ b/python-package/xgboost/testing/ordinal.py
@@ -355,7 +355,7 @@ def run_mixed(DMatrixT: Type) -> None:
             )
             c_ser = pd.Series(c_typ, dtype=pd.ArrowDtype(c_typ.type))
             b_typ = pa.DictionaryArray.from_arrays(
-                pa.array([0, 1, 2], pa.array(b_list))
+                pa.array([0, 1, 2]), pa.array(b_list)
             )
             b_ser = pd.Series(b_typ, dtype=pd.ArrowDtype(b_typ.type))
             df = Df({"b": b_ser, "c": c_ser})