Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*.Rcheck
*.rds
*.tar.gz
*.tar.bz2
*conf
*buffer
*.model
Expand Down
2 changes: 1 addition & 1 deletion include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ class SparsePage {
* \return The maximum number of columns encountered in this input batch. Useful when pushing many adapter batches to work out the total number of columns.
*/
template <typename AdapterBatchT>
uint64_t Push(const AdapterBatchT& batch, float missing, int nthread);
bst_idx_t Push(AdapterBatchT const& batch, float missing, std::int32_t nthread);

/*!
* \brief Push a sparse page
Expand Down
54 changes: 52 additions & 2 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,15 @@

import numpy as np

from ._typing import CNumericPtr, DataType, NumpyDType, NumpyOrCupy
from .compat import import_cupy, import_pyarrow, lazy_isinstance
from ._typing import (
ArrowCatList,
CNumericPtr,
DataType,
FeatureNames,
NumpyDType,
NumpyOrCupy,
)
from .compat import import_cupy, import_pyarrow, is_pyarrow_available, lazy_isinstance

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -480,3 +487,46 @@ def cudf_cat_inf(

joffset, jdata, buf = _arrow_cat_inf(cats.to_arrow(), codes)
return joffset, jdata, buf


class Categories:
    """An internal storage class for categories returned by the DMatrix and the
    Booster. This class is designed to be opaque. It is intended to be used
    exclusively by XGBoost as an intermediate storage for re-coding categorical
    data.

    The categories are saved along with the booster object. As a result, users
    don't need to preserve this class for re-coding. Use the booster model IO
    instead if you want to preserve the categories in a stable format.

    """

    def __init__(
        self,
        handle: ctypes.c_void_p,
        feature_names: "FeatureNames",
        arrow_arrays: "Optional[ArrowCatList]",
    ) -> None:
        # Opaque C handle owned by this object; released in __del__.
        self._handle = handle
        self._feature_names = feature_names
        # Only populated when the container was created with the
        # `export_to_arrow` option; otherwise None.
        self._arrow_arrays = arrow_arrays

    def to_arrow(self) -> "Optional[ArrowCatList]":
        """Get the categories in the dataset. The results are stored in a list of
        arrow arrays with one array for each feature. If a feature is numerical,
        then the corresponding element in the list is None. A value error is
        raised if this container is created without the export option.

        """
        if self._arrow_arrays is None:
            raise ValueError(
                "The `export_to_arrow` option of the `get_categories` method"
                " is required."
            )
        return self._arrow_arrays

    def __del__(self) -> None:
        # __del__ may run on a partially constructed instance (if __init__
        # raised) or with a null handle; guard instead of asserting, since
        # `assert` is stripped under `python -O` and exceptions in finalizers
        # are swallowed anyway.
        handle = getattr(self, "_handle", None)
        if handle is None:
            return
        # Local import to avoid a circular dependency with .core.
        from .core import _LIB, _check_call

        _check_call(_LIB.XGBCategoriesFree(handle))
        self._handle = None
7 changes: 7 additions & 0 deletions python-package/xgboost/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Sequence,
Tuple,
Type,
TypeAlias,
TypeVar,
Union,
)
Expand All @@ -30,9 +31,15 @@

ArrayLike = Any
if TYPE_CHECKING:
import pyarrow as pa

PathLike = Union[str, os.PathLike[str]]
else:
PathLike = Union[str, os.PathLike]

ArrowCatCol: TypeAlias = Optional[Union["pa.StringArray", "pa.NumericArray"]]
ArrowCatList: TypeAlias = List[Tuple[str, Optional[ArrowCatCol]]]

CupyT = ArrayLike # maybe need a stub for cupy arrays
NumpyOrCupy = Any
NumpyDType = Union[str, Type[np.number]] # pylint: disable=invalid-name
Expand Down
100 changes: 63 additions & 37 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import scipy.sparse

from ._data_utils import (
Categories,
TransformedDf,
array_interface,
cuda_array_interface,
Expand All @@ -47,6 +48,7 @@
from ._typing import (
_T,
ArrayLike,
ArrowCatList,
BoosterParam,
CFloatPtr,
CNumeric,
Expand Down Expand Up @@ -76,9 +78,6 @@
)
from .libpath import find_lib_path, is_sphinx_build

if TYPE_CHECKING:
import pyarrow as pa


class XGBoostError(ValueError):
"""Error thrown by xgboost trainer."""
Expand Down Expand Up @@ -781,27 +780,22 @@ def inner_f(*args: Any, **kwargs: Any) -> _T:

def _get_categories(
cfn: Callable[[ctypes.c_char_p], int],
feature_names: Optional[FeatureNames],
feature_names: FeatureNames,
n_features: int,
) -> Optional[Dict[str, "pa.DictionaryArray"]]:
) -> Optional[ArrowCatList]:
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
if not TYPE_CHECKING:
pa = import_pyarrow()
else:
import pyarrow as pa

fnames = feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}
results: ArrowCatList = []

ret = ctypes.c_char_p()
_check_call(cfn(ret))
if ret.value is None:
return None
assert ret.value is not None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
Expand All @@ -811,19 +805,19 @@ def _get_categories(
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
results.append((feature_names[fidx], None))
continue

if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
results.append((feature_names[fidx], pa_values))
continue

joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
offsets = from_array_interface(joffsets)
values = from_array_interface(jvalues)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
Expand All @@ -832,7 +826,7 @@ def _get_categories(
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict
results.append((feature_names[fidx], pa_dict))

return results

Expand Down Expand Up @@ -1346,22 +1340,41 @@ def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
assert data.dtype == np.float32
return indptr, data

def get_categories(self, export_to_arrow: bool = False) -> Categories:
    """Get the categories in the dataset.

    .. versionadded:: 3.1.0

    .. warning::

        This function is still a work in progress.

    Parameters
    ----------
    export_to_arrow :
        When True, the returned container also exports the categories to a
        list of `pyarrow` arrays, one ``(feature name, array)`` pair per
        feature, retrievable via :py:meth:`Categories.to_arrow`.

    Returns
    -------
    An opaque :py:class:`Categories` container used by XGBoost as
    intermediate storage for re-coding categorical data.

    """
    fnames = self.feature_names
    n_features = self.num_col()
    # Fall back to stringified column indices when the DMatrix carries no
    # feature names, so the container always has a name per feature.
    if fnames is None:
        fnames = [str(i) for i in range(n_features)]

    hdl = ctypes.c_void_p()
    if export_to_arrow:
        # This call fills `hdl` and returns the categories as arrow arrays.
        arrow_arrays = _get_categories(
            lambda ret: _LIB.XGBDMatrixGetCategoriesExportToArrow(
                self.handle, ctypes.byref(hdl), ctypes.byref(ret)
            ),
            fnames,
            n_features,
        )
    else:
        arrow_arrays = None
        _check_call(_LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(hdl)))

    return Categories(hdl, fnames, arrow_arrays)

def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
Expand Down Expand Up @@ -2323,9 +2336,8 @@ def feature_names(self) -> Optional[FeatureNames]:
def feature_names(self, features: Optional[FeatureNames]) -> None:
self._set_feature_info(features, "feature_name")

def get_categories(self, export_to_arrow: bool = False) -> Categories:
    """Get the categories in the dataset.

    .. warning::

        This function is still a work in progress.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    export_to_arrow :
        When True, the returned container also exports the categories to a
        list of `pyarrow` arrays, one ``(feature name, array)`` pair per
        feature, retrievable via :py:meth:`Categories.to_arrow`.

    Returns
    -------
    An opaque :py:class:`Categories` container used by XGBoost as
    intermediate storage for re-coding categorical data.

    """
    fnames = self.feature_names
    n_features = self.num_features()
    # Fall back to stringified feature indices when the model carries no
    # feature names, so the container always has a name per feature.
    if fnames is None:
        fnames = [str(i) for i in range(n_features)]

    hdl = ctypes.c_void_p()
    if export_to_arrow:
        # This call fills `hdl` and returns the categories as arrow arrays.
        arrow_arrays = _get_categories(
            lambda ret: _LIB.XGBoosterGetCategoriesExportToArrow(
                self.handle, ctypes.byref(hdl), ctypes.byref(ret)
            ),
            fnames,
            n_features,
        )
    else:
        arrow_arrays = None
        # Fix: query the Booster's own C API entry point.  The previous code
        # called `XGBDMatrixGetCategories`, which expects a DMatrix handle,
        # with `self.handle` being a Booster handle.
        _check_call(_LIB.XGBoosterGetCategories(self.handle, ctypes.byref(hdl)))

    return Categories(hdl, fnames, arrow_arrays)

def set_param(
self,
Expand Down
Loading
Loading