Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*.Rcheck
*.rds
*.tar.gz
*.tar.bz2
*conf
*buffer
*.model
Expand Down
2 changes: 1 addition & 1 deletion include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ class SparsePage {
* \return The maximum number of columns encountered in this input batch. Useful when pushing many adapter batches to work out the total number of columns.
*/
template <typename AdapterBatchT>
uint64_t Push(const AdapterBatchT& batch, float missing, int nthread);
bst_idx_t Push(AdapterBatchT const& batch, float missing, std::int32_t nthread);

/*!
* \brief Push a sparse page
Expand Down
54 changes: 52 additions & 2 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,15 @@

import numpy as np

from ._typing import CNumericPtr, DataType, NumpyDType, NumpyOrCupy
from .compat import import_cupy, import_pyarrow, lazy_isinstance
from ._typing import (
ArrowCatList,
CNumericPtr,
DataType,
FeatureNames,
NumpyDType,
NumpyOrCupy,
)
from .compat import import_cupy, import_pyarrow, is_pyarrow_available, lazy_isinstance

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -480,3 +487,46 @@ def cudf_cat_inf(

joffset, jdata, buf = _arrow_cat_inf(cats.to_arrow(), codes)
return joffset, jdata, buf


class Categories:
    """An internal storage class for categories returned by the DMatrix and the
    Booster. This class is designed to be opaque. It is intended to be used
    exclusively by XGBoost as an intermediate storage for re-coding categorical
    data.

    The categories are saved along with the booster object. As a result, users
    don't need to preserve this class for re-coding. Use the booster model IO
    instead if you want to preserve the categories in a stable format.

    """

    def __init__(
        self,
        handle: ctypes.c_void_p,
        feature_names: "FeatureNames",
        arrow_arrays: "Optional[ArrowCatList]",
    ) -> None:
        # Opaque C handle owned by this object; released in __del__.
        self._handle = handle
        self._feature_names = feature_names
        # Only populated when the container was created with the
        # `export_to_arrow` option; otherwise None.
        self._arrow_arrays = arrow_arrays

    def to_arrow(self) -> "Optional[ArrowCatList]":
        """Get the categories in the dataset. The results are stored in a list of
        arrow arrays with one array for each feature. If a feature is numerical,
        then the corresponding element in the list is None. A value error is
        raised if this container is created without the export option.

        """
        if self._arrow_arrays is None:
            raise ValueError(
                "The `export_to_arrow` option of the `get_categories` method"
                " is required."
            )
        return self._arrow_arrays

    def __del__(self) -> None:
        # __del__ may run on a partially constructed instance (if __init__
        # raised) or with a null handle; guard instead of asserting, since
        # `assert` is stripped under `python -O` and exceptions in finalizers
        # are swallowed anyway.
        handle = getattr(self, "_handle", None)
        if handle is None:
            return
        # Local import to avoid a circular dependency with .core.
        from .core import _LIB, _check_call

        _check_call(_LIB.XGBCategoriesFree(handle))
        self._handle = None
7 changes: 7 additions & 0 deletions python-package/xgboost/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Sequence,
Tuple,
Type,
TypeAlias,
TypeVar,
Union,
)
Expand All @@ -30,9 +31,15 @@

ArrayLike = Any
if TYPE_CHECKING:
import pyarrow as pa

PathLike = Union[str, os.PathLike[str]]
else:
PathLike = Union[str, os.PathLike]

ArrowCatCol: TypeAlias = Optional[Union["pa.StringArray", "pa.NumericArray"]]
ArrowCatList: TypeAlias = List[Tuple[str, Optional[ArrowCatCol]]]

CupyT = ArrayLike # maybe need a stub for cupy arrays
NumpyOrCupy = Any
NumpyDType = Union[str, Type[np.number]] # pylint: disable=invalid-name
Expand Down
100 changes: 63 additions & 37 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import scipy.sparse

from ._data_utils import (
Categories,
TransformedDf,
array_interface,
cuda_array_interface,
Expand All @@ -47,6 +48,7 @@
from ._typing import (
_T,
ArrayLike,
ArrowCatList,
BoosterParam,
CFloatPtr,
CNumeric,
Expand Down Expand Up @@ -76,9 +78,6 @@
)
from .libpath import find_lib_path, is_sphinx_build

if TYPE_CHECKING:
import pyarrow as pa


class XGBoostError(ValueError):
"""Error thrown by xgboost trainer."""
Expand Down Expand Up @@ -781,27 +780,22 @@ def inner_f(*args: Any, **kwargs: Any) -> _T:

def _get_categories(
cfn: Callable[[ctypes.c_char_p], int],
feature_names: Optional[FeatureNames],
feature_names: FeatureNames,
n_features: int,
) -> Optional[Dict[str, "pa.DictionaryArray"]]:
) -> Optional[ArrowCatList]:
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
if not TYPE_CHECKING:
pa = import_pyarrow()
else:
import pyarrow as pa

fnames = feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}
results: ArrowCatList = []

ret = ctypes.c_char_p()
_check_call(cfn(ret))
if ret.value is None:
return None
assert ret.value is not None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
Expand All @@ -811,19 +805,19 @@ def _get_categories(
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
results.append((feature_names[fidx], None))
continue

if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
results.append((feature_names[fidx], pa_values))
continue

joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
offsets = from_array_interface(joffsets)
values = from_array_interface(jvalues)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
Expand All @@ -832,7 +826,7 @@ def _get_categories(
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict
results.append((feature_names[fidx], pa_dict))

return results

Expand Down Expand Up @@ -1346,22 +1340,41 @@ def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
assert data.dtype == np.float32
return indptr, data

def get_categories(self, export_to_arrow: bool = False) -> Categories:
    """Get the categories in the dataset.

    .. versionadded:: 3.1.0

    .. warning::

        This function is still a work in progress.

    Parameters
    ----------
    export_to_arrow :
        When True, the returned container also exports the categories to a
        list of `pyarrow` arrays, one ``(feature name, array)`` pair per
        feature, retrievable via :py:meth:`Categories.to_arrow`.

    Returns
    -------
    An opaque :py:class:`Categories` container used by XGBoost as
    intermediate storage for re-coding categorical data.

    """
    fnames = self.feature_names
    n_features = self.num_col()
    # Fall back to stringified column indices when the DMatrix carries no
    # feature names, so the container always has a name per feature.
    if fnames is None:
        fnames = [str(i) for i in range(n_features)]

    hdl = ctypes.c_void_p()
    if export_to_arrow:
        # This call fills `hdl` and returns the categories as arrow arrays.
        arrow_arrays = _get_categories(
            lambda ret: _LIB.XGBDMatrixGetCategoriesExportToArrow(
                self.handle, ctypes.byref(hdl), ctypes.byref(ret)
            ),
            fnames,
            n_features,
        )
    else:
        arrow_arrays = None
        _check_call(_LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(hdl)))

    return Categories(hdl, fnames, arrow_arrays)

def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
Expand Down Expand Up @@ -2323,9 +2336,8 @@ def feature_names(self) -> Optional[FeatureNames]:
def feature_names(self, features: Optional[FeatureNames]) -> None:
self._set_feature_info(features, "feature_name")

def get_categories(self, export_to_arrow: bool = False) -> Categories:
    """Get the categories in the dataset.

    .. warning::

        This function is still a work in progress.

    .. versionadded:: 3.1.0

    Parameters
    ----------
    export_to_arrow :
        When True, the returned container also exports the categories to a
        list of `pyarrow` arrays, one ``(feature name, array)`` pair per
        feature, retrievable via :py:meth:`Categories.to_arrow`.

    Returns
    -------
    An opaque :py:class:`Categories` container used by XGBoost as
    intermediate storage for re-coding categorical data.

    """
    fnames = self.feature_names
    n_features = self.num_features()
    # Fall back to stringified feature indices when the model carries no
    # feature names, so the container always has a name per feature.
    if fnames is None:
        fnames = [str(i) for i in range(n_features)]

    hdl = ctypes.c_void_p()
    if export_to_arrow:
        # This call fills `hdl` and returns the categories as arrow arrays.
        arrow_arrays = _get_categories(
            lambda ret: _LIB.XGBoosterGetCategoriesExportToArrow(
                self.handle, ctypes.byref(hdl), ctypes.byref(ret)
            ),
            fnames,
            n_features,
        )
    else:
        arrow_arrays = None
        # Fix: query the Booster's own C API entry point.  The previous code
        # called `XGBDMatrixGetCategories`, which expects a DMatrix handle,
        # with `self.handle` being a Booster handle.
        _check_call(_LIB.XGBoosterGetCategories(self.handle, ctypes.byref(hdl)))

    return Categories(hdl, fnames, arrow_arrays)

def set_param(
self,
Expand Down
Loading
Loading