Skip to content

Commit 3512e24

Browse files
authored
ENH: Implement multi-column DataFrame.quantiles (#44301)
1 parent d906b33 commit 3512e24

File tree

3 files changed

+361
-94
lines changed

3 files changed

+361
-94
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ Other enhancements
294294
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
295295
- :meth:`RangeIndex.union` can now return a :class:`RangeIndex` instead of an :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
296296
- :meth:`DataFrame.compare` now accepts a ``result_names`` argument that lets the user specify the names used in the result for the left and right DataFrames being compared. The default is ``'self'`` and ``'other'`` (:issue:`44354`)
297+
- :meth:`DataFrame.quantile` gained a ``method`` argument that can accept ``table`` to evaluate multi-column quantiles (:issue:`43881`)
297298
- :class:`Interval` now supports checking whether one interval is contained by another interval (:issue:`46613`)
298299
- Added a ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow the user to set the axis on a new object without necessarily copying the underlying data (:issue:`47932`)
299300
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`)

pandas/core/frame.py

+69-9
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,10 @@
8383
npt,
8484
)
8585
from pandas.compat._optional import import_optional_dependency
86-
from pandas.compat.numpy import function as nv
86+
from pandas.compat.numpy import (
87+
function as nv,
88+
np_percentile_argname,
89+
)
8790
from pandas.util._decorators import (
8891
Appender,
8992
Substitution,
@@ -11168,6 +11171,7 @@ def quantile(
1116811171
axis: Axis = 0,
1116911172
numeric_only: bool | lib.NoDefault = no_default,
1117011173
interpolation: QuantileInterpolation = "linear",
11174+
method: Literal["single", "table"] = "single",
1117111175
) -> Series | DataFrame:
1117211176
"""
1117311177
Return values at the given quantile over requested axis.
@@ -11196,6 +11200,10 @@ def quantile(
1119611200
* higher: `j`.
1119711201
* nearest: `i` or `j` whichever is nearest.
1119811202
* midpoint: (`i` + `j`) / 2.
11203+
method : {'single', 'table'}, default 'single'
11204+
Whether to compute quantiles per-column ('single') or over all columns
11205+
('table'). When 'table', the only allowed interpolation methods are
11206+
'nearest', 'lower', and 'higher'.
1119911207
1120011208
Returns
1120111209
-------
@@ -11225,6 +11233,17 @@ def quantile(
1122511233
0.1 1.3 3.7
1122611234
0.5 2.5 55.0
1122711235
11236+
Specifying `method='table'` will compute the quantile over all columns.
11237+
11238+
>>> df.quantile(.1, method="table", interpolation="nearest")
11239+
a 1
11240+
b 1
11241+
Name: 0.1, dtype: int64
11242+
>>> df.quantile([.1, .5], method="table", interpolation="nearest")
11243+
a b
11244+
0.1 1 1
11245+
0.5 3 100
11246+
1122811247
Specifying `numeric_only=False` will also compute the quantile of
1122911248
datetime and timedelta data.
1123011249
@@ -11251,13 +11270,18 @@ def quantile(
1125111270
# error: List item 0 has incompatible type "Union[float, Union[Union[
1125211271
# ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
1125311272
# expected "float"
11254-
res_df = self.quantile(
11255-
[q], # type: ignore[list-item]
11273+
res_df = self.quantile( # type: ignore[call-overload]
11274+
[q],
1125611275
axis=axis,
1125711276
numeric_only=numeric_only,
1125811277
interpolation=interpolation,
11278+
method=method,
1125911279
)
11260-
res = res_df.iloc[0]
11280+
if method == "single":
11281+
res = res_df.iloc[0]
11282+
else:
11283+
# cannot directly iloc over sparse arrays
11284+
res = res_df.T.iloc[:, 0]
1126111285
if axis == 1 and len(self) == 0:
1126211286
# GH#41544 try to get an appropriate dtype
1126311287
dtype = find_common_type(list(self.dtypes))
@@ -11285,11 +11309,47 @@ def quantile(
1128511309
res = self._constructor([], index=q, columns=cols, dtype=dtype)
1128611310
return res.__finalize__(self, method="quantile")
1128711311

11288-
# error: Argument "qs" to "quantile" of "BlockManager" has incompatible type
11289-
# "Index"; expected "Float64Index"
11290-
res = data._mgr.quantile(
11291-
qs=q, axis=1, interpolation=interpolation # type: ignore[arg-type]
11292-
)
11312+
valid_method = {"single", "table"}
11313+
if method not in valid_method:
11314+
raise ValueError(
11315+
f"Invalid method: {method}. Method must be in {valid_method}."
11316+
)
11317+
if method == "single":
11318+
# error: Argument "qs" to "quantile" of "BlockManager" has incompatible type
11319+
# "Index"; expected "Float64Index"
11320+
res = data._mgr.quantile(
11321+
qs=q, axis=1, interpolation=interpolation # type: ignore[arg-type]
11322+
)
11323+
elif method == "table":
11324+
valid_interpolation = {"nearest", "lower", "higher"}
11325+
if interpolation not in valid_interpolation:
11326+
raise ValueError(
11327+
f"Invalid interpolation: {interpolation}. "
11328+
f"Interpolation must be in {valid_interpolation}"
11329+
)
11330+
# handle degenerate case
11331+
if len(data) == 0:
11332+
if data.ndim == 2:
11333+
dtype = find_common_type(list(self.dtypes))
11334+
else:
11335+
dtype = self.dtype
11336+
return self._constructor([], index=q, columns=data.columns, dtype=dtype)
11337+
11338+
q_idx = np.quantile( # type: ignore[call-overload]
11339+
np.arange(len(data)), q, **{np_percentile_argname: interpolation}
11340+
)
11341+
11342+
by = data.columns
11343+
if len(by) > 1:
11344+
keys = [data._get_label_or_level_values(x) for x in by]
11345+
indexer = lexsort_indexer(keys)
11346+
else:
11347+
by = by[0]
11348+
k = data._get_label_or_level_values(by) # type: ignore[arg-type]
11349+
indexer = nargsort(k)
11350+
11351+
res = data._mgr.take(indexer[q_idx], verify=False)
11352+
res.axes[1] = q
1129311353

1129411354
result = self._constructor(res)
1129511355
return result.__finalize__(self, method="quantile")

0 commit comments

Comments
 (0)