Skip to content

Commit 92524f5

Browse files
TomAugspurger authored and jreback committed
ENH: ExtensionArray.fillna (#19909)
1 parent 3561580 commit 92524f5

File tree

13 files changed

+222
-79
lines changed

13 files changed

+222
-79
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,7 @@ Categorical
844844
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
845845
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`)
846846
- Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19565`)
847+
- Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`)
847848
- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
848849
- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
849850

pandas/core/arrays/base.py

+54
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,59 @@ def isna(self):
236236
"""
237237
raise AbstractMethodError(self)
238238

239+
def fillna(self, value=None, method=None, limit=None):
240+
""" Fill NA/NaN values using the specified method.
241+
242+
Parameters
243+
----------
244+
value : scalar, array-like
245+
If a scalar value is passed it is used to fill all missing values.
246+
Alternatively, an array-like 'value' can be given. It's expected
247+
that the array-like have the same length as 'self'.
248+
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
249+
Method to use for filling holes in reindexed Series
250+
pad / ffill: propagate last valid observation forward to next valid
251+
backfill / bfill: use NEXT valid observation to fill gap
252+
limit : int, default None
253+
If method is specified, this is the maximum number of consecutive
254+
NaN values to forward/backward fill. In other words, if there is
255+
a gap with more than this number of consecutive NaNs, it will only
256+
be partially filled. If method is not specified, this is the
257+
maximum number of entries along the entire axis where NaNs will be
258+
filled.
259+
260+
Returns
261+
-------
262+
filled : ExtensionArray with NA/NaN filled
263+
"""
264+
from pandas.api.types import is_scalar
265+
from pandas.util._validators import validate_fillna_kwargs
266+
from pandas.core.missing import pad_1d, backfill_1d
267+
268+
value, method = validate_fillna_kwargs(value, method)
269+
270+
mask = self.isna()
271+
272+
if not is_scalar(value):
273+
if len(value) != len(self):
274+
raise ValueError("Length of 'value' does not match. Got ({}) "
275+
" expected {}".format(len(value), len(self)))
276+
value = value[mask]
277+
278+
if mask.any():
279+
if method is not None:
280+
func = pad_1d if method == 'pad' else backfill_1d
281+
new_values = func(self.astype(object), limit=limit,
282+
mask=mask)
283+
new_values = self._constructor_from_sequence(new_values)
284+
else:
285+
# fill with value
286+
new_values = self.copy()
287+
new_values[mask] = value
288+
else:
289+
new_values = self.copy()
290+
return new_values
291+
239292
def unique(self):
240293
"""Compute the ExtensionArray of unique values.
241294
@@ -285,6 +338,7 @@ def take(self, indexer, allow_fill=True, fill_value=None):
285338
.. code-block:: python
286339
287340
def take(self, indexer, allow_fill=True, fill_value=None):
341+
indexer = np.asarray(indexer)
288342
mask = indexer == -1
289343
result = self.data.take(indexer)
290344
result[mask] = np.nan # NA for this type

pandas/core/arrays/categorical.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -480,9 +480,7 @@ def tolist(self):
480480
(for str, int, float) or a pandas scalar
481481
(for Timestamp/Timedelta/Interval/Period)
482482
"""
483-
if is_datetimelike(self.categories):
484-
return [com._maybe_box_datetimelike(x) for x in self]
485-
return np.array(self).tolist()
483+
return list(self)
486484

487485
@property
488486
def base(self):
@@ -1581,16 +1579,16 @@ def fillna(self, value=None, method=None, limit=None):
15811579
15821580
Parameters
15831581
----------
1584-
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
1585-
Method to use for filling holes in reindexed Series
1586-
pad / ffill: propagate last valid observation forward to next valid
1587-
backfill / bfill: use NEXT valid observation to fill gap
15881582
value : scalar, dict, Series
15891583
If a scalar value is passed it is used to fill all missing values.
15901584
Alternatively, a Series or dict can be used to fill in different
15911585
values for each index. The value should not be a list. The
15921586
value(s) passed should either be in the categories or should be
15931587
NaN.
1588+
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
1589+
Method to use for filling holes in reindexed Series
1590+
pad / ffill: propagate last valid observation forward to next valid
1591+
backfill / bfill: use NEXT valid observation to fill gap
15941592
limit : int, default None
15951593
(Not implemented yet for Categorical!)
15961594
If method is specified, this is the maximum number of consecutive
@@ -1698,7 +1696,7 @@ def __len__(self):
16981696

16991697
def __iter__(self):
17001698
"""Returns an Iterator over the values of this Categorical."""
1701-
return iter(self.get_values())
1699+
return iter(self.get_values().tolist())
17021700

17031701
def _tidy_repr(self, max_vals=10, footer=True):
17041702
""" a short repr displaying only max_vals and an optional (but default

pandas/core/base.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
from pandas.core.dtypes.missing import isna
1010
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
1111
from pandas.core.dtypes.common import (
12+
is_datetimelike,
1213
is_object_dtype,
1314
is_list_like,
1415
is_scalar,
15-
is_datetimelike,
1616
is_extension_type,
1717
is_extension_array_dtype)
1818

@@ -883,9 +883,10 @@ def tolist(self):
883883
--------
884884
numpy.ndarray.tolist
885885
"""
886-
887-
if is_datetimelike(self):
886+
if is_datetimelike(self._values):
888887
return [com._maybe_box_datetimelike(x) for x in self._values]
888+
elif is_extension_array_dtype(self._values):
889+
return list(self._values)
889890
else:
890891
return self._values.tolist()
891892

pandas/core/dtypes/cast.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
from .common import (_ensure_object, is_bool, is_integer, is_float,
1212
is_complex, is_datetimetz, is_categorical_dtype,
1313
is_datetimelike,
14-
is_extension_type, is_object_dtype,
14+
is_extension_type,
15+
is_object_dtype,
1516
is_datetime64tz_dtype, is_datetime64_dtype,
1617
is_datetime64_ns_dtype,
1718
is_timedelta64_dtype, is_timedelta64_ns_dtype,

pandas/core/internals.py

+17-21
Original file line numberDiff line numberDiff line change
@@ -1965,6 +1965,23 @@ def concat_same_type(self, to_concat, placement=None):
19651965
return self.make_block_same_class(values, ndim=self.ndim,
19661966
placement=placement)
19671967

1968+
def fillna(self, value, limit=None, inplace=False, downcast=None,
1969+
mgr=None):
1970+
values = self.values if inplace else self.values.copy()
1971+
values = values.fillna(value=value, limit=limit)
1972+
return [self.make_block_same_class(values=values,
1973+
placement=self.mgr_locs,
1974+
ndim=self.ndim)]
1975+
1976+
def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
1977+
fill_value=None, **kwargs):
1978+
1979+
values = self.values if inplace else self.values.copy()
1980+
return self.make_block_same_class(
1981+
values=values.fillna(value=fill_value, method=method,
1982+
limit=limit),
1983+
placement=self.mgr_locs)
1984+
19681985

19691986
class NumericBlock(Block):
19701987
__slots__ = ()
@@ -2524,27 +2541,6 @@ def _try_coerce_result(self, result):
25242541

25252542
return result
25262543

2527-
def fillna(self, value, limit=None, inplace=False, downcast=None,
2528-
mgr=None):
2529-
# we may need to upcast our fill to match our dtype
2530-
if limit is not None:
2531-
raise NotImplementedError("specifying a limit for 'fillna' has "
2532-
"not been implemented yet")
2533-
2534-
values = self.values if inplace else self.values.copy()
2535-
values = self._try_coerce_result(values.fillna(value=value,
2536-
limit=limit))
2537-
return [self.make_block(values=values)]
2538-
2539-
def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
2540-
fill_value=None, **kwargs):
2541-
2542-
values = self.values if inplace else self.values.copy()
2543-
return self.make_block_same_class(
2544-
values=values.fillna(fill_value=fill_value, method=method,
2545-
limit=limit),
2546-
placement=self.mgr_locs)
2547-
25482544
def shift(self, periods, axis=0, mgr=None):
25492545
return self.make_block_same_class(values=self.values.shift(periods),
25502546
placement=self.mgr_locs)

pandas/tests/categorical/test_dtypes.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# -*- coding: utf-8 -*-
2-
32
import pytest
43

54
import numpy as np
65

76
import pandas.util.testing as tm
87
from pandas.core.dtypes.dtypes import CategoricalDtype
9-
from pandas import Categorical, Index, CategoricalIndex, Series
8+
from pandas.compat import long
9+
from pandas import Categorical, Index, CategoricalIndex, Series, Timestamp
1010

1111

1212
class TestCategoricalDtypes(object):
@@ -161,3 +161,16 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
161161
result = cat.astype('category')
162162
expected = cat
163163
tm.assert_categorical_equal(result, expected)
164+
165+
def test_iter_python_types(self):
166+
# GH-19909
167+
# TODO(Py2): Remove long
168+
cat = Categorical([1, 2])
169+
assert isinstance(list(cat)[0], (int, long))
170+
assert isinstance(cat.tolist()[0], (int, long))
171+
172+
def test_iter_python_types_datetime(self):
173+
cat = Categorical([Timestamp('2017-01-01'),
174+
Timestamp('2017-01-02')])
175+
assert isinstance(list(cat)[0], Timestamp)
176+
assert isinstance(cat.tolist()[0], Timestamp)

pandas/tests/extension/base/casting.py

+5
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,8 @@ def test_astype_object_series(self, all_data):
1111
ser = pd.Series({"A": all_data})
1212
result = ser.astype(object)
1313
assert isinstance(result._data.blocks[0], ObjectBlock)
14+
15+
def test_tolist(self, data):
16+
result = pd.Series(data).tolist()
17+
expected = list(data)
18+
assert result == expected

pandas/tests/extension/base/missing.py

+69
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
import pytest
23

34
import pandas as pd
45
import pandas.util.testing as tm
@@ -45,3 +46,71 @@ def test_dropna_frame(self, data_missing):
4546
result = df.dropna()
4647
expected = df.iloc[:0]
4748
self.assert_frame_equal(result, expected)
49+
50+
def test_fillna_limit_pad(self, data_missing):
51+
arr = data_missing.take([1, 0, 0, 0, 1])
52+
result = pd.Series(arr).fillna(method='ffill', limit=2)
53+
expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
54+
self.assert_series_equal(result, expected)
55+
56+
def test_fillna_limit_backfill(self, data_missing):
57+
arr = data_missing.take([1, 0, 0, 0, 1])
58+
result = pd.Series(arr).fillna(method='backfill', limit=2)
59+
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
60+
self.assert_series_equal(result, expected)
61+
62+
def test_fillna_series(self, data_missing):
63+
fill_value = data_missing[1]
64+
ser = pd.Series(data_missing)
65+
66+
result = ser.fillna(fill_value)
67+
expected = pd.Series(type(data_missing)([fill_value, fill_value]))
68+
self.assert_series_equal(result, expected)
69+
70+
# Fill with a series
71+
result = ser.fillna(expected)
72+
self.assert_series_equal(result, expected)
73+
74+
# Fill with a series not affecting the missing values
75+
result = ser.fillna(ser)
76+
self.assert_series_equal(result, ser)
77+
78+
@pytest.mark.parametrize('method', ['ffill', 'bfill'])
79+
def test_fillna_series_method(self, data_missing, method):
80+
fill_value = data_missing[1]
81+
82+
if method == 'ffill':
83+
data_missing = type(data_missing)(data_missing[::-1])
84+
85+
result = pd.Series(data_missing).fillna(method=method)
86+
expected = pd.Series(type(data_missing)([fill_value, fill_value]))
87+
88+
self.assert_series_equal(result, expected)
89+
90+
def test_fillna_frame(self, data_missing):
91+
fill_value = data_missing[1]
92+
93+
result = pd.DataFrame({
94+
"A": data_missing,
95+
"B": [1, 2]
96+
}).fillna(fill_value)
97+
98+
expected = pd.DataFrame({
99+
"A": type(data_missing)([fill_value, fill_value]),
100+
"B": [1, 2],
101+
})
102+
103+
self.assert_frame_equal(result, expected)
104+
105+
def test_fillna_fill_other(self, data):
106+
result = pd.DataFrame({
107+
"A": data,
108+
"B": [np.nan] * len(data)
109+
}).fillna({"B": 0.0})
110+
111+
expected = pd.DataFrame({
112+
"A": data,
113+
"B": [0.0] * len(result),
114+
})
115+
116+
self.assert_frame_equal(result, expected)

pandas/tests/extension/category/test_categorical.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,14 @@ def test_getitem_scalar(self):
6969

7070

7171
class TestMissing(base.BaseMissingTests):
72-
pass
72+
73+
@pytest.mark.skip(reason="Not implemented")
74+
def test_fillna_limit_pad(self):
75+
pass
76+
77+
@pytest.mark.skip(reason="Not implemented")
78+
def test_fillna_limit_backfill(self):
79+
pass
7380

7481

7582
class TestMethods(base.BaseMethodsTests):

pandas/tests/extension/decimal/array.py

+1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def isna(self):
7171
return np.array([x.is_nan() for x in self.values])
7272

7373
def take(self, indexer, allow_fill=True, fill_value=None):
74+
indexer = np.asarray(indexer)
7475
mask = indexer == -1
7576

7677
indexer = _ensure_platform_int(indexer)

0 commit comments

Comments
 (0)