Skip to content

Commit 5b15515

Browse files
authored
fix series.isin slow issue with Dtype IntegerArray (#38379)
1 parent fa09c6e commit 5b15515

File tree

6 files changed

+76
-27
lines changed

6 files changed

+76
-27
lines changed

Diff for: asv_bench/benchmarks/series_methods.py

+33-12
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,32 @@ def time_constructor(self, data):
2525

2626
class IsIn:
2727

28-
params = ["int64", "uint64", "object"]
28+
params = ["int64", "uint64", "object", "Int64"]
2929
param_names = ["dtype"]
3030

3131
def setup(self, dtype):
32-
self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
32+
N = 10000
33+
self.s = Series(np.random.randint(1, 10, N)).astype(dtype)
3334
self.values = [1, 2]
3435

3536
def time_isin(self, dtypes):
3637
self.s.isin(self.values)
3738

3839

40+
class IsInBoolean:
41+
42+
params = ["boolean", "bool"]
43+
param_names = ["dtype"]
44+
45+
def setup(self, dtype):
46+
N = 10000
47+
self.s = Series(np.random.randint(0, 2, N)).astype(dtype)
48+
self.values = [True, False]
49+
50+
def time_isin(self, dtypes):
51+
self.s.isin(self.values)
52+
53+
3954
class IsInDatetime64:
4055
def setup(self):
4156
dti = date_range(
@@ -59,21 +74,27 @@ def time_isin_empty(self):
5974

6075

6176
class IsInFloat64:
62-
def setup(self):
63-
self.small = Series([1, 2], dtype=np.float64)
64-
self.many_different_values = np.arange(10 ** 6, dtype=np.float64)
65-
self.few_different_values = np.zeros(10 ** 7, dtype=np.float64)
66-
self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64)
6777

68-
def time_isin_many_different(self):
78+
params = [np.float64, "Float64"]
79+
param_names = ["dtype"]
80+
81+
def setup(self, dtype):
82+
N_many = 10 ** 5
83+
N_few = 10 ** 6
84+
self.small = Series([1, 2], dtype=dtype)
85+
self.many_different_values = np.arange(N_many, dtype=np.float64)
86+
self.few_different_values = np.zeros(N_few, dtype=np.float64)
87+
self.only_nans_values = np.full(N_few, np.nan, dtype=np.float64)
88+
89+
def time_isin_many_different(self, dtypes):
6990
# runtime is dominated by creation of the lookup-table
7091
self.small.isin(self.many_different_values)
7192

72-
def time_isin_few_different(self):
93+
def time_isin_few_different(self, dtypes):
7394
# runtime is dominated by creation of the lookup-table
7495
self.small.isin(self.few_different_values)
7596

76-
def time_isin_nan_values(self):
97+
def time_isin_nan_values(self, dtypes):
7798
# runtime is dominated by creation of the lookup-table
7899
self.small.isin(self.few_different_values)
79100

@@ -114,7 +135,7 @@ def time_isin_long_series_long_values_floats(self):
114135

115136
class IsInLongSeriesLookUpDominates:
116137
params = [
117-
["int64", "int32", "float64", "float32", "object"],
138+
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
118139
[5, 1000],
119140
["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
120141
]
@@ -141,7 +162,7 @@ def time_isin(self, dtypes, MaxNumber, series_type):
141162

142163
class IsInLongSeriesValuesDominate:
143164
params = [
144-
["int64", "int32", "float64", "float32", "object"],
165+
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
145166
["random", "monotone"],
146167
]
147168
param_names = ["dtype", "series_type"]

Diff for: doc/source/reference/extensions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ objects.
4848
api.extensions.ExtensionArray.equals
4949
api.extensions.ExtensionArray.factorize
5050
api.extensions.ExtensionArray.fillna
51+
api.extensions.ExtensionArray.isin
5152
api.extensions.ExtensionArray.isna
5253
api.extensions.ExtensionArray.ravel
5354
api.extensions.ExtensionArray.repeat

Diff for: doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ Performance improvements
204204
~~~~~~~~~~~~~~~~~~~~~~~~
205205
- Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`)
206206
- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
207+
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
207208
-
208209

209210
.. ---------------------------------------------------------------------------

Diff for: pandas/core/algorithms.py

+8-13
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
is_float_dtype,
3737
is_integer,
3838
is_integer_dtype,
39-
is_interval_dtype,
4039
is_list_like,
4140
is_numeric_dtype,
4241
is_object_dtype,
@@ -68,7 +67,7 @@
6867

6968
if TYPE_CHECKING:
7069
from pandas import Categorical, DataFrame, Index, Series
71-
from pandas.core.arrays import DatetimeArray, IntervalArray, TimedeltaArray
70+
from pandas.core.arrays import DatetimeArray, TimedeltaArray
7271

7372
_shared_docs: Dict[str, str] = {}
7473

@@ -450,13 +449,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
450449

451450
comps = _ensure_arraylike(comps)
452451
comps = extract_array(comps, extract_numpy=True)
453-
if is_categorical_dtype(comps.dtype):
454-
# TODO(extension)
455-
# handle categoricals
456-
return cast("Categorical", comps).isin(values)
457-
458-
elif is_interval_dtype(comps.dtype):
459-
return cast("IntervalArray", comps).isin(values)
452+
if is_extension_array_dtype(comps.dtype):
453+
return comps.isin(values)
460454

461455
elif needs_i8_conversion(comps.dtype):
462456
# Dispatch to DatetimeLikeArrayMixin.isin
@@ -468,9 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
468462
elif needs_i8_conversion(values.dtype):
469463
return isin(comps, values.astype(object))
470464

471-
elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype(
472-
values.dtype
473-
):
465+
elif is_extension_array_dtype(values.dtype):
474466
return isin(np.asarray(comps), np.asarray(values))
475467

476468
# GH16012
@@ -481,7 +473,10 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
481473
# If the values include nan we need to check for nan explicitly
482474
# since np.nan is not equal to np.nan
483475
if isna(values).any():
484-
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
476+
477+
def f(c, v):
478+
return np.logical_or(np.in1d(c, v), np.isnan(c))
479+
485480
else:
486481
f = np.in1d
487482

Diff for: pandas/core/arrays/base.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
from pandas.core.dtypes.missing import isna
4646

4747
from pandas.core import ops
48-
from pandas.core.algorithms import factorize_array, unique
48+
from pandas.core.algorithms import factorize_array, isin, unique
4949
from pandas.core.missing import get_fill_func
5050
from pandas.core.sorting import nargminmax, nargsort
5151

@@ -78,6 +78,7 @@ class ExtensionArray:
7878
factorize
7979
fillna
8080
equals
81+
isin
8182
isna
8283
ravel
8384
repeat
@@ -852,6 +853,22 @@ def equals(self, other: object) -> bool:
852853
equal_na = self.isna() & other.isna()
853854
return bool((equal_values | equal_na).all())
854855

856+
def isin(self, values) -> np.ndarray:
857+
"""
858+
Pointwise comparison for set containment in the given values.
859+
860+
Roughly equivalent to `np.array([x in values for x in self])`
861+
862+
Parameters
863+
----------
864+
values : Sequence
865+
866+
Returns
867+
-------
868+
np.ndarray[bool]
869+
"""
870+
return isin(np.asarray(self), values)
871+
855872
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
856873
"""
857874
Return an array and missing value suitable for factorization.

Diff for: pandas/core/arrays/masked.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121
from pandas.core.dtypes.missing import isna, notna
2222

2323
from pandas.core import nanops
24-
from pandas.core.algorithms import factorize_array, take
24+
from pandas.core.algorithms import factorize_array, isin, take
2525
from pandas.core.array_algos import masked_reductions
2626
from pandas.core.arraylike import OpsMixin
2727
from pandas.core.arrays import ExtensionArray
2828
from pandas.core.indexers import check_array_indexer
2929

3030
if TYPE_CHECKING:
3131
from pandas import Series
32+
from pandas.core.arrays import BooleanArray
3233

3334

3435
BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
@@ -328,6 +329,19 @@ def take(
328329

329330
return type(self)(result, mask, copy=False)
330331

332+
def isin(self, values) -> BooleanArray:
333+
334+
from pandas.core.arrays import BooleanArray
335+
336+
result = isin(self._data, values)
337+
if self._hasna:
338+
if libmissing.NA in values:
339+
result += self._mask
340+
else:
341+
result *= np.invert(self._mask)
342+
mask = np.zeros_like(self, dtype=bool)
343+
return BooleanArray(result, mask, copy=False)
344+
331345
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
332346
data, mask = self._data, self._mask
333347
data = data.copy()

0 commit comments

Comments
 (0)