Skip to content

Parametrized NA sentinel for factorize #20473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,13 @@ cdef class HashTable:

{{py:

# name, dtype, null_condition, float_group
dtypes = [('Float64', 'float64', 'val != val', True),
('UInt64', 'uint64', 'False', False),
('Int64', 'int64', 'val == iNaT', False)]
# name, dtype, float_group, default_na_value
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when is float_group used? seems superfluous?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to be used in unique.

dtypes = [('Float64', 'float64', True, 'nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'iNaT')]

def get_dispatch(dtypes):
for (name, dtype, null_condition, float_group) in dtypes:
for (name, dtype, float_group, default_na_value) in dtypes:
unique_template = """\
cdef:
Py_ssize_t i, n = len(values)
Expand Down Expand Up @@ -298,13 +298,13 @@ def get_dispatch(dtypes):
return uniques.to_array()
"""

unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group)

yield (name, dtype, null_condition, float_group, unique_template)
yield (name, dtype, float_group, default_na_value, unique_template)
}}


{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}}

cdef class {{name}}HashTable(HashTable):

Expand Down Expand Up @@ -408,24 +408,36 @@ cdef class {{name}}HashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True):
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
{{dtype}}_t val
{{dtype}}_t val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value

labels = np.empty(n, dtype=np.int64)
ud = uniques.data
use_na_value = na_value is not None

if use_na_value:
# We need this na_value2 because we want to allow users
# to *optionally* specify an NA sentinel *of the correct* type.
# We use None, to make it optional, which requires `object` type
# for the parameter. To please the compiler, we use na_value2,
# which is only used if it's *specified*.
na_value2 = <{{dtype}}_t>na_value
else:
na_value2 = {{default_na_value}}

with nogil:
for i in range(n):
val = values[i]

if check_null and {{null_condition}}:
if val != val or (use_na_value and val == na_value2):
labels[i] = na_sentinel
continue

Expand Down Expand Up @@ -695,7 +707,7 @@ cdef class StringHashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=1):
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -706,18 +718,21 @@ cdef class StringHashTable(HashTable):
char *v
char **vecs
khiter_t k
bint use_na_value

# these by-definition *must* be strings
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

# pre-filter out missing
# and assign pointers
vecs = <char **> malloc(n * sizeof(char *))
for i in range(n):
val = values[i]

if PyUnicode_Check(val) or PyString_Check(val):
if ((PyUnicode_Check(val) or PyString_Check(val)) and
not (use_na_value and val == na_value)):
v = util.get_c_string(val)
vecs[i] = v
else:
Expand Down Expand Up @@ -868,22 +883,25 @@ cdef class PyObjectHashTable(HashTable):

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
object val
khiter_t k
bint use_na_value

labels = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

for i in range(n):
val = values[i]
hash(val)

if check_null and val != val or val is None:
if ((val != val or val is None) or
(use_na_value and val == na_value)):
labels[i] = na_sentinel
continue

Expand Down
26 changes: 19 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,19 +435,23 @@ def isin(comps, values):
return f(comps, values)


def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
def _factorize_array(values, na_sentinel=-1, size_hint=None,
na_value=None):
"""Factorize an array-like to labels and uniques.

This doesn't do any coercion of types or unboxing before factorization.

Parameters
----------
values : ndarray
check_nulls : bool
Whether to check for nulls in the hashtable's 'get_labels' method.
na_sentinel : int, default -1
size_hint : int, optional
Passsed through to the hashtable's 'get_labels' method
na_value : object, optional
A value in `values` to consider missing. Note: only use this
parameter when you know that you don't have any values pandas would
consider missing in the array (NaN for float data, iNaT for
datetimes, etc.).

Returns
-------
Expand All @@ -457,7 +461,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
labels = table.get_labels(values, uniques, 0, na_sentinel,
na_value=na_value)

labels = _ensure_platform_int(labels)
uniques = uniques.to_array()
Expand Down Expand Up @@ -508,10 +513,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
dtype = original.dtype
else:
values, dtype, _ = _ensure_data(values)
check_nulls = not is_integer_dtype(original)
labels, uniques = _factorize_array(values, check_nulls,

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use pandas.core.dtypes.missing.na_value_for_dtype (or maybe add a kwarg to return the underlying value). just want to try to keep this logic in 1 place.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know why na_value_for_dtype(PeriodDtype) is nan? It should be NaT, right?

if (is_datetime64_any_dtype(original) or
is_timedelta64_dtype(original)):
na_value = iNaT
else:
na_value = None

labels, uniques = _factorize_array(values,
na_sentinel=na_sentinel,
size_hint=size_hint)
size_hint=size_hint,
na_value=na_value)

if sort and len(uniques) > 0:
from pandas.core.sorting import safe_sort
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pandas import compat
from pandas.compat import u, lzip
from pandas._libs import lib, algos as libalgos
from pandas._libs.tslib import iNaT

from pandas.core.dtypes.generic import (
ABCSeries, ABCIndexClass, ABCCategoricalIndex)
Expand Down Expand Up @@ -2163,11 +2162,10 @@ def factorize(self, na_sentinel=-1):
from pandas.core.algorithms import _factorize_array

codes = self.codes.astype('int64')
codes[codes == -1] = iNaT
# We set missing codes, normally -1, to iNaT so that the
# Int64HashTable treats them as missing values.
labels, uniques = _factorize_array(codes, check_nulls=True,
na_sentinel=na_sentinel)
labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel,
na_value=-1)
uniques = self._constructor(self.categories.take(uniques),
categories=self.categories,
ordered=self.ordered)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,36 @@ def test_deprecate_order(self):
with tm.assert_produces_warning(False):
algos.factorize(data)

@pytest.mark.parametrize('data', [
np.array([0, 1, 0], dtype='u8'),
np.array([-2**63, 1, -2**63], dtype='i8'),
np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
])
def test_parametrized_factorize_na_value_default(self, data):
# arrays that include the NA default for that type, but isn't used.
l, u = algos.factorize(data)
expected_uniques = data[[0, 1]]
expected_labels = np.array([0, 1, 0], dtype='i8')
tm.assert_numpy_array_equal(l, expected_labels)
tm.assert_numpy_array_equal(u, expected_uniques)

@pytest.mark.parametrize('data, na_value', [
(np.array([0, 1, 0, 2], dtype='u8'), 0),
(np.array([1, 0, 1, 2], dtype='u8'), 1),
(np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
(np.array([1, -2**63, 1, 0], dtype='i8'), 1),
(np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
(np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
(np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
('a', 1)),
])
def test_parametrized_factorize_na_value(self, data, na_value):
l, u = algos._factorize_array(data, na_value=na_value)
expected_uniques = data[[1, 3]]
expected_labels = np.array([-1, 0, -1, 1], dtype='i8')
tm.assert_numpy_array_equal(l, expected_labels)
tm.assert_numpy_array_equal(u, expected_uniques)


class TestUnique(object):

Expand Down