Skip to content

Parametrized NA sentinel for factorize #20473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 36 additions & 15 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,13 @@ cdef class HashTable:

{{py:

# name, dtype, null_condition, float_group
dtypes = [('Float64', 'float64', 'val != val', True),
('UInt64', 'uint64', 'False', False),
('Int64', 'int64', 'val == iNaT', False)]
# name, dtype, float_group, default_na_value
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when is float_group used? seems superfluous?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to be used in unique.

dtypes = [('Float64', 'float64', True, 'nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'iNaT')]

def get_dispatch(dtypes):
for (name, dtype, null_condition, float_group) in dtypes:
for (name, dtype, float_group, default_na_value) in dtypes:
unique_template = """\
cdef:
Py_ssize_t i, n = len(values)
Expand Down Expand Up @@ -298,13 +298,13 @@ def get_dispatch(dtypes):
return uniques.to_array()
"""

unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group)

yield (name, dtype, null_condition, float_group, unique_template)
yield (name, dtype, float_group, default_na_value, unique_template)
}}


{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}}

cdef class {{name}}HashTable(HashTable):

Expand Down Expand Up @@ -408,24 +408,37 @@ cdef class {{name}}HashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True):
bint check_null=True,
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
{{dtype}}_t val
{{dtype}}_t val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value

labels = np.empty(n, dtype=np.int64)
ud = uniques.data
use_na_value = na_value is not None

if use_na_value:
# We need this na_value2 because we want to allow users
# to *optionally* specify an NA sentinel *of the correct* type.
# We use None, to make it optional, which requires `object` type
# for the parameter. To please the compiler, we use na_value2,
# which is only used if it's *specified*.
na_value2 = <{{dtype}}_t>na_value
else:
na_value2 = {{default_na_value}}

with nogil:
for i in range(n):
val = values[i]

if check_null and {{null_condition}}:
if check_null and (val != val or val == na_value2):
labels[i] = na_sentinel
continue

Expand Down Expand Up @@ -695,7 +708,8 @@ cdef class StringHashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=1):
bint check_null=1,
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -706,18 +720,21 @@ cdef class StringHashTable(HashTable):
char *v
char **vecs
khiter_t k
bint use_na_value

# these by-definition *must* be strings
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

# pre-filter out missing
# and assign pointers
vecs = <char **> malloc(n * sizeof(char *))
for i in range(n):
val = values[i]

if PyUnicode_Check(val) or PyString_Check(val):
if ((PyUnicode_Check(val) or PyString_Check(val)) and
not (use_na_value and val == na_value)):
v = util.get_c_string(val)
vecs[i] = v
else:
Expand Down Expand Up @@ -868,22 +885,26 @@ cdef class PyObjectHashTable(HashTable):

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
bint check_null=True,
object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
object val
khiter_t k
bint use_na_value

labels = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

for i in range(n):
val = values[i]
hash(val)

if check_null and val != val or val is None:
if ((check_null and val != val or val is None) or
(use_na_value and val == na_value)):
labels[i] = na_sentinel
continue

Expand Down
17 changes: 14 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,8 @@ def isin(comps, values):
return f(comps, values)


def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None,
na_value=None):
"""Factorize an array-like to labels and uniques.

This doesn't do any coercion of types or unboxing before factorization.
Expand All @@ -445,19 +446,27 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
values : ndarray
check_nulls : bool
Whether to check for nulls in the hashtable's 'get_labels' method.
Nulls are always checked when `na_value` is specified.
Copy link
Contributor

@jreback jreback Mar 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you are going to add this kwarg (which is ok), then remove check_nulls which is redundant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought I posted a comment about that, but I guess I deleted it.

AFAICT check_nulls is only useful for DatetimeIndex, so that these two are different?

In [2]: pd.factorize([2**-63, 0])
Out[2]: (array([0, 1]), array([1.08420217e-19, 0.00000000e+00]))

In [3]: pd.factorize(pd.DatetimeIndex([None, 0]))
Out[3]:
(array([-1,  0]),
 DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq=None))

Once DatetimeIndex is an ExtensionArray, pd.factorize will dispatch to DatetimeIndex.factorize, which can call _factorize_array(..., na_value=iNaT). I think I can do this now, it just makes the logic a bit ugly. Will give it a shot.

Copy link
Contributor Author

@TomAugspurger TomAugspurger Mar 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, may not be able to remove check_nulls entirely. I needed it for factorizing boolean arrays. might still be able to... Anyway, this is what I had in mind:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bf192cdb2..21ec17054 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -511,7 +511,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     values = _ensure_arraylike(values)
     original = values
 
-    if is_categorical_dtype(values):
+    if is_categorical_dtype(values) or is_datetime64_any_dtype(values):
         values = getattr(values, '_values', values)
         labels, uniques = values.factorize()
         dtype = original.dtype
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index b906ea0f4..bd33828f2 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -984,6 +984,13 @@ class DatetimeIndexOpsMixin(object):
 
         return algorithms.isin(self.asi8, values.asi8)
 
+    def factorize(self, sort=False, na_sentinel=-1):
+        from pandas.core.algorithms import _factorize_array
+        l, u = _factorize_array(self, True, na_sentinel=na_sentinel,
+                                na_value=iNaT)
+        u = self._constructor(u)
+        return l, u
+
     def shift(self, n, freq=None):
         """
         Specialized shift which produces a DatetimeIndex

Will have to fix it to check for timedeltaindex too, handle sorting, but not too bad. Worth doing here, or a separate PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Will be able to remove check_nulls entirely without too much effort.

na_sentinel : int, default -1
size_hint : int, optional
Passed through to the hashtable's 'get_labels' method
na_value : object, optional
A value in `values` to consider missing. Note: only use this
parameter when you know that you don't have any values pandas would
consider missing in the array (NaN for float data, iNaT for
datetimes, etc.).

Returns
-------
labels, uniques : ndarray
"""
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
check_nulls = check_nulls or na_value is not None

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls,
na_value=na_value)

labels = _ensure_platform_int(labels)
uniques = uniques.to_array()
Expand Down Expand Up @@ -508,7 +517,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
dtype = original.dtype
else:
values, dtype, _ = _ensure_data(values)
check_nulls = not is_integer_dtype(original)
check_nulls = (not is_integer_dtype(original) and
not is_bool_dtype(original))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use pandas.core.dtypes.missing.na_value_for_dtype (or maybe add a kwarg to return the underlying value). just want to try to keep this logic in 1 place.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know why na_value_for_dtype(PeriodDtype) is nan? It should be NaT, right?

labels, uniques = _factorize_array(values, check_nulls,
na_sentinel=na_sentinel,
size_hint=size_hint)
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pandas import compat
from pandas.compat import u, lzip
from pandas._libs import lib, algos as libalgos
from pandas._libs.tslib import iNaT

from pandas.core.dtypes.generic import (
ABCSeries, ABCIndexClass, ABCCategoricalIndex)
Expand Down Expand Up @@ -2163,11 +2162,11 @@ def factorize(self, na_sentinel=-1):
from pandas.core.algorithms import _factorize_array

codes = self.codes.astype('int64')
codes[codes == -1] = iNaT
# We set missing codes, normally -1, to iNaT so that the
# Int64HashTable treats them as missing values.
labels, uniques = _factorize_array(codes, check_nulls=True,
na_sentinel=na_sentinel)
na_sentinel=na_sentinel,
na_value=-1)
uniques = self._constructor(self.categories.take(uniques),
categories=self.categories,
ordered=self.ordered)
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,37 @@ def test_deprecate_order(self):
with tm.assert_produces_warning(False):
algos.factorize(data)

@pytest.mark.parametrize('data', [
    np.array([0, 1, 0], dtype='u8'),
    np.array([-2**63, 1, -2**63], dtype='i8'),
    np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
])
def test_parametrized_factorize_na_value_default(self, data):
    # Each array contains a value that matches the type's default NA
    # sentinel, but no ``na_value`` is requested, so factorize must
    # treat every element as a regular (non-missing) value.
    labels, uniques = algos.factorize(data)
    tm.assert_numpy_array_equal(labels,
                                np.array([0, 1, 0], dtype='i8'))
    tm.assert_numpy_array_equal(uniques, data[[0, 1]])

@pytest.mark.parametrize('data, na_value', [
    (np.array([0, 1, 0, 2], dtype='u8'), 0),
    (np.array([1, 0, 1, 2], dtype='u8'), 1),
    (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
    (np.array([1, -2**63, 1, 0], dtype='i8'), 1),
    (np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
    (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
    (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
     ('a', 1)),
])
def test_parametrized_factorize_na_value(self, data, na_value):
    # Elements equal to ``na_value`` must be assigned the NA sentinel
    # (-1) and must not appear in the uniques; positions 1 and 3 hold
    # the only genuine values in every parametrization.
    labels, uniques = algos._factorize_array(data, check_nulls=True,
                                             na_value=na_value)
    tm.assert_numpy_array_equal(labels,
                                np.array([-1, 0, -1, 1], dtype='i8'))
    tm.assert_numpy_array_equal(uniques, data[[1, 3]])


class TestUnique(object):

Expand Down