-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Parametrized NA sentinel for factorize #20473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
872c24a
3c18428
703ab8a
ab32e0f
62fa538
28fad50
8580754
cf14ee1
8141131
a23d451
b25f3d4
dfcda85
eaff342
c05c807
e786253
465d458
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -250,13 +250,13 @@ cdef class HashTable: | |
|
||
{{py: | ||
|
||
# name, dtype, null_condition, float_group | ||
dtypes = [('Float64', 'float64', 'val != val', True), | ||
('UInt64', 'uint64', 'False', False), | ||
('Int64', 'int64', 'val == iNaT', False)] | ||
# name, dtype, null_condition, float_group, default_na_value | ||
dtypes = [('Float64', 'float64', 'val != val', True, 'nan'), | ||
('UInt64', 'uint64', 'False', False, 0), | ||
('Int64', 'int64', 'val == iNaT', False, 'iNaT')] | ||
|
||
def get_dispatch(dtypes): | ||
for (name, dtype, null_condition, float_group) in dtypes: | ||
for (name, dtype, null_condition, float_group, default_na_value) in dtypes: | ||
unique_template = """\ | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
|
@@ -300,16 +300,19 @@ def get_dispatch(dtypes): | |
|
||
unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) | ||
|
||
yield (name, dtype, null_condition, float_group, unique_template) | ||
yield (name, dtype, null_condition, float_group, default_na_value, unique_template) | ||
}} | ||
|
||
|
||
{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} | ||
{{for name, dtype, null_condition, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} | ||
|
||
cdef class {{name}}HashTable(HashTable): | ||
|
||
def __cinit__(self, size_hint=1): | ||
def __cinit__(self, size_hint=1, {{dtype}}_t na_value={{default_na_value}}, | ||
bint use_na_value=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder, should this just be moved to |
||
self.table = kh_init_{{dtype}}() | ||
self.na_value = na_value | ||
self.use_na_value = use_na_value | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you even need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure :) Without
On master the labels I'll try that now to make sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm OK I see. I'll say this with the caveat that I haven't used factorize before, but does it even make sense to have a NA value for boolean types? Wouldn't the presence of NA force that conversion to float? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the best use-case to focus on is Categorical. In its codes, missing values are -1. So we'd like to pass an array of int64 to factorize, and have it think that -1 is NA. We avoid NaN for datetime / categorical for that reason: it would force them to float. For the Int64HashTable, we use iNaT (the smallest int) as the missing value sentinel. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah OK thanks - that clears things up a lot. Shouldn't the default missing value for the type then be -1 instead of 0? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For Int64 or UInt64? If it's UInt64 it'll have to be 0, since -1 is negative :) I need to better describe cdef f(int64_t x=None):
if x is not None:
... But in a way that makes cython happy. There's probably a way to do that... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point on the -1 with the unsigned int - you'll have to excuse me (it's Friday). So perhaps one thing to consider is instead of declaring x to be Kind of wordy so let me know if that's not clear There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index af60eb4b1..07bdad241 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -409,26 +409,32 @@ cdef class {{name}}HashTable(HashTable):
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True,
- {{dtype}}_t na_value={{default_na_value}},
- bint use_na_value=False):
+ object na_value=None):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Py_ssize_t idx, count = count_prior
int ret = 0
- {{dtype}}_t val
+ {{dtype}}_t val, na_value2
khiter_t k
{{name}}VectorData *ud
+ bint use_na_value
labels = np.empty(n, dtype=np.int64)
ud = uniques.data
+ use_na_value = na_value is not None
+
+ if use_na_value:
+ na_value2 = <{{dtype}}_t>na_value
+ else:
+ na_value2 = {{default_na_value}}
with nogil:
for i in range(n):
val = values[i]
if ((check_null and {{null_condition}}) or
- (use_na_value and val == na_value)):
+ (use_na_value and val == na_value2)):
labels[i] = na_sentinel
continue
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bbf60c2ee..e668b4748 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -456,16 +456,9 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None,
"""
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
- use_na_value = na_value is not None
- kwargs = dict(use_na_value=use_na_value)
-
- if use_na_value:
- kwargs['na_value'] = na_value
-
table = hash_klass(size_hint or len(values))
uniques = vec_klass()
- labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls,
- **kwargs)
+ labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, na_value=na_value)
labels = _ensure_platform_int(labels)
uniques = uniques.to_array()
seems to work. Is that what you had in mind? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to be a little more explicit something along the lines of cdef f(object x=None):
cdef:
{{template_type}} x_val
if x is not None:
x_val = <{{template_type}}> x
...
with nogil:
... Should still work because you aren't trying to access any Python objects within the nogil - the compiler should know that x_val is of type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Replied at about the same time but yes that's what I was thinking. Good to hear it works |
||
if size_hint is not None: | ||
kh_resize_{{dtype}}(self.table, size_hint) | ||
|
||
|
@@ -414,18 +417,22 @@ cdef class {{name}}HashTable(HashTable): | |
int64_t[:] labels | ||
Py_ssize_t idx, count = count_prior | ||
int ret = 0 | ||
{{dtype}}_t val | ||
{{dtype}}_t val, na_value | ||
khiter_t k | ||
{{name}}VectorData *ud | ||
bint use_na_value | ||
|
||
labels = np.empty(n, dtype=np.int64) | ||
ud = uniques.data | ||
na_value = self.na_value | ||
use_na_value = self.use_na_value | ||
|
||
with nogil: | ||
for i in range(n): | ||
val = values[i] | ||
|
||
if check_null and {{null_condition}}: | ||
if ((check_null and {{null_condition}}) or | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Related to above comment, is it not possible to do something like if (check_null and (val != val or val == na_value)): To still cover all of the scenarios? (borrowed the above logic from |
||
(use_na_value and val == na_value)): | ||
labels[i] = na_sentinel | ||
continue | ||
|
||
|
@@ -519,8 +526,11 @@ cdef class StringHashTable(HashTable): | |
# or a sentinel np.nan / None missing value | ||
na_string_sentinel = '__nan__' | ||
|
||
def __init__(self, int size_hint=1): | ||
def __init__(self, int size_hint=1, object na_value=na_string_sentinel, | ||
bint use_na_value=False): | ||
self.table = kh_init_str() | ||
self.na_value = na_value | ||
self.use_na_value = use_na_value | ||
if size_hint is not None: | ||
kh_resize_str(self.table, size_hint) | ||
|
||
|
@@ -706,18 +716,23 @@ cdef class StringHashTable(HashTable): | |
char *v | ||
char **vecs | ||
khiter_t k | ||
bint use_na_value | ||
|
||
# these by-definition *must* be strings | ||
labels = np.zeros(n, dtype=np.int64) | ||
uindexer = np.empty(n, dtype=np.int64) | ||
|
||
na_value = self.na_value | ||
use_na_value = self.use_na_value | ||
|
||
# pre-filter out missing | ||
# and assign pointers | ||
vecs = <char **> malloc(n * sizeof(char *)) | ||
for i in range(n): | ||
val = values[i] | ||
|
||
if PyUnicode_Check(val) or PyString_Check(val): | ||
if ((PyUnicode_Check(val) or PyString_Check(val)) and | ||
not (use_na_value and val == na_value)): | ||
v = util.get_c_string(val) | ||
vecs[i] = v | ||
else: | ||
|
@@ -753,8 +768,11 @@ na_sentinel = object | |
|
||
cdef class PyObjectHashTable(HashTable): | ||
|
||
def __init__(self, size_hint=1): | ||
def __init__(self, size_hint=1, object na_value=na_sentinel, | ||
bint use_na_value=False): | ||
self.table = kh_init_pymap() | ||
self.na_value = na_value | ||
self.use_na_value = use_na_value | ||
kh_resize_pymap(self.table, size_hint) | ||
|
||
def __dealloc__(self): | ||
|
@@ -876,14 +894,18 @@ cdef class PyObjectHashTable(HashTable): | |
int ret = 0 | ||
object val | ||
khiter_t k | ||
bint use_na_value | ||
|
||
labels = np.empty(n, dtype=np.int64) | ||
na_value = self.na_value | ||
use_na_value = self.use_na_value | ||
|
||
for i in range(n): | ||
val = values[i] | ||
hash(val) | ||
|
||
if check_null and val != val or val is None: | ||
if ((check_null and val != val or val is None) or | ||
(use_na_value and val == na_value)): | ||
labels[i] = na_sentinel | ||
continue | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -435,7 +435,8 @@ def isin(comps, values): | |
return f(comps, values) | ||
|
||
|
||
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | ||
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, | ||
na_value=None): | ||
"""Factorize an array-like to labels and uniques. | ||
|
||
This doesn't do any coercion of types or unboxing before factorization. | ||
|
@@ -455,7 +456,13 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | |
""" | ||
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) | ||
|
||
table = hash_klass(size_hint or len(values)) | ||
use_na_value = na_value is not None | ||
kwargs = dict(use_na_value=use_na_value) | ||
|
||
if use_na_value: | ||
kwargs['na_value'] = na_value | ||
|
||
table = hash_klass(size_hint or len(values), **kwargs) | ||
uniques = vec_klass() | ||
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) | ||
|
||
|
@@ -465,7 +472,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | |
|
||
|
||
@deprecate_kwarg(old_arg_name='order', new_arg_name=None) | ||
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | ||
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, | ||
na_value=None): | ||
""" | ||
Encode input values as an enumerated type or categorical variable | ||
|
||
|
@@ -479,6 +487,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
na_sentinel : int, default -1 | ||
Value to mark "not found" | ||
size_hint : hint to the hashtable sizer | ||
na_value : object, optional | ||
A value in `values` to consider missing. | ||
|
||
Returns | ||
------- | ||
|
@@ -509,9 +519,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
else: | ||
values, dtype, _ = _ensure_data(values) | ||
check_nulls = not is_integer_dtype(original) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know why |
||
labels, uniques = _factorize_array(values, check_nulls, | ||
na_sentinel=na_sentinel, | ||
size_hint=size_hint) | ||
size_hint=size_hint, | ||
na_value=na_value) | ||
|
||
if sort and len(uniques) > 0: | ||
from pandas.core.sorting import safe_sort | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I realize you didn't set this, but is the
uint64
track for bool hashing? If so, any reason why we don't useuint8
to save space?