-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Parametrized NA sentinel for factorize #20473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
872c24a
3c18428
703ab8a
ab32e0f
62fa538
28fad50
8580754
cf14ee1
8141131
a23d451
b25f3d4
dfcda85
eaff342
c05c807
e786253
465d458
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -435,7 +435,8 @@ def isin(comps, values): | |
return f(comps, values) | ||
|
||
|
||
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | ||
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, | ||
na_value=None): | ||
"""Factorize an array-like to labels and uniques. | ||
|
||
This doesn't do any coercion of types or unboxing before factorization. | ||
|
@@ -445,19 +446,27 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | |
values : ndarray | ||
check_nulls : bool | ||
Whether to check for nulls in the hashtable's 'get_labels' method. | ||
Nulls are always checked when `na_value` is specified. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. if you are going to add this kwarg (which is ok), then remove There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Thought I posted a comment about that, but I guess I deleted it. AFAICT In [2]: pd.factorize([2**-63, 0])
Out[2]: (array([0, 1]), array([1.08420217e-19, 0.00000000e+00]))
In [3]: pd.factorize(pd.DatetimeIndex([None, 0]))
Out[3]:
(array([-1, 0]),
DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq=None)) Once DatetimeIndex is an ExtensionArray, There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Ahh, may not be able to remove diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bf192cdb2..21ec17054 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -511,7 +511,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
values = _ensure_arraylike(values)
original = values
- if is_categorical_dtype(values):
+ if is_categorical_dtype(values) or is_datetime64_any_dtype(values):
values = getattr(values, '_values', values)
labels, uniques = values.factorize()
dtype = original.dtype
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index b906ea0f4..bd33828f2 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -984,6 +984,13 @@ class DatetimeIndexOpsMixin(object):
return algorithms.isin(self.asi8, values.asi8)
+ def factorize(self, sort=False, na_sentinel=-1):
+ from pandas.core.algorithms import _factorize_array
+ l, u = _factorize_array(self, True, na_sentinel=na_sentinel,
+ na_value=iNaT)
+ u = self._constructor(u)
+ return l, u
+
def shift(self, n, freq=None):
"""
Specialized shift which produces a DatetimeIndex Will have to fix it to check for timedeltaindex too, handle sorting, but not too bad. Worth doing here, or a separate PR? There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Got it. Will be able to remove |
||
na_sentinel : int, default -1 | ||
size_hint : int, optional | ||
Passed through to the hashtable's 'get_labels' method | ||
na_value : object, optional | ||
A value in `values` to consider missing. Note: only use this | ||
parameter when you know that you don't have any values pandas would | ||
consider missing in the array (NaN for float data, iNaT for | ||
datetimes, etc.). | ||
|
||
Returns | ||
------- | ||
labels, uniques : ndarray | ||
""" | ||
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) | ||
check_nulls = check_nulls or na_value is not None | ||
|
||
table = hash_klass(size_hint or len(values)) | ||
uniques = vec_klass() | ||
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) | ||
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, | ||
na_value=na_value) | ||
|
||
labels = _ensure_platform_int(labels) | ||
uniques = uniques.to_array() | ||
|
@@ -508,7 +517,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
dtype = original.dtype | ||
else: | ||
values, dtype, _ = _ensure_data(values) | ||
check_nulls = not is_integer_dtype(original) | ||
check_nulls = (not is_integer_dtype(original) and | ||
not is_bool_dtype(original)) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. use There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Do you know why |
||
labels, uniques = _factorize_array(values, check_nulls, | ||
na_sentinel=na_sentinel, | ||
size_hint=size_hint) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
when is float_group used? seems superfluous?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems to be used in
unique
.