pandas-dev · jreback · Mar 27, 2018 · Mar 23, 2018 · Mar 23, 2018 · Mar 23, 2018
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -250,13 +250,13 @@ cdef class HashTable:
 
 {{py:
 
-# name, dtype, null_condition, float_group
-dtypes = [('Float64', 'float64', 'val != val', True),
-          ('UInt64', 'uint64', 'False', False),
-          ('Int64', 'int64', 'val == iNaT', False)]
+# name, dtype, float_group, default_na_value
+dtypes = [('Float64', 'float64', True, 'nan'),
+          ('UInt64', 'uint64', False, 0),
+          ('Int64', 'int64', False, 'iNaT')]
 
 def get_dispatch(dtypes):
-  for (name, dtype, null_condition, float_group) in dtypes:
+  for (name, dtype, float_group, default_na_value) in dtypes:
     unique_template = """\
         cdef:
            Py_ssize_t i, n = len(values)
@@ -298,13 +298,13 @@ def get_dispatch(dtypes):
         return uniques.to_array()
       """
 
-    unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
+    unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group)
 
-    yield (name, dtype, null_condition, float_group, unique_template)
+    yield (name, dtype, float_group, default_na_value, unique_template)
 }}
 
 
-{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
+{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}}
 
 cdef class {{name}}HashTable(HashTable):
 
@@ -408,24 +408,36 @@ cdef class {{name}}HashTable(HashTable):
     @cython.boundscheck(False)
     def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   bint check_null=True):
+                   object na_value=None):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
             Py_ssize_t idx, count = count_prior
             int ret = 0
-            {{dtype}}_t val
+            {{dtype}}_t val, na_value2
             khiter_t k
             {{name}}VectorData *ud
+            bint use_na_value
 
         labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
+        use_na_value = na_value is not None
+
+        if use_na_value:
+            # We need na_value2 because we want to allow users to
+            # *optionally* specify an NA value *of the correct type*.
+            # Using None as the default makes it optional, but that requires
+            # the parameter to be typed `object`. To please the compiler, we
+            # cast it to the typed na_value2, which is only read if specified.
+            na_value2 = <{{dtype}}_t>na_value
+        else:
+            na_value2 = {{default_na_value}}
 
         with nogil:
             for i in range(n):
                 val = values[i]
 
-                if check_null and {{null_condition}}:
+                if val != val or (use_na_value and val == na_value2):
                     labels[i] = na_sentinel
                     continue
 
@@ -695,7 +707,7 @@ cdef class StringHashTable(HashTable):
     @cython.boundscheck(False)
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=1):
+                   object na_value=None):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -706,18 +718,21 @@ cdef class StringHashTable(HashTable):
             char *v
             char **vecs
             khiter_t k
+            bint use_na_value
 
         # these by-definition *must* be strings
         labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
+        use_na_value = na_value is not None
 
         # pre-filter out missing
         # and assign pointers
         vecs = <char **> malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]
 
-            if PyUnicode_Check(val) or PyString_Check(val):
+            if ((PyUnicode_Check(val) or PyString_Check(val)) and
+                    not (use_na_value and val == na_value)):
                 v = util.get_c_string(val)
                 vecs[i] = v
             else:
@@ -868,22 +883,25 @@ cdef class PyObjectHashTable(HashTable):
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
+                   object na_value=None):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
             Py_ssize_t idx, count = count_prior
             int ret = 0
             object val
             khiter_t k
+            bint use_na_value
 
         labels = np.empty(n, dtype=np.int64)
+        use_na_value = na_value is not None
 
         for i in range(n):
             val = values[i]
             hash(val)
 
-            if check_null and val != val or val is None:
+            if ((val != val or val is None) or
+                    (use_na_value and val == na_value)):
                 labels[i] = na_sentinel
                 continue
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -435,19 +435,23 @@ def isin(comps, values):
     return f(comps, values)
 
 
-def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
+def _factorize_array(values, na_sentinel=-1, size_hint=None,
+                     na_value=None):
     """Factorize an array-like to labels and uniques.
 
     This doesn't do any coercion of types or unboxing before factorization.
 
     Parameters
     ----------
     values : ndarray
-    check_nulls : bool
-        Whether to check for nulls in the hashtable's 'get_labels' method.
     na_sentinel : int, default -1
     size_hint : int, optional
         Passsed through to the hashtable's 'get_labels' method
+    na_value : object, optional
+        A value in `values` to consider missing. Note: only use this
+        parameter when you know that you don't have any values pandas would
+        consider missing in the array (NaN for float data, iNaT for
+        datetimes, etc.).
 
     Returns
     -------
@@ -457,7 +461,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
 
     table = hash_klass(size_hint or len(values))
     uniques = vec_klass()
-    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
+    labels = table.get_labels(values, uniques, 0, na_sentinel,
+                              na_value=na_value)
 
     labels = _ensure_platform_int(labels)
     uniques = uniques.to_array()
@@ -508,10 +513,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         dtype = original.dtype
     else:
         values, dtype, _ = _ensure_data(values)
-        check_nulls = not is_integer_dtype(original)
-        labels, uniques = _factorize_array(values, check_nulls,
+
+        if (is_datetime64_any_dtype(original) or
+                is_timedelta64_dtype(original)):
+            na_value = iNaT
+        else:
+            na_value = None
+
+        labels, uniques = _factorize_array(values,
                                            na_sentinel=na_sentinel,
-                                           size_hint=size_hint)
+                                           size_hint=size_hint,
+                                           na_value=na_value)
 
     if sort and len(uniques) > 0:
         from pandas.core.sorting import safe_sort

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -7,7 +7,6 @@
 from pandas import compat
 from pandas.compat import u, lzip
 from pandas._libs import lib, algos as libalgos
-from pandas._libs.tslib import iNaT
 
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCIndexClass, ABCCategoricalIndex)
@@ -2163,11 +2162,10 @@ def factorize(self, na_sentinel=-1):
         from pandas.core.algorithms import _factorize_array
 
         codes = self.codes.astype('int64')
-        codes[codes == -1] = iNaT
         # We set missing codes, normally -1, to iNaT so that the
         # Int64HashTable treats them as missing values.
-        labels, uniques = _factorize_array(codes, check_nulls=True,
-                                           na_sentinel=na_sentinel)
+        labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel,
+                                           na_value=-1)
         uniques = self._constructor(self.categories.take(uniques),
                                     categories=self.categories,
                                     ordered=self.ordered)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -257,6 +257,36 @@ def test_deprecate_order(self):
         with tm.assert_produces_warning(False):
             algos.factorize(data)
 
+    @pytest.mark.parametrize('data', [
+        np.array([0, 1, 0], dtype='u8'),
+        np.array([-2**63, 1, -2**63], dtype='i8'),
+        np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
+    ])
+    def test_parametrized_factorize_na_value_default(self, data):
+        # data contains the type's default NA value, but na_value isn't passed
+        l, u = algos.factorize(data)
+        expected_uniques = data[[0, 1]]
+        expected_labels = np.array([0, 1, 0], dtype='i8')
+        tm.assert_numpy_array_equal(l, expected_labels)
+        tm.assert_numpy_array_equal(u, expected_uniques)
+
+    @pytest.mark.parametrize('data, na_value', [
+        (np.array([0, 1, 0, 2], dtype='u8'), 0),
+        (np.array([1, 0, 1, 2], dtype='u8'), 1),
+        (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
+        (np.array([1, -2**63, 1, 0], dtype='i8'), 1),
+        (np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
+        (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
+        (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
+         ('a', 1)),
+    ])
+    def test_parametrized_factorize_na_value(self, data, na_value):
+        l, u = algos._factorize_array(data, na_value=na_value)
+        expected_uniques = data[[1, 3]]
+        expected_labels = np.array([-1, 0, -1, 1], dtype='i8')
+        tm.assert_numpy_array_equal(l, expected_labels)
+        tm.assert_numpy_array_equal(u, expected_uniques)
+
 
 class TestUnique(object):