Fixed factorize for MACArray (#13)

TomAugspurger · web-flow · commit b3152b0789a9 · 2018-03-27T12:58:22.000-05:00
* Fixed factorize for MACArray Relies on pandas-dev/pandas#19957 * Build on na_value * Include groupby patch
diff --git a/conda-recipes/pandas/0001-pandas.patch b/conda-recipes/pandas/0001-pandas.patch
@@ -0,0 +1,203 @@
+diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
+index 601acac20..7c89cab6b 100644
+--- a/pandas/core/groupby.py
++++ b/pandas/core/groupby.py
+@@ -44,7 +44,7 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
+                               DataError, SpecificationError)
+ from pandas.core.index import (Index, MultiIndex,
+                                CategoricalIndex, _ensure_index)
+-from pandas.core.arrays import Categorical
++from pandas.core.arrays import ExtensionArray, Categorical
+ from pandas.core.frame import DataFrame
+ from pandas.core.generic import NDFrame, _shared_docs
+ from pandas.core.internals import BlockManager, make_block
+@@ -2968,7 +2968,7 @@ class Grouping(object):
+ 
+             # no level passed
+             elif not isinstance(self.grouper,
+-                                (Series, Index, Categorical, np.ndarray)):
++                                (Series, Index, ExtensionArray, np.ndarray)):
+                 if getattr(self.grouper, 'ndim', 1) != 1:
+                     t = self.name or str(type(self.grouper))
+                     raise ValueError("Grouper for '%s' not 1-dimensional" % t)
+diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
+index 27c106efd..f8078d279 100644
+--- a/pandas/tests/extension/base/__init__.py
++++ b/pandas/tests/extension/base/__init__.py
+@@ -44,6 +44,7 @@ from .casting import BaseCastingTests  # noqa
+ from .constructors import BaseConstructorsTests  # noqa
+ from .dtype import BaseDtypeTests  # noqa
+ from .getitem import BaseGetitemTests  # noqa
++from .groupby import BaseGroupbyTests  # noqa
+ from .interface import BaseInterfaceTests  # noqa
+ from .methods import BaseMethodsTests  # noqa
+ from .missing import BaseMissingTests  # noqa
+diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
+new file mode 100644
+index 000000000..a29ef2a50
+--- /dev/null
++++ b/pandas/tests/extension/base/groupby.py
+@@ -0,0 +1,69 @@
++import pytest
++
++import pandas.util.testing as tm
++import pandas as pd
++from .base import BaseExtensionTests
++
++
++class BaseGroupbyTests(BaseExtensionTests):
++    """Groupby-specific tests."""
++
++    def test_grouping_grouper(self, data_for_grouping):
++        df = pd.DataFrame({
++            "A": ["B", "B", None, None, "A", "A", "B", "C"],
++            "B": data_for_grouping
++        })
++        gr1 = df.groupby("A").grouper.groupings[0]
++        gr2 = df.groupby("B").grouper.groupings[0]
++
++        tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
++        tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)
++
++    @pytest.mark.parametrize('as_index', [True, False])
++    def test_groupby_extension_agg(self, as_index, data_for_grouping):
++        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
++                           "B": data_for_grouping})
++        result = df.groupby("B", as_index=as_index).A.mean()
++        _, index = pd.factorize(data_for_grouping, sort=True)
++        # TODO(ExtensionIndex): remove astype
++        index = pd.Index(index.astype(object), name="B")
++        expected = pd.Series([3, 1, 4], index=index, name="A")
++        if as_index:
++            self.assert_series_equal(result, expected)
++        else:
++            expected = expected.reset_index()
++            self.assert_frame_equal(result, expected)
++
++    def test_groupby_extension_no_sort(self, data_for_grouping):
++        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
++                           "B": data_for_grouping})
++        result = df.groupby("B", sort=False).A.mean()
++        _, index = pd.factorize(data_for_grouping, sort=False)
++        # TODO(ExtensionIndex): remove astype
++        index = pd.Index(index.astype(object), name="B")
++        expected = pd.Series([1, 3, 4], index=index, name="A")
++        self.assert_series_equal(result, expected)
++
++    def test_groupby_extension_transform(self, data_for_grouping):
++        valid = data_for_grouping[~data_for_grouping.isna()]
++        df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4],
++                           "B": valid})
++
++        result = df.groupby("B").A.transform(len)
++        expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
++
++        self.assert_series_equal(result, expected)
++
++    @pytest.mark.parametrize('op', [
++        lambda x: 1,
++        lambda x: [1] * len(x),
++        lambda x: pd.Series([1] * len(x)),
++        lambda x: x,
++    ], ids=['scalar', 'list', 'series', 'object'])
++    def test_groupby_extension_apply(self, data_for_grouping, op):
++        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
++                           "B": data_for_grouping})
++        df.groupby("B").apply(op)
++        df.groupby("B").A.apply(op)
++        df.groupby("A").apply(op)
++        df.groupby("A").B.apply(op)
+diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
+index 22c1a67a0..d50917056 100644
+--- a/pandas/tests/extension/decimal/test_decimal.py
++++ b/pandas/tests/extension/decimal/test_decimal.py
+@@ -127,6 +127,10 @@ class TestCasting(BaseDecimal, base.BaseCastingTests):
+     pass
+ 
+ 
++class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
++    pass
++
++
+ def test_series_constructor_coerce_data_to_extension_dtype_raises():
+     xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the "
+            "extension array directly.")
+diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
+index 51a68a370..d9ae49d87 100644
+--- a/pandas/tests/extension/json/array.py
++++ b/pandas/tests/extension/json/array.py
+@@ -113,8 +113,8 @@ class JSONArray(ExtensionArray):
+         return cls(data)
+ 
+     def _values_for_factorize(self):
+-        frozen = tuple(tuple(x.items()) for x in self)
+-        return np.array(frozen, dtype=object), ()
++        frozen = self._values_for_argsort()
++        return frozen, ()
+ 
+     def _values_for_argsort(self):
+         # Disable NumPy's shape inference by including an empty tuple...
+diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
+index 63d97d5e7..5e9639c48 100644
+--- a/pandas/tests/extension/json/test_json.py
++++ b/pandas/tests/extension/json/test_json.py
+@@ -89,11 +89,12 @@ class TestMissing(base.BaseMissingTests):
+         """We treat dictionaries as a mapping in fillna, not a scalar."""
+ 
+ 
+-class TestMethods(base.BaseMethodsTests):
+-    unhashable = pytest.mark.skip(reason="Unhashable")
+-    unstable = pytest.mark.skipif(not PY36,  # 3.6 or higher
+-                                  reason="Dictionary order unstable")
++unhashable = pytest.mark.skip(reason="Unhashable")
++unstable = pytest.mark.skipif(not PY36,  # 3.6 or higher
++                              reason="Dictionary order unstable")
++
+ 
++class TestMethods(base.BaseMethodsTests):
+     @unhashable
+     def test_value_counts(self, all_data, dropna):
+         pass
+@@ -118,6 +119,7 @@ class TestMethods(base.BaseMethodsTests):
+         super(TestMethods, self).test_sort_values(
+             data_for_sorting, ascending)
+ 
++    @unstable
+     @pytest.mark.parametrize('ascending', [True, False])
+     def test_sort_values_missing(self, data_missing_for_sorting, ascending):
+         super(TestMethods, self).test_sort_values_missing(
+@@ -126,3 +128,34 @@ class TestMethods(base.BaseMethodsTests):
+ 
+ class TestCasting(base.BaseCastingTests):
+     pass
++
++
++class TestGroupby(base.BaseGroupbyTests):
++
++    @unhashable
++    def test_groupby_extension_transform(self):
++        """
++        This currently fails in Series.name.setter, since the
++        name must be hashable, but the value is a dictionary.
++        I think this is what we want, i.e. `.name` should be the original
++        values, and not the values for factorization.
++        """
++
++    @unhashable
++    def test_groupby_extension_apply(self):
++        """
++        This fails in Index._do_unique_check with
++
++        >   hash(val)
++        E   TypeError: unhashable type: 'UserDict' with
++
++        I suspect that once we support Index[ExtensionArray],
++        we'll be able to dispatch unique.
++        """
++
++    @unstable
++    @pytest.mark.parametrize('as_index', [True, False])
++    def test_groupby_extension_agg(self, as_index, data_for_grouping):
++        super(TestGroupby, self).test_groupby_extension_agg(
++            as_index, data_for_grouping
++        )
diff --git a/conda-recipes/pandas/meta.yaml b/conda-recipes/pandas/meta.yaml
@@ -7,6 +7,9 @@ build:
 
 source:
     git_url: https://github.com/pandas-dev/pandas
+    git_rev: 766a480
+    patches:
+      - 0001-pandas.patch
 
 requirements:
   build:
diff --git a/cyberpandas/base.py b/cyberpandas/base.py
@@ -2,11 +2,8 @@
 
 import numpy as np
 
-import pandas as pd
 from pandas.core.arrays import ExtensionArray
 
-from ._utils import refactorize
-
 
 class NumPyBackedExtensionArrayMixin(ExtensionArray):
     @property
@@ -18,6 +15,10 @@ def dtype(self):
     def _constructor_from_sequence(cls, scalars):
         return cls(scalars)
 
+    @classmethod
+    def _from_factorized(cls, values, original):
+        return cls(values)
+
     @property
     def shape(self):
         return (len(self.data),)
@@ -68,65 +69,3 @@ def unique(self):
         _, indices = np.unique(self.data, return_index=True)
         data = self.data.take(np.sort(indices))
         return self._from_ndarray(data)
-
-    def factorize(self, na_sentinel=-1):
-        """Factorize an IPArray into integer labels and unique values.
-
-        Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
-        will dispatch to this method.
-
-        Parameters
-        ----------
-        na_sentinel : int, default -1
-            The value in `labels` to use for indicating missing values in
-            `self`.
-
-        Returns
-        -------
-        labels : ndarray
-            An integer-type ndarray the same length as `self`. Each newly-
-            observed value in `self` will be assigned the next integer.
-            Missing values in self are assigned `na_sentinel`.
-        uniques : IPArray
-            The unique values in `self` in order of appereance, not including
-            the missing value ``IPv4Address('0.0.0.0')``.
-
-        See Also
-        --------
-        pandas.factorize, pandas.Series.factorize
-
-        Examples
-        --------
-        >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
-        >>> arr
-        IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
-                 '0.0.0.2', '::1:0:0:0:1'])
-
-        >>> labels, uniques = arr.factorize()
-        >>> labels
-        array([ 0,  0, -1,  1,  0,  2])
-
-        Notice that `uniques` does not include the missing value.
-        >>> uniques
-        IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
-        """
-        # OK, so here's the plan.
-        # Start with factorizing `self.data`, which has two unfortunate issues
-        # 1. Requires casting to object.
-        # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
-        # For now, we can't help with 1. Maybe someday.
-        # For 2, we can "fix" things with a little post-factorization cleanup.
-        l, u = pd.factorize(self.data)
-        mask = self.isna()
-        any_na = mask.any()
-
-        if any_na:
-            first_na = mask.argmax()
-            refactorize(l, first_na, na_sentinel=na_sentinel)  # inplace op
-
-        # u is an ndarray of tuples. Go to our record type, then an IPArray
-        u2 = type(self)((u.astype(self.dtype._record_type)))
-        # May have a missing value.
-        if any_na:
-            u2 = u2[~u2.isna()]
-        return l, u2
diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py
@@ -256,6 +256,9 @@ def equals(self, other):
         # TODO: missing
         return (self.data == other.data).all()
 
+    def _values_for_factorize(self):
+        return self.astype(object), ipaddress.IPv4Address(0)
+
     def isna(self):
         ips = self.data
         return (ips['lo'] == 0) & (ips['hi'] == 0)
diff --git a/cyberpandas/mac_array.py b/cyberpandas/mac_array.py
@@ -1,3 +1,5 @@
+from collections import Iterable
+
 import numpy as np
 import six
 
@@ -60,9 +62,7 @@ def _box_scalar(scalar):
         return scalar
 
     def __setitem__(self, key, value):
-        from .parser import to_ipaddress
-
-        value = to_ipaddress(value).data
+        value = to_macaddress(value)
         self.data[key] = value
 
     def __iter__(self):
@@ -88,6 +88,10 @@ def equals(self, other):
             raise TypeError
         return (self.data == other.data).all()
 
+    def _values_for_factorize(self):
+        # Should hit pandas' UInt64Hashtable
+        return self, 0
+
     def isna(self):
         return (self.data == 0)
 
@@ -126,3 +130,13 @@ def _parse(mac):
     # https://stackoverflow.com/a/36883363/1889400
     mac_int = int(mac.replace(":", "").replace("-", ""), 16)
     return mac_int
+
+
+def to_macaddress(addresses):
+    if (isinstance(addresses, six.string_types) or
+            not isinstance(addresses, Iterable)):
+        addresses = [addresses]
+
+    addresses = [_parse(mac) if isinstance(mac, six.string_types) else mac
+                 for mac in addresses]
+    return np.array(addresses, dtype='u8')
diff --git a/cyberpandas/parser.py b/cyberpandas/parser.py
@@ -57,7 +57,7 @@ def _to_int_pairs(values):
     if isinstance(values, (str, bytes, int)):
         values = ipaddress.ip_address(values)._ip
         return unpack(pack(values))
-    elif isinstance(values, np.ndarray):
+    elif isinstance(values, np.ndarray) and values.dtype != object:
         if values.ndim != 2:
             raise ValueError("'values' should be a 2-D when passing a "
                              "NumPy array.")
diff --git a/tests/mac/test_interface.py b/tests/mac/test_interface.py
@@ -31,12 +31,12 @@ def all_data(request, data, data_missing):
 
 @pytest.fixture
 def data_for_sorting():
-    return MACArray([10, 2 ** 64 + 1, 1])
+    return MACArray([10, 2 ** 64 - 1, 1])
 
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return MACArray([2 ** 64 + 1, 0, 1])
+    return MACArray([2 ** 64 - 1, 0, 1])
 
 
 @pytest.fixture