Skip to content

Commit b3152b0

Browse files
Fixed factorize for MACArray (#13)
* Fixed factorize for MACArray (relies on pandas-dev/pandas#19957)
* Build on na_value
* Include groupby patch
1 parent 468644b commit b3152b0

File tree

7 files changed

+233
-71
lines changed

7 files changed

+233
-71
lines changed
+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
2+
index 601acac20..7c89cab6b 100644
3+
--- a/pandas/core/groupby.py
4+
+++ b/pandas/core/groupby.py
5+
@@ -44,7 +44,7 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
6+
DataError, SpecificationError)
7+
from pandas.core.index import (Index, MultiIndex,
8+
CategoricalIndex, _ensure_index)
9+
-from pandas.core.arrays import Categorical
10+
+from pandas.core.arrays import ExtensionArray, Categorical
11+
from pandas.core.frame import DataFrame
12+
from pandas.core.generic import NDFrame, _shared_docs
13+
from pandas.core.internals import BlockManager, make_block
14+
@@ -2968,7 +2968,7 @@ class Grouping(object):
15+
16+
# no level passed
17+
elif not isinstance(self.grouper,
18+
- (Series, Index, Categorical, np.ndarray)):
19+
+ (Series, Index, ExtensionArray, np.ndarray)):
20+
if getattr(self.grouper, 'ndim', 1) != 1:
21+
t = self.name or str(type(self.grouper))
22+
raise ValueError("Grouper for '%s' not 1-dimensional" % t)
23+
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
24+
index 27c106efd..f8078d279 100644
25+
--- a/pandas/tests/extension/base/__init__.py
26+
+++ b/pandas/tests/extension/base/__init__.py
27+
@@ -44,6 +44,7 @@ from .casting import BaseCastingTests # noqa
28+
from .constructors import BaseConstructorsTests # noqa
29+
from .dtype import BaseDtypeTests # noqa
30+
from .getitem import BaseGetitemTests # noqa
31+
+from .groupby import BaseGroupbyTests # noqa
32+
from .interface import BaseInterfaceTests # noqa
33+
from .methods import BaseMethodsTests # noqa
34+
from .missing import BaseMissingTests # noqa
35+
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
36+
new file mode 100644
37+
index 000000000..a29ef2a50
38+
--- /dev/null
39+
+++ b/pandas/tests/extension/base/groupby.py
40+
@@ -0,0 +1,69 @@
41+
+import pytest
42+
+
43+
+import pandas.util.testing as tm
44+
+import pandas as pd
45+
+from .base import BaseExtensionTests
46+
+
47+
+
48+
+class BaseGroupbyTests(BaseExtensionTests):
49+
+ """Groupby-specific tests."""
50+
+
51+
+ def test_grouping_grouper(self, data_for_grouping):
52+
+ df = pd.DataFrame({
53+
+ "A": ["B", "B", None, None, "A", "A", "B", "C"],
54+
+ "B": data_for_grouping
55+
+ })
56+
+ gr1 = df.groupby("A").grouper.groupings[0]
57+
+ gr2 = df.groupby("B").grouper.groupings[0]
58+
+
59+
+ tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
60+
+ tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)
61+
+
62+
+ @pytest.mark.parametrize('as_index', [True, False])
63+
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
64+
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
65+
+ "B": data_for_grouping})
66+
+ result = df.groupby("B", as_index=as_index).A.mean()
67+
+ _, index = pd.factorize(data_for_grouping, sort=True)
68+
+ # TODO(ExtensionIndex): remove astype
69+
+ index = pd.Index(index.astype(object), name="B")
70+
+ expected = pd.Series([3, 1, 4], index=index, name="A")
71+
+ if as_index:
72+
+ self.assert_series_equal(result, expected)
73+
+ else:
74+
+ expected = expected.reset_index()
75+
+ self.assert_frame_equal(result, expected)
76+
+
77+
+ def test_groupby_extension_no_sort(self, data_for_grouping):
78+
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
79+
+ "B": data_for_grouping})
80+
+ result = df.groupby("B", sort=False).A.mean()
81+
+ _, index = pd.factorize(data_for_grouping, sort=False)
82+
+ # TODO(ExtensionIndex): remove astype
83+
+ index = pd.Index(index.astype(object), name="B")
84+
+ expected = pd.Series([1, 3, 4], index=index, name="A")
85+
+ self.assert_series_equal(result, expected)
86+
+
87+
+ def test_groupby_extension_transform(self, data_for_grouping):
88+
+ valid = data_for_grouping[~data_for_grouping.isna()]
89+
+ df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4],
90+
+ "B": valid})
91+
+
92+
+ result = df.groupby("B").A.transform(len)
93+
+ expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
94+
+
95+
+ self.assert_series_equal(result, expected)
96+
+
97+
+ @pytest.mark.parametrize('op', [
98+
+ lambda x: 1,
99+
+ lambda x: [1] * len(x),
100+
+ lambda x: pd.Series([1] * len(x)),
101+
+ lambda x: x,
102+
+ ], ids=['scalar', 'list', 'series', 'object'])
103+
+ def test_groupby_extension_apply(self, data_for_grouping, op):
104+
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
105+
+ "B": data_for_grouping})
106+
+ df.groupby("B").apply(op)
107+
+ df.groupby("B").A.apply(op)
108+
+ df.groupby("A").apply(op)
109+
+ df.groupby("A").B.apply(op)
110+
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
111+
index 22c1a67a0..d50917056 100644
112+
--- a/pandas/tests/extension/decimal/test_decimal.py
113+
+++ b/pandas/tests/extension/decimal/test_decimal.py
114+
@@ -127,6 +127,10 @@ class TestCasting(BaseDecimal, base.BaseCastingTests):
115+
pass
116+
117+
118+
+class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
119+
+ pass
120+
+
121+
+
122+
def test_series_constructor_coerce_data_to_extension_dtype_raises():
123+
xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the "
124+
"extension array directly.")
125+
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
126+
index 51a68a370..d9ae49d87 100644
127+
--- a/pandas/tests/extension/json/array.py
128+
+++ b/pandas/tests/extension/json/array.py
129+
@@ -113,8 +113,8 @@ class JSONArray(ExtensionArray):
130+
return cls(data)
131+
132+
def _values_for_factorize(self):
133+
- frozen = tuple(tuple(x.items()) for x in self)
134+
- return np.array(frozen, dtype=object), ()
135+
+ frozen = self._values_for_argsort()
136+
+ return frozen, ()
137+
138+
def _values_for_argsort(self):
139+
# Disable NumPy's shape inference by including an empty tuple...
140+
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
141+
index 63d97d5e7..5e9639c48 100644
142+
--- a/pandas/tests/extension/json/test_json.py
143+
+++ b/pandas/tests/extension/json/test_json.py
144+
@@ -89,11 +89,12 @@ class TestMissing(base.BaseMissingTests):
145+
"""We treat dictionaries as a mapping in fillna, not a scalar."""
146+
147+
148+
-class TestMethods(base.BaseMethodsTests):
149+
- unhashable = pytest.mark.skip(reason="Unhashable")
150+
- unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
151+
- reason="Dictionary order unstable")
152+
+unhashable = pytest.mark.skip(reason="Unhashable")
153+
+unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
154+
+ reason="Dictionary order unstable")
155+
+
156+
157+
+class TestMethods(base.BaseMethodsTests):
158+
@unhashable
159+
def test_value_counts(self, all_data, dropna):
160+
pass
161+
@@ -118,6 +119,7 @@ class TestMethods(base.BaseMethodsTests):
162+
super(TestMethods, self).test_sort_values(
163+
data_for_sorting, ascending)
164+
165+
+ @unstable
166+
@pytest.mark.parametrize('ascending', [True, False])
167+
def test_sort_values_missing(self, data_missing_for_sorting, ascending):
168+
super(TestMethods, self).test_sort_values_missing(
169+
@@ -126,3 +128,34 @@ class TestMethods(base.BaseMethodsTests):
170+
171+
class TestCasting(base.BaseCastingTests):
172+
pass
173+
+
174+
+
175+
+class TestGroupby(base.BaseGroupbyTests):
176+
+
177+
+ @unhashable
178+
+ def test_groupby_extension_transform(self):
179+
+ """
180+
+ This currently fails in Series.name.setter, since the
181+
+ name must be hashable, but the value is a dictionary.
182+
+ I think this is what we want, i.e. `.name` should be the original
183+
+ values, and not the values for factorization.
184+
+ """
185+
+
186+
+ @unhashable
187+
+ def test_groupby_extension_apply(self):
188+
+ """
189+
+ This fails in Index._do_unique_check with
190+
+
191+
+ > hash(val)
192+
+ E TypeError: unhashable type: 'UserDict' with
193+
+
194+
+ I suspect that once we support Index[ExtensionArray],
195+
+ we'll be able to dispatch unique.
196+
+ """
197+
+
198+
+ @unstable
199+
+ @pytest.mark.parametrize('as_index', [True, False])
200+
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
201+
+ super(TestGroupby, self).test_groupby_extension_agg(
202+
+ as_index, data_for_grouping
203+
+ )

conda-recipes/pandas/meta.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ build:
77

88
source:
99
git_url: https://github.com/pandas-dev/pandas
10+
git_rev: 766a480
11+
patches:
12+
- 0001-pandas.patch
1013

1114
requirements:
1215
build:

cyberpandas/base.py

+4-65
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22

33
import numpy as np
44

5-
import pandas as pd
65
from pandas.core.arrays import ExtensionArray
76

8-
from ._utils import refactorize
9-
107

118
class NumPyBackedExtensionArrayMixin(ExtensionArray):
129
@property
@@ -18,6 +15,10 @@ def dtype(self):
1815
def _constructor_from_sequence(cls, scalars):
1916
return cls(scalars)
2017

18+
@classmethod
19+
def _from_factorized(cls, values, original):
20+
return cls(values)
21+
2122
@property
2223
def shape(self):
2324
return (len(self.data),)
@@ -68,65 +69,3 @@ def unique(self):
6869
_, indices = np.unique(self.data, return_index=True)
6970
data = self.data.take(np.sort(indices))
7071
return self._from_ndarray(data)
71-
72-
def factorize(self, na_sentinel=-1):
73-
"""Factorize an IPArray into integer labels and unique values.
74-
75-
Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
76-
will dispatch to this method.
77-
78-
Parameters
79-
----------
80-
na_sentinel : int, default -1
81-
The value in `labels` to use for indicating missing values in
82-
`self`.
83-
84-
Returns
85-
-------
86-
labels : ndarray
87-
An integer-type ndarray the same length as `self`. Each newly-
88-
observed value in `self` will be assigned the next integer.
89-
Missing values in self are assigned `na_sentinel`.
90-
uniques : IPArray
91-
The unique values in `self` in order of appereance, not including
92-
the missing value ``IPv4Address('0.0.0.0')``.
93-
94-
See Also
95-
--------
96-
pandas.factorize, pandas.Series.factorize
97-
98-
Examples
99-
--------
100-
>>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
101-
>>> arr
102-
IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
103-
'0.0.0.2', '::1:0:0:0:1'])
104-
105-
>>> labels, uniques = arr.factorize()
106-
>>> labels
107-
array([ 0, 0, -1, 1, 0, 2])
108-
109-
Notice that `uniques` does not include the missing value.
110-
>>> uniques
111-
IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
112-
"""
113-
# OK, so here's the plan.
114-
# Start with factorizing `self.data`, which has two unfortunate issues
115-
# 1. Requires casting to object.
116-
# 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
117-
# For now, we can't help with 1. Maybe someday.
118-
# For 2, we can "fix" things with a little post-factorization cleanup.
119-
l, u = pd.factorize(self.data)
120-
mask = self.isna()
121-
any_na = mask.any()
122-
123-
if any_na:
124-
first_na = mask.argmax()
125-
refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op
126-
127-
# u is an ndarray of tuples. Go to our record type, then an IPArray
128-
u2 = type(self)((u.astype(self.dtype._record_type)))
129-
# May have a missing value.
130-
if any_na:
131-
u2 = u2[~u2.isna()]
132-
return l, u2

cyberpandas/ip_array.py

+3
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ def equals(self, other):
256256
# TODO: missing
257257
return (self.data == other.data).all()
258258

259+
def _values_for_factorize(self):
260+
return self.astype(object), ipaddress.IPv4Address(0)
261+
259262
def isna(self):
260263
ips = self.data
261264
return (ips['lo'] == 0) & (ips['hi'] == 0)

cyberpandas/mac_array.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
try:
    from collections.abc import Iterable
except ImportError:  # Python 2: the ABCs live directly in ``collections``
    from collections import Iterable

import numpy as np
import six
35

@@ -60,9 +62,7 @@ def _box_scalar(scalar):
6062
return scalar
6163

6264
def __setitem__(self, key, value):
63-
from .parser import to_ipaddress
64-
65-
value = to_ipaddress(value).data
65+
value = to_macaddress(value)
6666
self.data[key] = value
6767

6868
def __iter__(self):
@@ -88,6 +88,10 @@ def equals(self, other):
8888
raise TypeError
8989
return (self.data == other.data).all()
9090

91+
def _values_for_factorize(self):
92+
# Should hit pandas' UInt64Hashtable
93+
return self, 0
94+
9195
def isna(self):
9296
return (self.data == 0)
9397

@@ -126,3 +130,13 @@ def _parse(mac):
126130
# https://stackoverflow.com/a/36883363/1889400
127131
mac_int = int(mac.replace(":", "").replace("-", ""), 16)
128132
return mac_int
133+
134+
135+
def to_macaddress(addresses):
    """Convert one or more MAC addresses to an array of unsigned 64-bit ints.

    Parameters
    ----------
    addresses : str, int, or iterable of str or int
        A single string, or any value that is not iterable, is treated as
        one address. String elements are converted with ``_parse`` (hex
        digits, optionally separated by ``:`` or ``-``); every other
        element is passed through unchanged.

    Returns
    -------
    numpy.ndarray
        Array of the converted addresses with dtype ``'u8'``.
    """
    # ``Iterable`` has to be the ``collections.abc`` ABC: the bare
    # ``collections.Iterable`` alias was removed in Python 3.10.
    # Strings are iterable too, so they are checked first and kept whole.
    is_scalar = (isinstance(addresses, six.string_types) or
                 not isinstance(addresses, Iterable))
    if is_scalar:
        addresses = [addresses]

    parsed = [_parse(mac) if isinstance(mac, six.string_types) else mac
              for mac in addresses]
    return np.array(parsed, dtype='u8')

cyberpandas/parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def _to_int_pairs(values):
5757
if isinstance(values, (str, bytes, int)):
5858
values = ipaddress.ip_address(values)._ip
5959
return unpack(pack(values))
60-
elif isinstance(values, np.ndarray):
60+
elif isinstance(values, np.ndarray) and values.dtype != object:
6161
if values.ndim != 2:
6262
raise ValueError("'values' should be a 2-D when passing a "
6363
"NumPy array.")

tests/mac/test_interface.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@ def all_data(request, data, data_missing):
3131

3232
@pytest.fixture
3333
def data_for_sorting():
34-
return MACArray([10, 2 ** 64 + 1, 1])
34+
return MACArray([10, 2 ** 64 - 1, 1])
3535

3636

3737
@pytest.fixture
3838
def data_missing_for_sorting():
39-
return MACArray([2 ** 64 + 1, 0, 1])
39+
return MACArray([2 ** 64 - 1, 0, 1])
4040

4141

4242
@pytest.fixture

0 commit comments

Comments
 (0)