Skip to content

Commit 9497b39

Browse files
authored
Merge pull request #6906 from janezd/groupby-keep-variables
GroupBy: Avoid guessing variable types
2 parents c6a79f6 + c9edef4 commit 9497b39

7 files changed

Lines changed: 254 additions & 83 deletions

File tree

Orange/data/aggregate.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from functools import lru_cache
2-
from typing import Callable, Dict, List, Tuple, Union
2+
from typing import Callable, Dict, List, Tuple, Union, Type
33

44
import pandas as pd
55

@@ -39,15 +39,20 @@ def __init__(self, table: Table, by: List[Variable]):
3939
df = table_to_frame(table, include_metas=True)
4040
# observed=True keeps only groups with at least one instance
4141
self.group_by = df.groupby([a.name for a in by], observed=True)
42+
self.by = tuple(by)
4243

4344
# lru_cache that caches on the object level
4445
self.compute_aggregation = lru_cache()(self._compute_aggregation)
4546

47+
AggDescType = Union[str,
48+
Callable,
49+
Tuple[str, Union[str, Callable]],
50+
Tuple[str, Union[str, Callable], Union[Type[Variable], bool]]
51+
]
52+
4653
def aggregate(
4754
self,
48-
aggregations: Dict[
49-
Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
50-
],
55+
aggregations: Dict[Variable, List[AggDescType]],
5156
callback: Callable = dummy_callback,
5257
) -> Table:
5358
"""
@@ -57,12 +62,16 @@ def aggregate(
5762
----------
5863
aggregations
5964
The dictionary that defines aggregations that need to be computed
60-
for variables. We support two formats:
65+
for variables. We support three formats:
6166
- {variable name: [agg function 1, agg function 2]}
6267
- {variable name: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
68+
- {variable name: [(agg name 1, agg function 1, output_variable_type1), ...]}
6369
Where agg name is the aggregation name used in the output column name.
6470
Aggregation function can be either function or string that defines
6571
aggregation in Pandas (e.g. mean).
72+
output_variable_type can be a type for a new variable, True to copy
73+
the input variable, or False to create a new variable of the same type
74+
as the input
6675
callback
6776
Callback function to report the progress
6877
@@ -75,29 +84,44 @@ def aggregate(
7584
count = 0
7685

7786
result_agg = []
87+
output_variables = []
7888
for col, aggs in aggregations.items():
7989
for agg in aggs:
80-
res = self._compute_aggregation(col, agg)
90+
res, var = self._compute_aggregation(col, agg)
8191
result_agg.append(res)
92+
output_variables.append(var)
8293
count += 1
8394
callback(count / num_aggs * 0.8)
8495

85-
agg_table = self._aggregations_to_table(result_agg)
96+
agg_table = self._aggregations_to_table(result_agg, output_variables)
8697
callback(1)
8798
return agg_table
8899

89100
def _compute_aggregation(
90-
self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
91-
) -> pd.Series:
101+
self, col: Variable, agg: AggDescType) -> Tuple[pd.Series, Variable]:
92102
# use named aggregation to avoid issues with same column names when reset_index
93103
if isinstance(agg, tuple):
94-
name, agg = agg
104+
name, agg, var_type, *_ = (*agg, None)
95105
else:
96106
name = agg if isinstance(agg, str) else agg.__name__
107+
var_type = None
97108
col_name = f"{col.name} - {name}"
98-
return self.group_by[col.name].agg(**{col_name: agg})
99-
100-
def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
109+
agg_col = self.group_by[col.name].agg(**{col_name: agg})
110+
if var_type is True:
111+
var = col.copy(name=col_name)
112+
elif var_type is False:
113+
var = col.make(name=col_name)
114+
elif var_type is None:
115+
var = None
116+
else:
117+
assert issubclass(var_type, Variable)
118+
var = var_type.make(name=col_name)
119+
return agg_col, var
120+
121+
def _aggregations_to_table(
122+
self,
123+
aggregations: List[pd.Series],
124+
output_variables: List[Union[Variable, None]]) -> Table:
101125
"""Concatenate aggregation series and convert back to Table"""
102126
if aggregations:
103127
df = pd.concat(aggregations, axis=1)
@@ -107,7 +131,7 @@ def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
107131
df = df.drop(columns=df.columns)
108132
gb_attributes = df.index.names
109133
df = df.reset_index() # move group by var that are in index to columns
110-
table = table_from_frame(df)
134+
table = table_from_frame(df, variables=(*self.by, *output_variables))
111135

112136
# group by variables should be last two columns in metas in the output
113137
metas = table.domain.metas

Orange/data/pandas_compat.py

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Pandas DataFrame↔Table conversion helpers"""
22
from functools import partial
3+
from itertools import zip_longest
34

45
import numpy as np
56
from scipy import sparse as sp
@@ -255,7 +256,14 @@ def to_categorical(s, _):
255256
return np.asarray(x)
256257

257258

258-
def vars_from_df(df, role=None, force_nominal=False):
259+
def to_numeric(s, _):
260+
return np.asarray(pd.to_numeric(s))
261+
262+
263+
def vars_from_df(df, role=None, force_nominal=False, variables=None):
264+
if variables is not None:
265+
assert len(variables) == len(df.columns)
266+
259267
if role is None and hasattr(df, 'orange_role'):
260268
role = df.orange_role
261269
df = _reset_index(df)
@@ -264,39 +272,52 @@ def vars_from_df(df, role=None, force_nominal=False):
264272
exprs = [], [], []
265273
vars_ = [], [], []
266274

267-
for column in df.columns:
275+
def _convert_string(s, _):
276+
return np.asarray(
277+
# to object so that fillna can replace with nans if Unknown in nan
278+
# replace nan with object Unknown assure that all values are string
279+
s.astype(object).fillna(StringVariable.Unknown).astype(str),
280+
dtype=object
281+
)
282+
283+
conversions = {
284+
DiscreteVariable: to_categorical,
285+
ContinuousVariable: to_numeric,
286+
TimeVariable: _convert_datetime,
287+
StringVariable: _convert_string
288+
}
289+
290+
for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
268291
s = df[column]
269292
_role = Role.Attribute if role is None else role
270-
if hasattr(df, 'orange_variables') and column in df.orange_variables:
293+
if var is not None:
294+
if not var.is_primitive():
295+
_role = Role.Meta
296+
expr = conversions[type(var)]
297+
elif hasattr(df, 'orange_variables') and column in df.orange_variables:
271298
original_var = df.orange_variables[column]
272299
var = original_var.copy(compute_value=None)
273300
expr = None
274-
elif _is_datetime(s):
275-
var = TimeVariable(str(column))
276-
expr = _convert_datetime
277-
elif _is_discrete(s, force_nominal):
278-
discrete = s.astype("category").cat
279-
var = DiscreteVariable(
280-
str(column), discrete.categories.astype(str).tolist()
281-
)
282-
expr = to_categorical
283-
elif is_numeric_dtype(s):
284-
var = ContinuousVariable(
285-
# set number of decimals to 0 if int else keeps default behaviour
286-
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
287-
)
288-
expr = None
289301
else:
290-
if role is not None and role != Role.Meta:
291-
raise ValueError("String variable must be in metas.")
292-
_role = Role.Meta
293-
var = StringVariable(str(column))
294-
expr = lambda s, _: np.asarray(
295-
# to object so that fillna can replace with nans if Unknown in nan
296-
# replace nan with object Unknown assure that all values are string
297-
s.astype(object).fillna(StringVariable.Unknown).astype(str),
298-
dtype=object
299-
)
302+
if _is_datetime(s):
303+
var = TimeVariable(str(column))
304+
elif _is_discrete(s, force_nominal):
305+
discrete = s.astype("category").cat
306+
var = DiscreteVariable(
307+
str(column), discrete.categories.astype(str).tolist()
308+
)
309+
elif is_numeric_dtype(s):
310+
var = ContinuousVariable(
311+
# set number of decimals to 0 if int else keeps default behaviour
312+
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
313+
)
314+
else:
315+
if role is not None and role != Role.Meta:
316+
raise ValueError("String variable must be in metas.")
317+
_role = Role.Meta
318+
var = StringVariable(str(column))
319+
expr = conversions[type(var)]
320+
300321

301322
cols[_role].append(column)
302323
exprs[_role].append(expr)
@@ -330,8 +351,10 @@ def vars_from_df(df, role=None, force_nominal=False):
330351
return xym, Domain(*vars_)
331352

332353

333-
def table_from_frame(df, *, force_nominal=False):
334-
XYM, domain = vars_from_df(df, force_nominal=force_nominal)
354+
def table_from_frame(df, *, force_nominal=False, variables=None):
355+
XYM, domain = vars_from_df(df,
356+
force_nominal=force_nominal,
357+
variables=variables)
335358

336359
if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
337360
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]

Orange/data/tests/test_aggregate.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import unittest
2+
from unittest.mock import Mock
23

34
import numpy as np
45
import pandas as pd
@@ -132,13 +133,69 @@ def test_aggregation(self):
132133
def test_preserve_table_class(self):
133134
"""
134135
Test whether the result table has the same type as the input table,
135-
e.g. if input table corpus the resutlitn table must be corpus too.
136+
e.g. if the input table is a corpus, the resulting table must be a corpus too.
136137
"""
137138
data = AlternativeTable.from_table(self.data.domain, self.data)
138139
gb = data.groupby([data.domain["a"]])
139140
output = gb.aggregate({data.domain["a"]: ["mean"]})
140141
self.assertIsInstance(output, AlternativeTable)
141142

143+
def test_preserve_variables(self):
144+
a, _, _, dvar = self.data.domain.attributes
145+
gb = self.data.groupby([a])
146+
147+
a.attributes = {"foo": "bar"}
148+
dvar.attributes = {"foo": "baz"}
149+
150+
a.copy = Mock(side_effect=a.copy)
151+
a.make = Mock(side_effect=a.make)
152+
153+
def f(*_):
154+
return 0
155+
156+
output = gb.aggregate(
157+
{a: [("copy", f, True),
158+
("make", f, False),
159+
("auto", f, None),
160+
("string", f, StringVariable),
161+
("number", f, ContinuousVariable)],
162+
dvar: [("copy", f, True),
163+
("make", f, False),
164+
("auto", f, None),
165+
("string", f, StringVariable),
166+
("discrete", f, DiscreteVariable)]}
167+
)
168+
self.assertIsInstance(output.domain["a - copy"], ContinuousVariable)
169+
a.copy.assert_called_once()
170+
self.assertEqual(output.domain["a - copy"].attributes, {"foo": "bar"})
171+
172+
self.assertIsInstance(output.domain["a - make"], ContinuousVariable)
173+
a.make.assert_called_once()
174+
self.assertNotEqual(output.domain["a - make"].attributes, {"foo": "bar"})
175+
176+
self.assertIsInstance(output.domain["a - auto"], ContinuousVariable)
177+
self.assertNotEqual(output.domain["a - auto"].attributes, {"foo": "bar"})
178+
179+
self.assertIsInstance(output.domain["a - string"], StringVariable)
180+
181+
self.assertIsInstance(output.domain["a - number"], ContinuousVariable)
182+
self.assertNotEqual(output.domain["a - number"].attributes, {"foo": "bar"})
183+
184+
self.assertIsInstance(output.domain["dvar - copy"], DiscreteVariable)
185+
self.assertEqual(output.domain["dvar - copy"].attributes, {"foo": "baz"})
186+
187+
self.assertIsInstance(output.domain["dvar - make"], DiscreteVariable)
188+
self.assertNotEqual(output.domain["dvar - make"].attributes, {"foo": "baz"})
189+
190+
# f returns 0, so the column looks numeric! Let's test that it is
191+
# converted to numeric.
192+
self.assertIsInstance(output.domain["dvar - auto"], ContinuousVariable)
193+
194+
self.assertIsInstance(output.domain["dvar - string"], StringVariable)
195+
196+
self.assertIsInstance(output.domain["dvar - discrete"], DiscreteVariable)
197+
self.assertNotEqual(output.domain["dvar - discrete"].attributes, {"foo": "baz"})
198+
142199

143200
if __name__ == "__main__":
144201
unittest.main()
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,24 @@ def test_table_from_frame(self):
5656
self.assertEqual(names, ['0', '1', '2'])
5757
self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])
5858

59+
# Specify (some) variables
60+
dvar = DiscreteVariable('x', values=tuple("dacb"))
61+
cvar = ContinuousVariable('y')
62+
table = table_from_frame(df, variables=[dvar, cvar, None])
63+
self.assertIs(table.domain[0], dvar)
64+
self.assertIs(table.domain[1], cvar)
65+
self.assertIsInstance(table.domain[2], TimeVariable)
66+
67+
table = table_from_frame(df,
68+
variables=[None, None, None],
69+
force_nominal=True)
70+
self.assertIsInstance(table.domain[0], DiscreteVariable)
71+
self.assertIsInstance(table.domain[1], ContinuousVariable)
72+
self.assertIsInstance(table.domain[2], TimeVariable)
73+
74+
self.assertRaises(AssertionError,
75+
table_from_frame, df, variables=[None, None])
76+
5977
# Include index
6078
df.index = list('abaa')
6179
table = table_from_frame(df)

0 commit comments

Comments
 (0)