biolab
diff --git a/‎Orange/data/aggregate.py‎
Lines changed: 38 additions & 14 deletions b/‎Orange/data/aggregate.py‎
Lines changed: 38 additions & 14 deletions
diff --git a/‎Orange/data/pandas_compat.py‎
Lines changed: 53 additions & 30 deletions b/‎Orange/data/pandas_compat.py‎
Lines changed: 53 additions & 30 deletions
diff --git a/‎Orange/data/tests/test_aggregate.py‎
Lines changed: 58 additions & 1 deletion b/‎Orange/data/tests/test_aggregate.py‎
Lines changed: 58 additions & 1 deletion
diff --git a/‎Orange/data/tests/test_pandas.py‎ ‎Orange/data/tests/test_pandas_compat.py‎Orange/data/tests/test_pandas.py renamed to Orange/data/tests/test_pandas_compat.py
Lines changed: 18 additions & 0 deletions b/‎Orange/data/tests/test_pandas.py‎ ‎Orange/data/tests/test_pandas_compat.py‎Orange/data/tests/test_pandas.py renamed to Orange/data/tests/test_pandas_compat.py
Lines changed: 18 additions & 0 deletions
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Callable, Dict, List, Tuple, Union
+from typing import Callable, Dict, List, Tuple, Union, Type
 
 import pandas as pd
 
@@ -39,15 +39,20 @@ def __init__(self, table: Table, by: List[Variable]):
         df = table_to_frame(table, include_metas=True)
-        # observed=True keeps only groups with at leas one instance
+        # observed=True keeps only groups with at least one instance
         self.group_by = df.groupby([a.name for a in by], observed=True)
+        self.by = tuple(by)
 
-        # lru_cache that is caches on the object level
+        # lru_cache that caches on the object level
         self.compute_aggregation = lru_cache()(self._compute_aggregation)
 
+    AggDescType = Union[str,
+                    Callable,
+                    Tuple[str, Union[str, Callable]],
+                    Tuple[str, Union[str, Callable], Union[Type[Variable], bool]]
+    ]
+
     def aggregate(
         self,
-        aggregations: Dict[
-            Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
-        ],
+        aggregations: Dict[Variable, List[AggDescType]],
         callback: Callable = dummy_callback,
     ) -> Table:
         """
@@ -57,12 +62,16 @@ def aggregate(
         ----------
         aggregations
             The dictionary that defines aggregations that need to be computed
-            for variables. We support two formats:
+            for variables. We support three formats:
             - {variable name: [agg function 1, agg function 2]}
-            - {variable name: [(agg name 1, agg function 1),  (agg name 1, agg function 1)]}
+            - {variable name: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
+            - {variable name: [(agg name 1, agg function 1, output_variable_type1), ...]}
             Where agg name is the aggregation name used in the output column name.
             Aggregation function can be either function or string that defines
             aggregation in Pandas (e.g. mean).
+            output_variable_type can be a Variable subclass for a new variable,
+            True to copy the input variable, False to make a new variable of the
+            input's type, or None/omitted to let table_from_frame choose it.
         callback
             Callback function to report the progress
 
@@ -75,29 +84,44 @@ def aggregate(
         count = 0
 
         result_agg = []
+        output_variables = []
         for col, aggs in aggregations.items():
             for agg in aggs:
-                res = self._compute_aggregation(col, agg)
+                res, var = self._compute_aggregation(col, agg)
                 result_agg.append(res)
+                output_variables.append(var)
                 count += 1
                 callback(count / num_aggs * 0.8)
 
-        agg_table = self._aggregations_to_table(result_agg)
+        agg_table = self._aggregations_to_table(result_agg, output_variables)
         callback(1)
         return agg_table
 
     def _compute_aggregation(
-        self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
-    ) -> pd.Series:
+            self, col: Variable, agg: AggDescType) -> Tuple[pd.Series, Variable]:
+        """
+        Compute one aggregation of the column `col` over `self.group_by`.
+
+        Parameters
+        ----------
+        col
+            Variable whose column (selected by `col.name`) is aggregated.
+        agg
+            Aggregation descriptor: a string or a callable, or a tuple
+            (name, aggregation) or (name, aggregation, var_type).
+
+        Returns
+        -------
+        The result of the pandas aggregation, whose column is named
+        "{col.name} - {name}", and the output variable: `col.copy(...)` if
+        var_type is True, `col.make(...)` if it is False, `var_type.make(...)`
+        if it is a Variable subclass, and None if var_type is None or absent.
+        """
         # use named aggregation to avoid issues with same column names when reset_index
         if isinstance(agg, tuple):
-            name, agg = agg
+            # appending None makes var_type default to None for two-element
+            # tuples; *_ absorbs the extra None when var_type is given
+            name, agg, var_type, *_ = (*agg, None)
         else:
             name = agg if isinstance(agg, str) else agg.__name__
+            var_type = None
         col_name = f"{col.name} - {name}"
-        return self.group_by[col.name].agg(**{col_name: agg})
-
-    def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
+        agg_col = self.group_by[col.name].agg(**{col_name: agg})
+        if var_type is True:
+            var = col.copy(name=col_name)
+        elif var_type is False:
+            var = col.make(name=col_name)
+        elif var_type is None:
+            var = None
+        else:
+            assert issubclass(var_type, Variable)
+            var = var_type.make(name=col_name)
+        return agg_col, var
+
+    def _aggregations_to_table(
+            self,
+            aggregations: List[pd.Series],
+            output_variables: List[Union[Variable, None]]) -> Table:
         """Concatenate aggregation series and convert back to Table"""
         if aggregations:
             df = pd.concat(aggregations, axis=1)
@@ -107,7 +131,7 @@ def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
             df = df.drop(columns=df.columns)
         gb_attributes = df.index.names
         df = df.reset_index()  # move group by var that are in index to columns
-        table = table_from_frame(df)
+        table = table_from_frame(df, variables=(*self.by, *output_variables))
 
         # group by variables should be last two columns in metas in the output
         metas = table.domain.metas
 
@@ -1,5 +1,6 @@
 """Pandas DataFrame↔Table conversion helpers"""
 from functools import partial
+from itertools import zip_longest
 
 import numpy as np
 from scipy import sparse as sp
@@ -255,7 +256,14 @@ def to_categorical(s, _):
     return np.asarray(x)
 
 
-def vars_from_df(df, role=None, force_nominal=False):
+def to_numeric(s, _):
+    """
+    Convert `s` with `pd.to_numeric` and return the result as a numpy array.
+
+    The second argument is accepted but not used.
+    """
+    return np.asarray(pd.to_numeric(s))
+
+
+def vars_from_df(df, role=None, force_nominal=False, variables=None):
+    if variables is not None:
+        assert len(variables) == len(df.columns)
+
     if role is None and hasattr(df, 'orange_role'):
         role = df.orange_role
     df = _reset_index(df)
@@ -264,39 +272,52 @@ def vars_from_df(df, role=None, force_nominal=False):
     exprs = [], [], []
     vars_ = [], [], []
 
-    for column in df.columns:
+    def _convert_string(s, _):
+        return np.asarray(
+                    # cast to object so that fillna can put StringVariable.Unknown
+                    # in place of nan; the final astype(str) makes all values strings
+                    s.astype(object).fillna(StringVariable.Unknown).astype(str),
+                    dtype=object
+                )
+
+    conversions = {
+        DiscreteVariable: to_categorical,
+        ContinuousVariable: to_numeric,
+        TimeVariable: _convert_datetime,
+        StringVariable: _convert_string
+    }
+
+    for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
         s = df[column]
         _role = Role.Attribute if role is None else role
-        if hasattr(df, 'orange_variables') and column in df.orange_variables:
+        if var is not None:
+            if not var.is_primitive():
+                _role = Role.Meta
+            expr = conversions[type(var)]
+        elif hasattr(df, 'orange_variables') and column in df.orange_variables:
             original_var = df.orange_variables[column]
             var = original_var.copy(compute_value=None)
             expr = None
-        elif _is_datetime(s):
-            var = TimeVariable(str(column))
-            expr = _convert_datetime
-        elif _is_discrete(s, force_nominal):
-            discrete = s.astype("category").cat
-            var = DiscreteVariable(
-                str(column), discrete.categories.astype(str).tolist()
-            )
-            expr = to_categorical
-        elif is_numeric_dtype(s):
-            var = ContinuousVariable(
-                # set number of decimals to 0 if int else keeps default behaviour
-                str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
-            )
-            expr = None
         else:
-            if role is not None and role != Role.Meta:
-                raise ValueError("String variable must be in metas.")
-            _role = Role.Meta
-            var = StringVariable(str(column))
-            expr = lambda s, _: np.asarray(
-                # to object so that fillna can replace with nans if Unknown in nan
-                # replace nan with object Unknown assure that all values are string
-                s.astype(object).fillna(StringVariable.Unknown).astype(str),
-                dtype=object
-            )
+            if _is_datetime(s):
+                var = TimeVariable(str(column))
+            elif _is_discrete(s, force_nominal):
+                discrete = s.astype("category").cat
+                var = DiscreteVariable(
+                    str(column), discrete.categories.astype(str).tolist()
+                )
+            elif is_numeric_dtype(s):
+                var = ContinuousVariable(
+                    # set number of decimals to 0 if int else keeps default behaviour
+                    str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
+                )
+            else:
+                if role is not None and role != Role.Meta:
+                    raise ValueError("String variable must be in metas.")
+                _role = Role.Meta
+                var = StringVariable(str(column))
+            expr = conversions[type(var)]
+
 
         cols[_role].append(column)
         exprs[_role].append(expr)
@@ -330,8 +351,10 @@ def vars_from_df(df, role=None, force_nominal=False):
     return xym, Domain(*vars_)
 
 
-def table_from_frame(df, *, force_nominal=False):
-    XYM, domain = vars_from_df(df, force_nominal=force_nominal)
+def table_from_frame(df, *, force_nominal=False, variables=None):
+    XYM, domain = vars_from_df(df,
+                               force_nominal=force_nominal,
+                               variables=variables)
 
     if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
         W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
 
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import Mock
 
 import numpy as np
 import pandas as pd
@@ -132,13 +133,69 @@ def test_aggregation(self):
     def test_preserve_table_class(self):
         """
-        Test whether result table has the same type than the imnput table,
-        e.g. if input table corpus the resutlitn table must be corpus too.
+        Test whether the result table has the same type as the input table,
+        e.g. if the input table is a corpus, the result must be a corpus too.
         """
         data = AlternativeTable.from_table(self.data.domain, self.data)
         gb = data.groupby([data.domain["a"]])
         output = gb.aggregate({data.domain["a"]: ["mean"]})
         self.assertIsInstance(output, AlternativeTable)
 
+    def test_preserve_variables(self):
+        """
+        Test the third element of the aggregation descriptor, which selects
+        the output variable: True, False, None or a variable type.
+        """
+        a, _, _, dvar = self.data.domain.attributes
+        gb = self.data.groupby([a])
+
+        # attributes let the assertions below tell a copy of the input
+        # variable from a newly made one
+        a.attributes = {"foo": "bar"}
+        dvar.attributes = {"foo": "baz"}
+
+        # wrap copy and make in mocks that still call the real methods, so
+        # that the number of calls can be asserted
+        a.copy = Mock(side_effect=a.copy)
+        a.make = Mock(side_effect=a.make)
+
+        # aggregation function that returns 0 regardless of its arguments
+        def f(*_):
+            return 0
+
+        output = gb.aggregate(
+            {a: [("copy", f, True),
+                 ("make", f, False),
+                 ("auto", f, None),
+                 ("string", f, StringVariable),
+                 ("number", f, ContinuousVariable)],
+             dvar: [("copy", f, True),
+                    ("make", f, False),
+                    ("auto", f, None),
+                    ("string", f, StringVariable),
+                    ("discrete", f, DiscreteVariable)]}
+        )
+        self.assertIsInstance(output.domain["a - copy"], ContinuousVariable)
+        a.copy.assert_called_once()
+        self.assertEqual(output.domain["a - copy"].attributes, {"foo": "bar"})
+
+        self.assertIsInstance(output.domain["a - make"], ContinuousVariable)
+        a.make.assert_called_once()
+        self.assertNotEqual(output.domain["a - make"].attributes, {"foo": "bar"})
+
+        self.assertIsInstance(output.domain["a - auto"], ContinuousVariable)
+        self.assertNotEqual(output.domain["a - auto"].attributes, {"foo": "bar"})
+
+        self.assertIsInstance(output.domain["a - string"], StringVariable)
+
+        self.assertIsInstance(output.domain["a - number"], ContinuousVariable)
+        self.assertNotEqual(output.domain["a - number"].attributes, {"foo": "bar"})
+
+        self.assertIsInstance(output.domain["dvar - copy"], DiscreteVariable)
+        self.assertEqual(output.domain["dvar - copy"].attributes, {"foo": "baz"})
+
+        self.assertIsInstance(output.domain["dvar - make"], DiscreteVariable)
+        self.assertNotEqual(output.domain["dvar - make"].attributes, {"foo": "baz"})
+
+        # f returns 0, so the column looks numeric! Let's test that it is
+        # converted to numeric.
+        self.assertIsInstance(output.domain["dvar - auto"], ContinuousVariable)
+
+        self.assertIsInstance(output.domain["dvar - string"], StringVariable)
+
+        self.assertIsInstance(output.domain["dvar - discrete"], DiscreteVariable)
+        self.assertNotEqual(output.domain["dvar - discrete"].attributes, {"foo": "baz"})
+
 
 if __name__ == "__main__":
     unittest.main()
@@ -56,6 +56,24 @@ def test_table_from_frame(self):
         self.assertEqual(names, ['0', '1', '2'])
         self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])
 
+        # Specify (some) variables
+        dvar = DiscreteVariable('x', values=tuple("dacb"))
+        cvar = ContinuousVariable('y')
+        table = table_from_frame(df, variables=[dvar, cvar, None])
+        self.assertIs(table.domain[0], dvar)
+        self.assertIs(table.domain[1], cvar)
+        self.assertIsInstance(table.domain[2], TimeVariable)
+
+        table = table_from_frame(df,
+                                 variables=[None, None, None],
+                                 force_nominal=True)
+        self.assertIsInstance(table.domain[0], DiscreteVariable)
+        self.assertIsInstance(table.domain[1], ContinuousVariable)
+        self.assertIsInstance(table.domain[2], TimeVariable)
+
+        self.assertRaises(AssertionError,
+                          table_from_frame, df, variables=[None, None])
+
         # Include index
         df.index = list('abaa')
         table = table_from_frame(df)