diff --git a/autonormalize/dfd.py b/autonormalize/dfd.py
index d6a39ba..cf98339 100644
--- a/autonormalize/dfd.py
+++ b/autonormalize/dfd.py
@@ -1,7 +1,7 @@
 from functools import partial
 from itertools import combinations
 
-import numpy
+import pandas as pd
 from tqdm import tqdm
 
 from .classes import DfdDependencies, LHSs, Masks, Node
@@ -359,7 +359,6 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
 
     acc = 0
     for index, row in indicator.iterrows():
-
         mask = None
         for attr in lhs_set:
 
@@ -368,14 +367,18 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
                 if df[attr].dtypes.name == 'datetime64[ns]':
                     m = df[attr] == row[attr]
                 else:
-                    m = df[attr].values == row[attr]
+                    if pd.isna(row[attr]):
+                        m = df[attr].isnull()
+                    else:
+                        m = df[attr].values == row[attr]
                 masks.add_mask(attr, row[attr], m)
             if mask is None:
                 mask = m
             else:
                 mask = mask & m
         options = df[mask]
-        _, unique_counts = numpy.unique(options[rhs].to_numpy(), return_counts=True)
+
+        unique_counts = options[rhs].value_counts()
         acc += unique_counts.sum() - unique_counts.max()
         if acc > limit:
             return False
diff --git a/autonormalize/normalize.py b/autonormalize/normalize.py
index 40f1c26..1002ad6 100644
--- a/autonormalize/normalize.py
+++ b/autonormalize/normalize.py
@@ -76,6 +76,7 @@ def make_indexes(depdf):
     Arguments:
         depdf (DepDF) : depDF to make indexes for
     """
+
    prim_key = depdf.deps.get_prim_key()
 
    if len(prim_key) > 1:
@@ -103,8 +104,9 @@ def make_indexes(depdf):
 
            for index in indices[name]:
                add[index] = new_val
-
-        depdf.parent.df.drop(columns=prim_key, inplace=True)
+        # Don't drop a column if it is needed in another parent relationship
+        to_drop = [key for key in prim_key if key not in depdf.parent.deps.serialize().keys()]
+        depdf.parent.df.drop(columns=to_drop, inplace=True)
        depdf.parent.df.insert(len(depdf.parent.df.columns), '_'.join(prim_key), add)
 
        for child in depdf.children:
diff --git a/autonormalize/tests/test_dfd.py b/autonormalize/tests/test_dfd.py
index 1f1e832..43dc843 100644
--- a/autonormalize/tests/test_dfd.py
+++ b/autonormalize/tests/test_dfd.py
@@ -1,5 +1,6 @@
 import os
 
+import numpy as np
 import pandas as pd
 
 from autonormalize import dfd
@@ -73,21 +74,28 @@ def test_compute_partitions():
     assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.96, mask)
 
 
-# def test_approximate_dependencies():
-#     mask = dfd.Masks(['a', 'b', 'c'])
-#     a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
-#     # b = [int(x%2 == 0) for x in a]
-#     b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
-#     # c = [(a[i] + b[i])<4 for i in range(40)]
-#     c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, 1.00, mask, 0.90)
-#     assert dfd.approximate_dependencies(set([0, 1]), 2, df, .90, mask, 0.90)
-#     c[0] = True
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .97, mask, 0.90)
-#     assert not dfd.approximate_dependencies(set([0, 1]), 2, df, .98, mask, 0.90)
-#     c[35] = False
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .95, mask, 0.90)
-#     assert not dfd.approximate_dependencies([0, 1], 2, df, .96, mask, 0.90)
+def test_approximate_dependencies():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 1.00, mask)
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .90, mask)
+    c[0] = True
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .97, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .98, mask)
+    c[35] = False
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .95, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .96, mask)
+
+
+def test_approximate_dependencies_with_nan():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [np.nan, 2, 3, 7, 8, 1, 0, 2, 0, 3, np.nan, 0, 4, np.nan, 8, 7, np.nan, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, np.nan, 4, np.nan, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [True, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 0.9, mask)
diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index 37a717b..f5f8d4d 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
@@ -178,3 +179,32 @@ def test_make_indexes():
     # Make sure new column names are sorted
     assert 'hemisphere_month' in new_dfs[0].columns
     assert 'hemisphere_month' in new_dfs[1].columns
+
+
+def test_make_indexes_improper_column_drop():
+    df_dict = {'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+               'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
+               'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
+               'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
+               'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
+               'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+               'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
+               'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
+               'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub']}
+    df = pd.DataFrame(df_dict)
+
+    deps = classes.Dependencies({'Id': [['LotArea']],
+                                 'MSSubClass': [['LotArea'], ['LotFrontage', 'Utilities'], ['Id']],
+                                 'MSZoning': [['LotFrontage'], ['LotArea'], ['MSSubClass'], ['Id']],
+                                 'LotFrontage': [['LotArea'], ['Id']], 'LotArea': [['Id']],
+                                 'Alley': [['LotFrontage'], ['LandContour'], ['Utilities'], ['MSSubClass'], ['Id'], ['MSZoning'], ['LotArea'], ['LotShape']],
+                                 'LotShape': [['LotFrontage'], ['MSSubClass', 'Utilities', 'LandContour'], ['LotArea'], ['Id']],
+                                 'LandContour': [['LotFrontage'], ['MSSubClass', 'LotShape'], ['LotArea'], ['Id']],
+                                 'Utilities': [['MSSubClass', 'LotShape'], ['LotArea'], ['MSSubClass', 'LotFrontage'], ['Id']]}, ['id'])
+
+    depdf = normalize.DepDF(deps, df, deps.get_prim_key())
+    normalize.normalize_dataframe(depdf)
+    normalize.make_indexes(depdf)
+    new_dfs = depdf.return_dfs()
+
+    assert 'MSSubClass' in new_dfs[0].columns
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 6f004b9..4b03595 100755
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,15 +3,17 @@
 Release Notes
 -------------
 
-.. Future Release
-   ==============
+Future Release
+==============
     * Enhancements
     * Fixes
+        * Bug fixes in ``auto_entityset`` (:pr:`21`)
     * Changes
     * Documentation Changes
     * Testing Changes
 
-.. Thanks to the following people for contributing to this release:
+    Thanks to the following people for contributing to this release:
+    :user:`thehomebrewnerd`
 
 v2.0.1 Apr 25, 2022
 ===================
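
Note on the pd.isna branch added to approximate_dependencies above: NaN never compares equal to anything, including itself, so a plain equality mask silently drops rows whose left-hand-side value is missing and those rows are never grouped together. The standalone sketch below (not part of the patch; the column name 'a' and the sample values are made up for illustration) shows why the explicit null check is needed.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [np.nan, 2, np.nan]})
    value = np.nan  # stand-in for the cell value pulled from the current row

    # Plain equality never matches NaN, so every row with a missing value is lost.
    print((df['a'].values == value).tolist())   # [False, False, False]

    # Falling back to isnull() when the row value is missing, as the patched
    # branch does, groups the rows sharing the "missing" value together again.
    m = df['a'].isnull() if pd.isna(value) else (df['a'].values == value)
    print(m.tolist())                            # [True, False, True]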