From 55dee920e1ef3e77e67851f8beae8b3018c73aff Mon Sep 17 00:00:00 2001 From: Kiran Patel <7103956+kiran94@users.noreply.github.com> Date: Tue, 6 Apr 2021 22:47:14 +0100 Subject: [PATCH] feat: added custom date format option (#13) * added custom date time field handling * bumped version * implemented vertical custom date formats * added documentation on date_fields * corrected docs --- README.md | 2 + dgraphpandas/__init__.py | 2 +- dgraphpandas/strategies/horizontal.py | 14 ++ dgraphpandas/strategies/vertical.py | 3 +- dgraphpandas/strategies/vertical_helpers.py | 10 +- tests/strategies/test_horizontal.py | 218 ++++++++++++++++++++ tests/strategies/test_vertical.py | 33 +++ tests/strategies/test_vertical_helpers.py | 26 +++ 8 files changed, 305 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 50014dc..2538e38 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,8 @@ These options can be placed on the root of the config or passed as `kwargs` dire - `read_csv_options` - Applied to the [`pd.read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) call when a file is passed to a transform - For example if the vendor file was tab separated then this could be `{'sep': '\t'}` +- `date_fields` + - Apply datetime options to a field. This option can be useful when the input file has a date column with an unusual format. For each field, this object is passed into [`pd.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html). For example if you had a column called `dob` then you could set this object to `{ "dob": {"format": "%Y-%m-%d"} }`. All the [standard](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) format codes are supported. 
## Samples diff --git a/dgraphpandas/__init__.py b/dgraphpandas/__init__.py index 67b1467..7e06be6 100644 --- a/dgraphpandas/__init__.py +++ b/dgraphpandas/__init__.py @@ -1,2 +1,2 @@ __name__ = 'dgraphpandas' -__version__ = '0.0.8' +__version__ = '0.0.9' diff --git a/dgraphpandas/strategies/horizontal.py b/dgraphpandas/strategies/horizontal.py index 5fce733..b6f53eb 100644 --- a/dgraphpandas/strategies/horizontal.py +++ b/dgraphpandas/strategies/horizontal.py @@ -27,6 +27,7 @@ def horizontal_transform( file_config: Dict[str, Any] = config['files'][config_file_key] type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs)) subject_fields: Union[List[str], Callable[..., List[str]]] = get_from_config('subject_fields', file_config, **(kwargs)) + date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs)) if not subject_fields: raise ValueError('subject_fields') @@ -43,6 +44,19 @@ def horizontal_transform( The frame columns are {frame.columns} ''') + ''' + Date Fields get special treatment as they can be represented in many different ways + from different sources. Therefore if the column has been defined in date_fields + then apply those options to that column. + ''' + for col, date_format in date_fields.items(): + date_format = date_fields[col] + logger.debug(f'Converting {col} to datetime: {date_format}') + frame[col] = pd.to_datetime(frame[col], **(date_format)) + if col not in type_overrides: + logger.debug(f'Ensuring {col} has datetime64 type') + type_overrides[col] = 'datetime64' + ''' Ensure that object values have the correct type according to type_overrides. 
For example, when pandas reads a csv and detects a numerical value it may decide to diff --git a/dgraphpandas/strategies/vertical.py b/dgraphpandas/strategies/vertical.py index 7f01362..8ffa607 100644 --- a/dgraphpandas/strategies/vertical.py +++ b/dgraphpandas/strategies/vertical.py @@ -56,6 +56,7 @@ def vertical_transform( override_edge_name: Dict[str, Any] = get_from_config('override_edge_name', file_config, {}, **(kwargs)) pre_rename: Dict[str, str] = get_from_config('pre_rename', file_config, {}, **(kwargs)) type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs)) + date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs)) potential_callables = _resolve_potential_callables(frame, { 'subject_fields': subject_fields, @@ -84,7 +85,7 @@ def vertical_transform( intrinsic = _apply_rdf_types(intrinsic, type_overrides) edges['type'] = None - intrinsic = _format_date_fields(intrinsic) + intrinsic = _format_date_fields(intrinsic, date_fields) intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters, 'subject') intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters_intrinsic_object, 'object') edges = _remove_illegal_rdf_characters(edges, illegal_characters, 'subject') diff --git a/dgraphpandas/strategies/vertical_helpers.py b/dgraphpandas/strategies/vertical_helpers.py index 951f53d..d157b76 100644 --- a/dgraphpandas/strategies/vertical_helpers.py +++ b/dgraphpandas/strategies/vertical_helpers.py @@ -134,7 +134,7 @@ def _apply_rdf_types(frame: pd.DataFrame, types: Dict[str, str]): return frame -def _format_date_fields(frame: pd.DataFrame) -> pd.DataFrame: +def _format_date_fields(frame: pd.DataFrame, date_formats: Dict[str, str] = {}) -> pd.DataFrame: ''' Ensure that DateTime fields are formatted in ISO format And any fields are which NaT are filtered out. 
@@ -142,6 +142,14 @@ def _format_date_fields(frame: pd.DataFrame) -> pd.DataFrame: if frame is None: raise ValueError('frame') + if date_formats: + logger.debug(f'Applying date_formats {date_formats}') + for col, format in date_formats.items(): + logger.debug(f'Applying {format} to {col}') + mask = frame['predicate'] == col + frame.loc[mask, 'object'] = pd.to_datetime(frame.loc[mask, 'object'], **(format)) + frame.loc[mask, 'type'] = '' + logger.debug('Ensuring Date Time fields are in ISO format') intrinsic_with_datetime = frame.loc[frame['type'] == ''] frame = frame.loc[frame['type'] != ''] diff --git a/tests/strategies/test_horizontal.py b/tests/strategies/test_horizontal.py index c3c9f28..0c6b590 100644 --- a/tests/strategies/test_horizontal.py +++ b/tests/strategies/test_horizontal.py @@ -306,3 +306,221 @@ def test_horizontal_melted_file_path_custom_csv_passed(self, mock_pandas: Mock, assert_frame_equal(expected_melted, args[0]) self.assertEqual(config, args[1]) self.assertEqual(config_file_key, args[2]) + + @parameterized.expand([ + ### + ( + 'year_wrong_order', + {'dob': {'format': "%Y-%m-%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['03-02-2021', '01-03-1945'], + 'weight': [50, 32] + }) + ), + ### + ( + 'alphanumerical_string', + {'dob': {'format': "%Y-%m-%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['not a date', '01-03-1945'], + 'weight': [50, 32] + }) + ), + ### + ( + 'missing_dashes', + {'dob': {'format': "%Y-%m%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['2021-03-02', '19450301'], + 'weight': [50, 32] + }) + ), + ### + ( + 'missing_dots', + {'dob': {'format': "%Y.%m.%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['2021-03-02', '1945.03&01'], + 'weight': [50, 32] + }) + ), + ### + ( + 'malformed_month_string', + {'dob': {'format': "%d-%b-%Y"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['02-FebFake-2021', '01-Mar-1945'], + 'weight': [50, 32] + }) + ) + ]) + 
@patch('dgraphpandas.strategies.horizontal.vertical_transform') + def test_horizontal_transform_incorrect_date_format(self, name, date_format, frame, transform_mock: Mock): + ''' + Ensures when the date format provided does not match the value within the frame, + then an error is raised. + ''' + config_file_key = 'customer' + config = { + 'files': { + config_file_key: { + 'subject_fields': ['customer_id'], + 'date_fields': date_format + } + } + } + + with self.assertRaisesRegex(ValueError, "time data (.*) (doesn't|does not) match format(.*)"): + horizontal_transform(frame, config, config_file_key) + transform_mock.assert_not_called() + + @parameterized.expand([ + ### + ( + 'uncoverted_month_day', + {'dob': {'format': "%Y"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['2021-03-02', '1945-03-01'], + 'weight': [50, 32] + }) + ), + ### + ( + 'uncoverted_month_year', + {'dob': {'format': "%m-%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['03-02-2021', '03-01-2021'], + 'weight': [50, 32] + }) + ) + ]) + @patch('dgraphpandas.strategies.horizontal.vertical_transform') + def test_horizontal_transform_unconverted_date_parts(self, name, date_format, frame, transform_mock: Mock): + ''' + Ensures when the date partially matches and there are some converted + parts, an error is raised + ''' + config_file_key = 'customer' + config = { + 'files': { + config_file_key: { + 'subject_fields': ['customer_id'], + 'date_fields': date_format + } + } + } + + with self.assertRaisesRegex(ValueError, "unconverted data remains: (.*)"): + horizontal_transform(frame, config, config_file_key) + transform_mock.assert_not_called() + + @parameterized.expand([ + ### + ( + 'dash_format', + {'dob': {'format': "%Y-%m-%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['2021-03-02', '1945-03-01'], + 'weight': [50, 32] + }), + pd.DataFrame(data={ + 'customer_id': [1, 2, 1, 2], + 'predicate': ['dob', 'dob', 'weight', 'weight'], + 
'object':[pd.to_datetime('2021-03-02 00:00:00'), pd.to_datetime('1945-03-01 00:00:00'), 50, 32] + }) + ), + ### + ( + 'dot_format', + {'dob': {'format': "%Y.%m.%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['1999.05.09', '1789.02.12'], + 'weight': [50, 32] + }), + pd.DataFrame(data={ + 'customer_id': [1, 2, 1, 2], + 'predicate': ['dob', 'dob', 'weight', 'weight'], + 'object': [pd.to_datetime('1999-05-09 00:00:00'), pd.to_datetime('1789-02-12 00:00:00'), 50, 32] + }) + ), + ### + ( + 'multiple_date_fields', + {'updated_at': {'format': '%Y.%m.%d'}, 'dob': {'format': "%Y.%m.%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['1999.05.09', '1789.02.12'], + 'updated_at': ['2021.03.02', '2021.03.04'], + 'weight': [50, 32] + }), + pd.DataFrame(data={ + 'customer_id': [1, 2, 1, 2, 1, 2], + 'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'], + 'object': [ + pd.to_datetime('1999-05-09 00:00:00'), + pd.to_datetime('1789-02-12 00:00:00'), + pd.to_datetime('2021-03-02 00:00:00'), + pd.to_datetime('2021-03-04 00:00:00'), + 50, + 32] + }) + ), + ### + ( + 'multiple_date_fields_different_formats', + {'updated_at': {'format': '%Y$%m$%d'}, 'dob': {'format': "%Y.%m.%d"}}, + pd.DataFrame(data={ + 'customer_id': [1, 2], + 'dob': ['1999.05.09', '1789.02.12'], + 'updated_at': ['2021$03$02', '2021$03$04'], + 'weight': [50, 32] + }), + pd.DataFrame(data={ + 'customer_id': [1, 2, 1, 2, 1, 2], + 'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'], + 'object': [ + pd.to_datetime('1999-05-09 00:00:00'), + pd.to_datetime('1789-02-12 00:00:00'), + pd.to_datetime('2021-03-02 00:00:00'), + pd.to_datetime('2021-03-04 00:00:00'), + 50, + 32] + }) + ) + ]) + @patch('dgraphpandas.strategies.horizontal.vertical_transform') + def test_horizontal_transform_correct_date_format(self, name, date_format, frame, expected_melted, transform_mock: Mock): + ''' + Ensures when the date_format provided is in the correct format, + no error 
is raised + ''' + config_file_key = 'customer' + config = { + 'files': { + config_file_key: { + 'subject_fields': ['customer_id'], + 'date_fields': date_format + } + } + } + + horizontal_transform(frame, config, config_file_key) + + transform_mock.assert_called_once() + args, kwargs = transform_mock.call_args_list[0] + + passed_frame, passed_config, passed_config_key = args + + assert_frame_equal(passed_frame, expected_melted) + self.assertEqual(passed_config, config) + self.assertEqual(passed_config_key, config_file_key) + self.assertEqual(kwargs, {}) diff --git a/tests/strategies/test_vertical.py b/tests/strategies/test_vertical.py index 5f0584f..715da39 100644 --- a/tests/strategies/test_vertical.py +++ b/tests/strategies/test_vertical.py @@ -669,6 +669,39 @@ def test_vertical_transform_csv_file(self, mock_pandas: Mock): {} ), ### + ( + 'with datetime_formats', + 'customer', + { + 'files': { + 'customer': { + 'subject_fields': ['customer_id'], + 'edge_fields': ['location_id'], + 'date_fields': {'dob': {'format': '%Y %b %d'}} + }, + }, + 'add_dgraph_type_records': False, + }, + pd.DataFrame(data={ + 'customer_id': [1, 2, 3, 1, 2], + 'predicate': ['dob', 'weight', 'orders', 'location_id', 'location_id'], + 'object': ['2021 Mar 13', 90, '1', 'loc45', 'loc64'] + }), + pd.DataFrame(data={ + 'subject': ['customer_2', 'customer_3', 'customer_1'], + 'predicate': ['weight', 'orders', 'dob'], + 'object': [90, '1', '2021-03-13T00:00:00'], + 'type': ['']*2 + [''] + }), + pd.DataFrame(data={ + 'subject': ['customer_1', 'customer_2'], + 'predicate': ['location', 'location'], + 'object': ['location_loc45', 'location_loc64'], + 'type': [None]*2 + }), + {} + ), + ### ( 'illegal_characters', 'customer', diff --git a/tests/strategies/test_vertical_helpers.py b/tests/strategies/test_vertical_helpers.py index 8a41863..c3a5b11 100644 --- a/tests/strategies/test_vertical_helpers.py +++ b/tests/strategies/test_vertical_helpers.py @@ -500,6 +500,32 @@ def 
test_format_date_fields_date_fields_exist_but_not_datetime(self): with self.assertRaises(AttributeError): _format_date_fields(frame) + def test_format_date_fields_formats_provided(self): + ''' + Ensures when a date field is provided, the object is converted + into ISO format + ''' + frame = pd.DataFrame(data={ + 'subject': ['customer_1', 'customer_1', 'customer_1'], + 'predicate': ['hair_colour', 'dob', 'weight'], + 'object': ['black', '2021 Jan 21', '50'], + 'type': ['', '', ''] + }) + + date_fields = { + 'dob': {"format": "%Y %b %d"} + } + + result = _format_date_fields(frame, date_fields) + expected_frame = pd.DataFrame(data={ + 'subject': ['customer_1', 'customer_1', 'customer_1'], + 'predicate': ['hair_colour', 'weight', 'dob'], + 'object': ['black', '50', '2021-01-21T00:00:00'], + 'type': ['', '', ''] + }) + + assert_frame_equal(result.reset_index(drop=True), expected_frame.reset_index(drop=True)) + def test_compile_illegal_characters_regex_nonecharacters(self): ''' Ensure when none characters are passed, then none