feat: added custom date format option (#13)

kiran94 · web-flow · commit 55dee920e1ef · 2021-04-06T22:47:14.000+01:00
* added custom date time field handling

* bumped version

* implemented vertical custom date formats

* added documentation on date_fields

* corrected docs
diff --git a/README.md b/README.md
@@ -336,6 +336,8 @@ These options can be placed on the root of the config or passed as `kwargs` dire
 - `read_csv_options`
   - Applied to the [`pd.read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) call when a file is passed to a transform
   - For example if the vendor file was tab separated then this could be `{'sep': '\t'}`
+- `date_fields`
+  - Apply datetime options to a field. This option can be useful when the input file has a date column with an unusual format. For each field, its options object is passed as keyword arguments into [`pd.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html). For example, if you had a column called `dob` then you could set this object to `{ "dob": {"format": "%Y-%m-%d"} }`. All the [standard](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) format codes are supported.
 
 ## Samples
 
diff --git a/dgraphpandas/__init__.py b/dgraphpandas/__init__.py
@@ -1,2 +1,2 @@
 __name__ = 'dgraphpandas'
-__version__ = '0.0.8'
+__version__ = '0.0.9'
diff --git a/dgraphpandas/strategies/horizontal.py b/dgraphpandas/strategies/horizontal.py
@@ -27,6 +27,7 @@ def horizontal_transform(
     file_config: Dict[str, Any] = config['files'][config_file_key]
     type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
     subject_fields: Union[List[str], Callable[..., List[str]]] = get_from_config('subject_fields', file_config, **(kwargs))
+    date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))
 
     if not subject_fields:
         raise ValueError('subject_fields')
@@ -43,6 +44,19 @@ def horizontal_transform(
             The frame columns are {frame.columns}
         ''')
 
+    '''
+    Date Fields get special treatment as they can be represented in many different ways
+    from different sources. Therefore if the column has been defined in date_fields
+    then apply those options to that column.
+    '''
+    for col, date_format in date_fields.items():
+        date_format = date_fields[col]
+        logger.debug(f'Converting {col} to datetime: {date_format}')
+        frame[col] = pd.to_datetime(frame[col], **(date_format))
+        if col not in type_overrides:
+            logger.debug(f'Ensuring {col} has datetime64 type')
+            type_overrides[col] = 'datetime64'
+
     '''
     Ensure that object values have the correct type according to type_overrides.
     For example, when pandas reads a csv and detects a numerical value it may decide to
diff --git a/dgraphpandas/strategies/vertical.py b/dgraphpandas/strategies/vertical.py
@@ -56,6 +56,7 @@ def vertical_transform(
     override_edge_name: Dict[str, Any] = get_from_config('override_edge_name', file_config, {}, **(kwargs))
     pre_rename: Dict[str, str] = get_from_config('pre_rename', file_config, {}, **(kwargs))
     type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
+    date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))
 
     potential_callables = _resolve_potential_callables(frame, {
         'subject_fields': subject_fields,
@@ -84,7 +85,7 @@ def vertical_transform(
     intrinsic = _apply_rdf_types(intrinsic, type_overrides)
     edges['type'] = None
 
-    intrinsic = _format_date_fields(intrinsic)
+    intrinsic = _format_date_fields(intrinsic, date_fields)
     intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters, 'subject')
     intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters_intrinsic_object, 'object')
     edges = _remove_illegal_rdf_characters(edges, illegal_characters, 'subject')
diff --git a/dgraphpandas/strategies/vertical_helpers.py b/dgraphpandas/strategies/vertical_helpers.py
@@ -134,14 +134,22 @@ def _apply_rdf_types(frame: pd.DataFrame, types: Dict[str, str]):
     return frame
 
 
-def _format_date_fields(frame: pd.DataFrame) -> pd.DataFrame:
+def _format_date_fields(frame: pd.DataFrame, date_formats: Dict[str, str] = {}) -> pd.DataFrame:
     '''
     Ensure that DateTime fields are formatted in ISO format
     And any fields are which NaT are filtered out.
     '''
     if frame is None:
         raise ValueError('frame')
 
+    if date_formats:
+        logger.debug(f'Applying date_formats {date_formats}')
+        for col, format in date_formats.items():
+            logger.debug(f'Applying {format} to {col}')
+            mask = frame['predicate'] == col
+            frame.loc[mask, 'object'] = pd.to_datetime(frame.loc[mask, 'object'], **(format))
+            frame.loc[mask, 'type'] = '<xs:dateTime>'
+
     logger.debug('Ensuring Date Time fields are in ISO format')
     intrinsic_with_datetime = frame.loc[frame['type'] == '<xs:dateTime>']
     frame = frame.loc[frame['type'] != '<xs:dateTime>']
diff --git a/tests/strategies/test_horizontal.py b/tests/strategies/test_horizontal.py
@@ -306,3 +306,221 @@ def test_horizontal_melted_file_path_custom_csv_passed(self, mock_pandas: Mock,
         assert_frame_equal(expected_melted, args[0])
         self.assertEqual(config, args[1])
         self.assertEqual(config_file_key, args[2])
+
+    @parameterized.expand([
+        ###
+        (
+            'year_wrong_order',
+            {'dob': {'format': "%Y-%m-%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['03-02-2021', '01-03-1945'],
+                'weight': [50, 32]
+            })
+        ),
+        ###
+        (
+            'alphanumerical_string',
+            {'dob': {'format': "%Y-%m-%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['not a date', '01-03-1945'],
+                'weight': [50, 32]
+            })
+        ),
+        ###
+        (
+            'missing_dashes',
+            {'dob': {'format': "%Y-%m%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['2021-03-02', '19450301'],
+                'weight': [50, 32]
+            })
+        ),
+        ###
+        (
+            'missing_dots',
+            {'dob': {'format': "%Y.%m.%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['2021-03-02', '1945.03&01'],
+                'weight': [50, 32]
+            })
+        ),
+        ###
+        (
+            'malformed_month_string',
+            {'dob': {'format': "%d-%b-%Y"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['02-FebFake-2021', '01-Mar-1945'],
+                'weight': [50, 32]
+            })
+        )
+    ])
+    @patch('dgraphpandas.strategies.horizontal.vertical_transform')
+    def test_horizontal_transform_incorrect_date_format(self, name, date_format, frame, transform_mock: Mock):
+        '''
+        Ensures when the date format provided does not match the value within the frame,
+        then an error is raised.
+        '''
+        config_file_key = 'customer'
+        config = {
+            'files': {
+                config_file_key: {
+                    'subject_fields': ['customer_id'],
+                    'date_fields': date_format
+                }
+            }
+        }
+
+        with self.assertRaisesRegex(ValueError, "time data (.*) (doesn't|does not) match format(.*)"):
+            horizontal_transform(frame, config, config_file_key)
+        transform_mock.assert_not_called()
+
+    @parameterized.expand([
+        ###
+        (
+            'uncoverted_month_day',
+            {'dob': {'format': "%Y"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['2021-03-02', '1945-03-01'],
+                'weight': [50, 32]
+            })
+        ),
+        ###
+        (
+            'uncoverted_month_year',
+            {'dob': {'format': "%m-%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['03-02-2021', '03-01-2021'],
+                'weight': [50, 32]
+            })
+        )
+    ])
+    @patch('dgraphpandas.strategies.horizontal.vertical_transform')
+    def test_horizontal_transform_unconverted_date_parts(self, name, date_format, frame, transform_mock: Mock):
+        '''
+        Ensures when the date partially matches and there are some unconverted
+        parts, an error is raised
+        '''
+        config_file_key = 'customer'
+        config = {
+            'files': {
+                config_file_key: {
+                    'subject_fields': ['customer_id'],
+                    'date_fields': date_format
+                }
+            }
+        }
+
+        with self.assertRaisesRegex(ValueError, "unconverted data remains: (.*)"):
+            horizontal_transform(frame, config, config_file_key)
+        transform_mock.assert_not_called()
+
+    @parameterized.expand([
+        ###
+        (
+            'dash_format',
+            {'dob': {'format': "%Y-%m-%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['2021-03-02', '1945-03-01'],
+                'weight': [50, 32]
+            }),
+            pd.DataFrame(data={
+                'customer_id': [1, 2, 1, 2],
+                'predicate': ['dob', 'dob', 'weight', 'weight'],
+                'object':[pd.to_datetime('2021-03-02 00:00:00'), pd.to_datetime('1945-03-01 00:00:00'), 50, 32]
+            })
+        ),
+        ###
+        (
+            'dot_format',
+            {'dob': {'format': "%Y.%m.%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['1999.05.09', '1789.02.12'],
+                'weight': [50, 32]
+            }),
+            pd.DataFrame(data={
+                'customer_id': [1, 2, 1, 2],
+                'predicate': ['dob', 'dob', 'weight', 'weight'],
+                'object': [pd.to_datetime('1999-05-09 00:00:00'), pd.to_datetime('1789-02-12 00:00:00'), 50, 32]
+            })
+        ),
+        ###
+        (
+            'multiple_date_fields',
+            {'updated_at': {'format': '%Y.%m.%d'}, 'dob': {'format': "%Y.%m.%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['1999.05.09', '1789.02.12'],
+                'updated_at': ['2021.03.02', '2021.03.04'],
+                'weight': [50, 32]
+            }),
+            pd.DataFrame(data={
+                'customer_id': [1, 2, 1, 2, 1, 2],
+                'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
+                'object': [
+                    pd.to_datetime('1999-05-09 00:00:00'),
+                    pd.to_datetime('1789-02-12 00:00:00'),
+                    pd.to_datetime('2021-03-02 00:00:00'),
+                    pd.to_datetime('2021-03-04 00:00:00'),
+                    50,
+                    32]
+            })
+        ),
+        ###
+        (
+            'multiple_date_fields_different_formats',
+            {'updated_at': {'format': '%Y$%m$%d'}, 'dob': {'format': "%Y.%m.%d"}},
+            pd.DataFrame(data={
+                'customer_id': [1, 2],
+                'dob': ['1999.05.09', '1789.02.12'],
+                'updated_at': ['2021$03$02', '2021$03$04'],
+                'weight': [50, 32]
+            }),
+            pd.DataFrame(data={
+                'customer_id': [1, 2, 1, 2, 1, 2],
+                'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
+                'object': [
+                    pd.to_datetime('1999-05-09 00:00:00'),
+                    pd.to_datetime('1789-02-12 00:00:00'),
+                    pd.to_datetime('2021-03-02 00:00:00'),
+                    pd.to_datetime('2021-03-04 00:00:00'),
+                    50,
+                    32]
+            })
+        )
+    ])
+    @patch('dgraphpandas.strategies.horizontal.vertical_transform')
+    def test_horizontal_transform_correct_date_format(self, name, date_format, frame, expected_melted, transform_mock: Mock):
+        '''
+        Ensures when the date_format provided is in the correct format,
+        no error is raised
+        '''
+        config_file_key = 'customer'
+        config = {
+            'files': {
+                config_file_key: {
+                    'subject_fields': ['customer_id'],
+                    'date_fields': date_format
+                }
+            }
+        }
+
+        horizontal_transform(frame, config, config_file_key)
+
+        transform_mock.assert_called_once()
+        args, kwargs = transform_mock.call_args_list[0]
+
+        passed_frame, passed_config, passed_config_key = args
+
+        assert_frame_equal(passed_frame, expected_melted)
+        self.assertEqual(passed_config, config)
+        self.assertEqual(passed_config_key, config_file_key)
+        self.assertEqual(kwargs, {})
diff --git a/tests/strategies/test_vertical.py b/tests/strategies/test_vertical.py
@@ -669,6 +669,39 @@ def test_vertical_transform_csv_file(self, mock_pandas: Mock):
             {}
         ),
         ###
+        (
+            'with datetime_formats',
+            'customer',
+            {
+                'files': {
+                    'customer': {
+                        'subject_fields': ['customer_id'],
+                        'edge_fields': ['location_id'],
+                        'date_fields': {'dob': {'format': '%Y %b %d'}}
+                    },
+                },
+                'add_dgraph_type_records': False,
+            },
+            pd.DataFrame(data={
+                'customer_id': [1, 2, 3, 1, 2],
+                'predicate': ['dob', 'weight', 'orders', 'location_id', 'location_id'],
+                'object': ['2021 Mar 13', 90, '1', 'loc45', 'loc64']
+            }),
+            pd.DataFrame(data={
+                'subject': ['customer_2', 'customer_3', 'customer_1'],
+                'predicate': ['weight', 'orders', 'dob'],
+                'object': [90, '1', '2021-03-13T00:00:00'],
+                'type': ['<xs:string>']*2 + ['<xs:dateTime>']
+            }),
+            pd.DataFrame(data={
+                'subject': ['customer_1', 'customer_2'],
+                'predicate': ['location', 'location'],
+                'object': ['location_loc45', 'location_loc64'],
+                'type': [None]*2
+            }),
+            {}
+        ),
+        ###
         (
             'illegal_characters',
             'customer',
diff --git a/tests/strategies/test_vertical_helpers.py b/tests/strategies/test_vertical_helpers.py
@@ -500,6 +500,32 @@ def test_format_date_fields_date_fields_exist_but_not_datetime(self):
         with self.assertRaises(AttributeError):
             _format_date_fields(frame)
 
+    def test_format_date_fields_formats_provided(self):
+        '''
+        Ensures when a date field is provided, the object is converted
+        into ISO format
+        '''
+        frame = pd.DataFrame(data={
+            'subject': ['customer_1', 'customer_1', 'customer_1'],
+            'predicate':  ['hair_colour', 'dob', 'weight'],
+            'object':  ['black', '2021 Jan 21', '50'],
+            'type':  ['<xs:string>', '<xs:dateTime>', '<xs:int>']
+        })
+
+        date_fields = {
+            'dob': {"format": "%Y %b %d"}
+        }
+
+        result = _format_date_fields(frame, date_fields)
+        expected_frame = pd.DataFrame(data={
+            'subject': ['customer_1', 'customer_1', 'customer_1'],
+            'predicate': ['hair_colour', 'weight', 'dob'],
+            'object': ['black', '50', '2021-01-21T00:00:00'],
+            'type': ['<xs:string>', '<xs:int>', '<xs:dateTime>']
+        })
+
+        assert_frame_equal(result.reset_index(drop=True), expected_frame.reset_index(drop=True))
+
     def test_compile_illegal_characters_regex_nonecharacters(self):
         '''
         Ensure when none characters are passed, then none

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`__name__ = 'dgraphpandas'`
`2`		`-__version__ = '0.0.8'`
	`2`	`+__version__ = '0.0.9'`