Skip to content

Commit

Permalink
feat: added custom date format option (#13)
Browse files Browse the repository at this point in the history
* added custom date time field handling

* bumped version

* implemented vertical custom date formats

* added documentation on date_fields

* corrected docs
  • Loading branch information
kiran94 authored Apr 6, 2021
1 parent ca4ff49 commit 55dee92
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 3 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,8 @@ These options can be placed on the root of the config or passed as `kwargs` dire
- `read_csv_options`
- Applied to the [`pd.read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) call when a file is passed to a transform
- For example if the vendor file was tab separated then this could be `{'sep': '\t'}`
- `date_fields`
- Apply datetime options to a field. This option can be useful when the input file has a date column with an unusual format. For each field, the options object for that field is passed as keyword arguments into [`pd.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html). For example, if you had a column called `dob` then you could set this object to `{ "dob": {"format": "%Y-%m-%d"} }`. All the [standard](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) format codes are supported.

## Samples

Expand Down
2 changes: 1 addition & 1 deletion dgraphpandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__name__ = 'dgraphpandas'
__version__ = '0.0.8'
__version__ = '0.0.9'
14 changes: 14 additions & 0 deletions dgraphpandas/strategies/horizontal.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def horizontal_transform(
file_config: Dict[str, Any] = config['files'][config_file_key]
type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
subject_fields: Union[List[str], Callable[..., List[str]]] = get_from_config('subject_fields', file_config, **(kwargs))
date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))

if not subject_fields:
raise ValueError('subject_fields')
Expand All @@ -43,6 +44,19 @@ def horizontal_transform(
The frame columns are {frame.columns}
''')

'''
Date Fields get special treatment as they can be represented in many different ways
from different sources. Therefore if the column has been defined in date_fields
then apply those options to that column.
'''
for col, date_format in date_fields.items():
date_format = date_fields[col]
logger.debug(f'Converting {col} to datetime: {date_format}')
frame[col] = pd.to_datetime(frame[col], **(date_format))
if col not in type_overrides:
logger.debug(f'Ensuring {col} has datetime64 type')
type_overrides[col] = 'datetime64'

'''
Ensure that object values have the correct type according to type_overrides.
For example, when pandas reads a csv and detects a numerical value it may decide to
Expand Down
3 changes: 2 additions & 1 deletion dgraphpandas/strategies/vertical.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def vertical_transform(
override_edge_name: Dict[str, Any] = get_from_config('override_edge_name', file_config, {}, **(kwargs))
pre_rename: Dict[str, str] = get_from_config('pre_rename', file_config, {}, **(kwargs))
type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))

potential_callables = _resolve_potential_callables(frame, {
'subject_fields': subject_fields,
Expand Down Expand Up @@ -84,7 +85,7 @@ def vertical_transform(
intrinsic = _apply_rdf_types(intrinsic, type_overrides)
edges['type'] = None

intrinsic = _format_date_fields(intrinsic)
intrinsic = _format_date_fields(intrinsic, date_fields)
intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters, 'subject')
intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters_intrinsic_object, 'object')
edges = _remove_illegal_rdf_characters(edges, illegal_characters, 'subject')
Expand Down
10 changes: 9 additions & 1 deletion dgraphpandas/strategies/vertical_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,22 @@ def _apply_rdf_types(frame: pd.DataFrame, types: Dict[str, str]):
return frame


def _format_date_fields(frame: pd.DataFrame) -> pd.DataFrame:
def _format_date_fields(frame: pd.DataFrame, date_formats: Dict[str, str] = {}) -> pd.DataFrame:
'''
Ensure that DateTime fields are formatted in ISO format
And any fields are which NaT are filtered out.
'''
if frame is None:
raise ValueError('frame')

if date_formats:
logger.debug(f'Applying date_formats {date_formats}')
for col, format in date_formats.items():
logger.debug(f'Applying {format} to {col}')
mask = frame['predicate'] == col
frame.loc[mask, 'object'] = pd.to_datetime(frame.loc[mask, 'object'], **(format))
frame.loc[mask, 'type'] = '<xs:dateTime>'

logger.debug('Ensuring Date Time fields are in ISO format')
intrinsic_with_datetime = frame.loc[frame['type'] == '<xs:dateTime>']
frame = frame.loc[frame['type'] != '<xs:dateTime>']
Expand Down
218 changes: 218 additions & 0 deletions tests/strategies/test_horizontal.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,221 @@ def test_horizontal_melted_file_path_custom_csv_passed(self, mock_pandas: Mock,
assert_frame_equal(expected_melted, args[0])
self.assertEqual(config, args[1])
self.assertEqual(config_file_key, args[2])

# Each case is (case name, date_fields config, input frame). In every case
# at least one 'dob' value does not have the shape described by the format.
@parameterized.expand([
###
# Values are written with the year last, but the format expects the year first.
(
'year_wrong_order',
{'dob': {'format': "%Y-%m-%d"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['03-02-2021', '01-03-1945'],
'weight': [50, 32]
})
),
###
# First value is free text rather than a date.
(
'alphanumerical_string',
{'dob': {'format': "%Y-%m-%d"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['not a date', '01-03-1945'],
'weight': [50, 32]
})
),
###
# Format contains a single dash; one value has two dashes and the other has none.
(
'missing_dashes',
{'dob': {'format': "%Y-%m%d"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['2021-03-02', '19450301'],
'weight': [50, 32]
})
),
###
# Format is dot separated; the values use dashes or a stray '&' instead.
(
'missing_dots',
{'dob': {'format': "%Y.%m.%d"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['2021-03-02', '1945.03&01'],
'weight': [50, 32]
})
),
###
# 'FebFake' is not a month abbreviation that %b can parse.
(
'malformed_month_string',
{'dob': {'format': "%d-%b-%Y"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['02-FebFake-2021', '01-Mar-1945'],
'weight': [50, 32]
})
)
])
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
def test_horizontal_transform_incorrect_date_format(self, name, date_format, frame, transform_mock: Mock):
'''
Ensures that when the date format configured in date_fields does not match
the values in the frame, horizontal_transform raises a ValueError and the
(patched) vertical_transform is not called.

name is the case label supplied to parameterized.expand and is not used in the body.
'''
config_file_key = 'customer'
config = {
'files': {
config_file_key: {
'subject_fields': ['customer_id'],
'date_fields': date_format
}
}
}

# The pattern accepts both "doesn't match" and "does not match" wording;
# presumably this covers differences between pandas versions - TODO confirm.
with self.assertRaisesRegex(ValueError, "time data (.*) (doesn't|does not) match format(.*)"):
horizontal_transform(frame, config, config_file_key)
# NOTE(review): indentation was lost in this view. If this assertion is nested
# inside the `with` block it is never reached, because the call above raises;
# confirm it is dedented to the method body.
transform_mock.assert_not_called()

# Each case is (case name, date_fields config, input frame). The format only
# describes a leading part of each 'dob' value, so trailing text is left over.
@parameterized.expand([
###
# Format covers the year only; the month and day are left over.
(
'uncoverted_month_day',
{'dob': {'format': "%Y"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['2021-03-02', '1945-03-01'],
'weight': [50, 32]
})
),
###
# Format covers two dash-separated fields; the trailing '-2021' is left over.
(
'uncoverted_month_year',
{'dob': {'format': "%m-%d"}},
pd.DataFrame(data={
'customer_id': [1, 2],
'dob': ['03-02-2021', '03-01-2021'],
'weight': [50, 32]
})
)
])
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
def test_horizontal_transform_unconverted_date_parts(self, name, date_format, frame, transform_mock: Mock):
'''
Ensures that when the date format only partially matches a value, leaving
some unconverted parts, horizontal_transform raises a ValueError and the
(patched) vertical_transform is not called.

name is the case label supplied to parameterized.expand and is not used in the body.
'''
config_file_key = 'customer'
config = {
'files': {
config_file_key: {
'subject_fields': ['customer_id'],
'date_fields': date_format
}
}
}

with self.assertRaisesRegex(ValueError, "unconverted data remains: (.*)"):
horizontal_transform(frame, config, config_file_key)
# NOTE(review): indentation was lost in this view. If this assertion is nested
# inside the `with` block it is never reached, because the call above raises;
# confirm it is dedented to the method body.
transform_mock.assert_not_called()

@parameterized.expand([
    # Single date column, dash separated.
    (
        'dash_format',
        {'dob': {'format': "%Y-%m-%d"}},
        pd.DataFrame(data={
            'customer_id': [1, 2],
            'dob': ['2021-03-02', '1945-03-01'],
            'weight': [50, 32],
        }),
        pd.DataFrame(data={
            'customer_id': [1, 2, 1, 2],
            'predicate': ['dob', 'dob', 'weight', 'weight'],
            'object': [
                pd.to_datetime('2021-03-02 00:00:00'),
                pd.to_datetime('1945-03-01 00:00:00'),
                50,
                32,
            ],
        }),
    ),
    # Single date column, dot separated.
    (
        'dot_format',
        {'dob': {'format': "%Y.%m.%d"}},
        pd.DataFrame(data={
            'customer_id': [1, 2],
            'dob': ['1999.05.09', '1789.02.12'],
            'weight': [50, 32],
        }),
        pd.DataFrame(data={
            'customer_id': [1, 2, 1, 2],
            'predicate': ['dob', 'dob', 'weight', 'weight'],
            'object': [
                pd.to_datetime('1999-05-09 00:00:00'),
                pd.to_datetime('1789-02-12 00:00:00'),
                50,
                32,
            ],
        }),
    ),
    # Two date columns sharing one format.
    (
        'multiple_date_fields',
        {'updated_at': {'format': '%Y.%m.%d'}, 'dob': {'format': "%Y.%m.%d"}},
        pd.DataFrame(data={
            'customer_id': [1, 2],
            'dob': ['1999.05.09', '1789.02.12'],
            'updated_at': ['2021.03.02', '2021.03.04'],
            'weight': [50, 32],
        }),
        pd.DataFrame(data={
            'customer_id': [1, 2, 1, 2, 1, 2],
            'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
            'object': [
                pd.to_datetime('1999-05-09 00:00:00'),
                pd.to_datetime('1789-02-12 00:00:00'),
                pd.to_datetime('2021-03-02 00:00:00'),
                pd.to_datetime('2021-03-04 00:00:00'),
                50,
                32,
            ],
        }),
    ),
    # Two date columns, each with its own format.
    (
        'multiple_date_fields_different_formats',
        {'updated_at': {'format': '%Y$%m$%d'}, 'dob': {'format': "%Y.%m.%d"}},
        pd.DataFrame(data={
            'customer_id': [1, 2],
            'dob': ['1999.05.09', '1789.02.12'],
            'updated_at': ['2021$03$02', '2021$03$04'],
            'weight': [50, 32],
        }),
        pd.DataFrame(data={
            'customer_id': [1, 2, 1, 2, 1, 2],
            'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
            'object': [
                pd.to_datetime('1999-05-09 00:00:00'),
                pd.to_datetime('1789-02-12 00:00:00'),
                pd.to_datetime('2021-03-02 00:00:00'),
                pd.to_datetime('2021-03-04 00:00:00'),
                50,
                32,
            ],
        }),
    ),
])
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
def test_horizontal_transform_correct_date_format(self, name, date_format, frame, expected_melted, transform_mock: Mock):
    '''
    Ensures that when the values match the configured date_fields format,
    horizontal_transform calls vertical_transform exactly once with the
    expected melted frame, the same config, the same file key and no kwargs.
    '''
    file_key = 'customer'
    file_settings = {
        'subject_fields': ['customer_id'],
        'date_fields': date_format
    }
    settings = {'files': {file_key: file_settings}}

    horizontal_transform(frame, settings, file_key)

    transform_mock.assert_called_once()
    # With exactly one recorded call, call_args is that call.
    positional, keyword = transform_mock.call_args
    melted, forwarded_settings, forwarded_key = positional

    assert_frame_equal(melted, expected_melted)
    self.assertEqual(forwarded_settings, settings)
    self.assertEqual(forwarded_key, file_key)
    self.assertEqual(keyword, {})
33 changes: 33 additions & 0 deletions tests/strategies/test_vertical.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,39 @@ def test_vertical_transform_csv_file(self, mock_pandas: Mock):
{}
),
###
(
'with datetime_formats',
'customer',
{
'files': {
'customer': {
'subject_fields': ['customer_id'],
'edge_fields': ['location_id'],
'date_fields': {'dob': {'format': '%Y %b %d'}}
},
},
'add_dgraph_type_records': False,
},
pd.DataFrame(data={
'customer_id': [1, 2, 3, 1, 2],
'predicate': ['dob', 'weight', 'orders', 'location_id', 'location_id'],
'object': ['2021 Mar 13', 90, '1', 'loc45', 'loc64']
}),
pd.DataFrame(data={
'subject': ['customer_2', 'customer_3', 'customer_1'],
'predicate': ['weight', 'orders', 'dob'],
'object': [90, '1', '2021-03-13T00:00:00'],
'type': ['<xs:string>']*2 + ['<xs:dateTime>']
}),
pd.DataFrame(data={
'subject': ['customer_1', 'customer_2'],
'predicate': ['location', 'location'],
'object': ['location_loc45', 'location_loc64'],
'type': [None]*2
}),
{}
),
###
(
'illegal_characters',
'customer',
Expand Down
26 changes: 26 additions & 0 deletions tests/strategies/test_vertical_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,32 @@ def test_format_date_fields_date_fields_exist_but_not_datetime(self):
with self.assertRaises(AttributeError):
_format_date_fields(frame)

def test_format_date_fields_formats_provided(self):
    '''
    Ensures that when a format is supplied for a date predicate, its object
    value comes back as an ISO formatted string.
    '''
    dob_options = {"format": "%Y %b %d"}
    date_fields = {'dob': dob_options}

    frame = pd.DataFrame(data={
        'subject': ['customer_1'] * 3,
        'predicate': ['hair_colour', 'dob', 'weight'],
        'object': ['black', '2021 Jan 21', '50'],
        'type': ['<xs:string>', '<xs:dateTime>', '<xs:int>'],
    })
    # The expected frame lists the dateTime row last.
    expected_frame = pd.DataFrame(data={
        'subject': ['customer_1'] * 3,
        'predicate': ['hair_colour', 'weight', 'dob'],
        'object': ['black', '50', '2021-01-21T00:00:00'],
        'type': ['<xs:string>', '<xs:int>', '<xs:dateTime>'],
    })

    result = _format_date_fields(frame, date_fields)

    # Compare on a fresh positional index so only row order and content matter.
    actual = result.reset_index(drop=True)
    wanted = expected_frame.reset_index(drop=True)
    assert_frame_equal(actual, wanted)

def test_compile_illegal_characters_regex_nonecharacters(self):
'''
Ensure when none characters are passed, then none
Expand Down

0 comments on commit 55dee92

Please sign in to comment.