Skip to content

Commit 55dee92

Browse files
authored
feat: added custom date format option (#13)
* added custom date time field handling * bumped version * implemented vertical custom date formats * added documentation on date_fields * corrected docs
1 parent ca4ff49 commit 55dee92

File tree

8 files changed

+305
-3
lines changed

8 files changed

+305
-3
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,8 @@ These options can be placed on the root of the config or passed as `kwargs` dire
336336
- `read_csv_options`
337337
- Applied to the [`pd.read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) call when a file is passed to a transform
338338
- For example if the vendor file was tab separated then this could be `{'sep': '\t'}`
339+
- `date_fields`
340+
- Apply datetime options to a field. This option can be useful when the input file has a date column with an unusual format. For each field, this object is passed into [`pd.to_datetime`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html). For example if you had a column called `dob` then you could set this object to `{ "dob": {"format": "%Y-%m-%d"} }`. All the [standard](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) format codes are supported.
339341

340342
## Samples
341343

dgraphpandas/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
__name__ = 'dgraphpandas'
2-
__version__ = '0.0.8'
2+
__version__ = '0.0.9'

dgraphpandas/strategies/horizontal.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def horizontal_transform(
2727
file_config: Dict[str, Any] = config['files'][config_file_key]
2828
type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
2929
subject_fields: Union[List[str], Callable[..., List[str]]] = get_from_config('subject_fields', file_config, **(kwargs))
30+
date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))
3031

3132
if not subject_fields:
3233
raise ValueError('subject_fields')
@@ -43,6 +44,19 @@ def horizontal_transform(
4344
The frame columns are {frame.columns}
4445
''')
4546

47+
'''
48+
Date Fields get special treatment as they can be represented in many different ways
49+
from different sources. Therefore if the column has been defined in date_fields
50+
then apply those options to that column.
51+
'''
52+
for col, date_format in date_fields.items():
53+
date_format = date_fields[col]
54+
logger.debug(f'Converting {col} to datetime: {date_format}')
55+
frame[col] = pd.to_datetime(frame[col], **(date_format))
56+
if col not in type_overrides:
57+
logger.debug(f'Ensuring {col} has datetime64 type')
58+
type_overrides[col] = 'datetime64'
59+
4660
'''
4761
Ensure that object values have the correct type according to type_overrides.
4862
For example, when pandas reads a csv and detects a numerical value it may decide to

dgraphpandas/strategies/vertical.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def vertical_transform(
5656
override_edge_name: Dict[str, Any] = get_from_config('override_edge_name', file_config, {}, **(kwargs))
5757
pre_rename: Dict[str, str] = get_from_config('pre_rename', file_config, {}, **(kwargs))
5858
type_overrides: Dict[str, str] = get_from_config('type_overrides', file_config, {}, **(kwargs))
59+
date_fields: Dict[str, str] = get_from_config('date_fields', file_config, {}, **(kwargs))
5960

6061
potential_callables = _resolve_potential_callables(frame, {
6162
'subject_fields': subject_fields,
@@ -84,7 +85,7 @@ def vertical_transform(
8485
intrinsic = _apply_rdf_types(intrinsic, type_overrides)
8586
edges['type'] = None
8687

87-
intrinsic = _format_date_fields(intrinsic)
88+
intrinsic = _format_date_fields(intrinsic, date_fields)
8889
intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters, 'subject')
8990
intrinsic = _remove_illegal_rdf_characters(intrinsic, illegal_characters_intrinsic_object, 'object')
9091
edges = _remove_illegal_rdf_characters(edges, illegal_characters, 'subject')

dgraphpandas/strategies/vertical_helpers.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,22 @@ def _apply_rdf_types(frame: pd.DataFrame, types: Dict[str, str]):
134134
return frame
135135

136136

137-
def _format_date_fields(frame: pd.DataFrame) -> pd.DataFrame:
137+
def _format_date_fields(frame: pd.DataFrame, date_formats: Dict[str, str] = {}) -> pd.DataFrame:
138138
'''
139139
Ensure that DateTime fields are formatted in ISO format
140140
And any fields which are NaT are filtered out.
141141
'''
142142
if frame is None:
143143
raise ValueError('frame')
144144

145+
if date_formats:
146+
logger.debug(f'Applying date_formats {date_formats}')
147+
for col, format in date_formats.items():
148+
logger.debug(f'Applying {format} to {col}')
149+
mask = frame['predicate'] == col
150+
frame.loc[mask, 'object'] = pd.to_datetime(frame.loc[mask, 'object'], **(format))
151+
frame.loc[mask, 'type'] = '<xs:dateTime>'
152+
145153
logger.debug('Ensuring Date Time fields are in ISO format')
146154
intrinsic_with_datetime = frame.loc[frame['type'] == '<xs:dateTime>']
147155
frame = frame.loc[frame['type'] != '<xs:dateTime>']

tests/strategies/test_horizontal.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,3 +306,221 @@ def test_horizontal_melted_file_path_custom_csv_passed(self, mock_pandas: Mock,
306306
assert_frame_equal(expected_melted, args[0])
307307
self.assertEqual(config, args[1])
308308
self.assertEqual(config_file_key, args[2])
309+
310+
@parameterized.expand([
311+
###
312+
(
313+
'year_wrong_order',
314+
{'dob': {'format': "%Y-%m-%d"}},
315+
pd.DataFrame(data={
316+
'customer_id': [1, 2],
317+
'dob': ['03-02-2021', '01-03-1945'],
318+
'weight': [50, 32]
319+
})
320+
),
321+
###
322+
(
323+
'alphanumerical_string',
324+
{'dob': {'format': "%Y-%m-%d"}},
325+
pd.DataFrame(data={
326+
'customer_id': [1, 2],
327+
'dob': ['not a date', '01-03-1945'],
328+
'weight': [50, 32]
329+
})
330+
),
331+
###
332+
(
333+
'missing_dashes',
334+
{'dob': {'format': "%Y-%m%d"}},
335+
pd.DataFrame(data={
336+
'customer_id': [1, 2],
337+
'dob': ['2021-03-02', '19450301'],
338+
'weight': [50, 32]
339+
})
340+
),
341+
###
342+
(
343+
'missing_dots',
344+
{'dob': {'format': "%Y.%m.%d"}},
345+
pd.DataFrame(data={
346+
'customer_id': [1, 2],
347+
'dob': ['2021-03-02', '1945.03&01'],
348+
'weight': [50, 32]
349+
})
350+
),
351+
###
352+
(
353+
'malformed_month_string',
354+
{'dob': {'format': "%d-%b-%Y"}},
355+
pd.DataFrame(data={
356+
'customer_id': [1, 2],
357+
'dob': ['02-FebFake-2021', '01-Mar-1945'],
358+
'weight': [50, 32]
359+
})
360+
)
361+
])
362+
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
363+
def test_horizontal_transform_incorrect_date_format(self, name, date_format, frame, transform_mock: Mock):
364+
'''
365+
Ensures when the date format provided does not match the value within the frame,
366+
then an error is raised.
367+
'''
368+
config_file_key = 'customer'
369+
config = {
370+
'files': {
371+
config_file_key: {
372+
'subject_fields': ['customer_id'],
373+
'date_fields': date_format
374+
}
375+
}
376+
}
377+
378+
with self.assertRaisesRegex(ValueError, "time data (.*) (doesn't|does not) match format(.*)"):
379+
horizontal_transform(frame, config, config_file_key)
380+
transform_mock.assert_not_called()
381+
382+
@parameterized.expand([
383+
###
384+
(
385+
'uncoverted_month_day',
386+
{'dob': {'format': "%Y"}},
387+
pd.DataFrame(data={
388+
'customer_id': [1, 2],
389+
'dob': ['2021-03-02', '1945-03-01'],
390+
'weight': [50, 32]
391+
})
392+
),
393+
###
394+
(
395+
'uncoverted_month_year',
396+
{'dob': {'format': "%m-%d"}},
397+
pd.DataFrame(data={
398+
'customer_id': [1, 2],
399+
'dob': ['03-02-2021', '03-01-2021'],
400+
'weight': [50, 32]
401+
})
402+
)
403+
])
404+
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
405+
def test_horizontal_transform_unconverted_date_parts(self, name, date_format, frame, transform_mock: Mock):
406+
'''
407+
Ensures when the date partially matches and there are some unconverted
408+
parts, an error is raised
409+
'''
410+
config_file_key = 'customer'
411+
config = {
412+
'files': {
413+
config_file_key: {
414+
'subject_fields': ['customer_id'],
415+
'date_fields': date_format
416+
}
417+
}
418+
}
419+
420+
with self.assertRaisesRegex(ValueError, "unconverted data remains: (.*)"):
421+
horizontal_transform(frame, config, config_file_key)
422+
transform_mock.assert_not_called()
423+
424+
@parameterized.expand([
425+
###
426+
(
427+
'dash_format',
428+
{'dob': {'format': "%Y-%m-%d"}},
429+
pd.DataFrame(data={
430+
'customer_id': [1, 2],
431+
'dob': ['2021-03-02', '1945-03-01'],
432+
'weight': [50, 32]
433+
}),
434+
pd.DataFrame(data={
435+
'customer_id': [1, 2, 1, 2],
436+
'predicate': ['dob', 'dob', 'weight', 'weight'],
437+
'object':[pd.to_datetime('2021-03-02 00:00:00'), pd.to_datetime('1945-03-01 00:00:00'), 50, 32]
438+
})
439+
),
440+
###
441+
(
442+
'dot_format',
443+
{'dob': {'format': "%Y.%m.%d"}},
444+
pd.DataFrame(data={
445+
'customer_id': [1, 2],
446+
'dob': ['1999.05.09', '1789.02.12'],
447+
'weight': [50, 32]
448+
}),
449+
pd.DataFrame(data={
450+
'customer_id': [1, 2, 1, 2],
451+
'predicate': ['dob', 'dob', 'weight', 'weight'],
452+
'object': [pd.to_datetime('1999-05-09 00:00:00'), pd.to_datetime('1789-02-12 00:00:00'), 50, 32]
453+
})
454+
),
455+
###
456+
(
457+
'multiple_date_fields',
458+
{'updated_at': {'format': '%Y.%m.%d'}, 'dob': {'format': "%Y.%m.%d"}},
459+
pd.DataFrame(data={
460+
'customer_id': [1, 2],
461+
'dob': ['1999.05.09', '1789.02.12'],
462+
'updated_at': ['2021.03.02', '2021.03.04'],
463+
'weight': [50, 32]
464+
}),
465+
pd.DataFrame(data={
466+
'customer_id': [1, 2, 1, 2, 1, 2],
467+
'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
468+
'object': [
469+
pd.to_datetime('1999-05-09 00:00:00'),
470+
pd.to_datetime('1789-02-12 00:00:00'),
471+
pd.to_datetime('2021-03-02 00:00:00'),
472+
pd.to_datetime('2021-03-04 00:00:00'),
473+
50,
474+
32]
475+
})
476+
),
477+
###
478+
(
479+
'multiple_date_fields_different_formats',
480+
{'updated_at': {'format': '%Y$%m$%d'}, 'dob': {'format': "%Y.%m.%d"}},
481+
pd.DataFrame(data={
482+
'customer_id': [1, 2],
483+
'dob': ['1999.05.09', '1789.02.12'],
484+
'updated_at': ['2021$03$02', '2021$03$04'],
485+
'weight': [50, 32]
486+
}),
487+
pd.DataFrame(data={
488+
'customer_id': [1, 2, 1, 2, 1, 2],
489+
'predicate': ['dob', 'dob', 'updated_at', 'updated_at', 'weight', 'weight'],
490+
'object': [
491+
pd.to_datetime('1999-05-09 00:00:00'),
492+
pd.to_datetime('1789-02-12 00:00:00'),
493+
pd.to_datetime('2021-03-02 00:00:00'),
494+
pd.to_datetime('2021-03-04 00:00:00'),
495+
50,
496+
32]
497+
})
498+
)
499+
])
500+
@patch('dgraphpandas.strategies.horizontal.vertical_transform')
501+
def test_horizontal_transform_correct_date_format(self, name, date_format, frame, expected_melted, transform_mock: Mock):
502+
'''
503+
Ensures when the date_format provided is in the correct format,
504+
no error is raised
505+
'''
506+
config_file_key = 'customer'
507+
config = {
508+
'files': {
509+
config_file_key: {
510+
'subject_fields': ['customer_id'],
511+
'date_fields': date_format
512+
}
513+
}
514+
}
515+
516+
horizontal_transform(frame, config, config_file_key)
517+
518+
transform_mock.assert_called_once()
519+
args, kwargs = transform_mock.call_args_list[0]
520+
521+
passed_frame, passed_config, passed_config_key = args
522+
523+
assert_frame_equal(passed_frame, expected_melted)
524+
self.assertEqual(passed_config, config)
525+
self.assertEqual(passed_config_key, config_file_key)
526+
self.assertEqual(kwargs, {})

tests/strategies/test_vertical.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,39 @@ def test_vertical_transform_csv_file(self, mock_pandas: Mock):
669669
{}
670670
),
671671
###
672+
(
673+
'with datetime_formats',
674+
'customer',
675+
{
676+
'files': {
677+
'customer': {
678+
'subject_fields': ['customer_id'],
679+
'edge_fields': ['location_id'],
680+
'date_fields': {'dob': {'format': '%Y %b %d'}}
681+
},
682+
},
683+
'add_dgraph_type_records': False,
684+
},
685+
pd.DataFrame(data={
686+
'customer_id': [1, 2, 3, 1, 2],
687+
'predicate': ['dob', 'weight', 'orders', 'location_id', 'location_id'],
688+
'object': ['2021 Mar 13', 90, '1', 'loc45', 'loc64']
689+
}),
690+
pd.DataFrame(data={
691+
'subject': ['customer_2', 'customer_3', 'customer_1'],
692+
'predicate': ['weight', 'orders', 'dob'],
693+
'object': [90, '1', '2021-03-13T00:00:00'],
694+
'type': ['<xs:string>']*2 + ['<xs:dateTime>']
695+
}),
696+
pd.DataFrame(data={
697+
'subject': ['customer_1', 'customer_2'],
698+
'predicate': ['location', 'location'],
699+
'object': ['location_loc45', 'location_loc64'],
700+
'type': [None]*2
701+
}),
702+
{}
703+
),
704+
###
672705
(
673706
'illegal_characters',
674707
'customer',

tests/strategies/test_vertical_helpers.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,32 @@ def test_format_date_fields_date_fields_exist_but_not_datetime(self):
500500
with self.assertRaises(AttributeError):
501501
_format_date_fields(frame)
502502

503+
def test_format_date_fields_formats_provided(self):
504+
'''
505+
Ensures when a date field is provided, the object is converted
506+
into ISO format
507+
'''
508+
frame = pd.DataFrame(data={
509+
'subject': ['customer_1', 'customer_1', 'customer_1'],
510+
'predicate': ['hair_colour', 'dob', 'weight'],
511+
'object': ['black', '2021 Jan 21', '50'],
512+
'type': ['<xs:string>', '<xs:dateTime>', '<xs:int>']
513+
})
514+
515+
date_fields = {
516+
'dob': {"format": "%Y %b %d"}
517+
}
518+
519+
result = _format_date_fields(frame, date_fields)
520+
expected_frame = pd.DataFrame(data={
521+
'subject': ['customer_1', 'customer_1', 'customer_1'],
522+
'predicate': ['hair_colour', 'weight', 'dob'],
523+
'object': ['black', '50', '2021-01-21T00:00:00'],
524+
'type': ['<xs:string>', '<xs:int>', '<xs:dateTime>']
525+
})
526+
527+
assert_frame_equal(result.reset_index(drop=True), expected_frame.reset_index(drop=True))
528+
503529
def test_compile_illegal_characters_regex_nonecharacters(self):
504530
'''
505531
Ensure when none characters are passed, then none

0 commit comments

Comments
 (0)