Skip to content

Commit 6b5e2bf

Browse files
raisadz and FBruzzesi authored
Enh: adding automated inferencing of format %Y-%m-%dT%H:%M in pyarrow (#1292)
* add pyarrow time parsing with %H:%M format
* add test for format %H:%M
* add time format mapping
* add start and end characters matching

Co-authored-by: Francesco Bruzzesi <[email protected]>

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
1 parent d5feb6f commit 6b5e2bf

File tree

2 files changed: +56 additions, -11 deletions

narwhals/_arrow/utils.py

+12 -3
Original file line number | Diff line number | Diff line change
@@ -340,7 +340,9 @@ def convert_str_slice_to_int_slice(
340340
# Regex for date, time, separator and timezone components
341341
DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
342342
SEP_RE = r"(?P<sep>\s|T)"
343-
TIME_RE = r"(?P<time>\d{2}:\d{2}:\d{2})" # \s*(?P<period>[AP]M)?)?
343+
TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)" # \s*(?P<period>[AP]M)?)?
344+
HMS_RE = r"^(?P<hms>\d{2}:\d{2}:\d{2})$"
345+
HM_RE = r"^(?P<hm>\d{2}:\d{2})$"
344346
TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})" # Matches 'Z', '+02:00', '+0200', '+02', etc.
345347
FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"
346348

@@ -354,6 +356,10 @@ def convert_str_slice_to_int_slice(
354356
(DMY_RE, "%d-%m-%Y"),
355357
(MDY_RE, "%m-%d-%Y"),
356358
)
359+
TIME_FORMATS = (
360+
(HMS_RE, "%H:%M:%S"),
361+
(HM_RE, "%H:%M"),
362+
)
357363

358364

359365
def parse_datetime_format(arr: pa.StringArray) -> str:
@@ -418,5 +424,8 @@ def _parse_date_format(arr: pa.Array) -> str:
418424
def _parse_time_format(arr: pa.Array) -> str:
419425
import pyarrow.compute as pc # ignore-banned-import
420426

421-
matches = pc.extract_regex(arr, pattern=TIME_RE)
422-
return "%H:%M:%S" if pc.all(matches.is_valid()).as_py() else ""
427+
for time_rgx, time_fmt in TIME_FORMATS:
428+
matches = pc.extract_regex(arr, pattern=time_rgx)
429+
if pc.all(matches.is_valid()).as_py():
430+
return time_fmt
431+
return ""

tests/expr_and_series/str/to_datetime_test.py

+44 -8
Original file line number | Diff line number | Diff line change
@@ -47,11 +47,29 @@ def test_to_datetime_series(constructor_eager: ConstructorEager) -> None:
4747
assert str(result) == expected
4848

4949

50-
def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
50+
@pytest.mark.parametrize(
51+
("data", "expected", "expected_cudf"),
52+
[
53+
(
54+
{"a": ["2020-01-01T12:34:56"]},
55+
"2020-01-01 12:34:56",
56+
"2020-01-01T12:34:56.000000000",
57+
),
58+
(
59+
{"a": ["2020-01-01T12:34"]},
60+
"2020-01-01 12:34:00",
61+
"2020-01-01T12:34:00.000000000",
62+
),
63+
],
64+
)
65+
def test_to_datetime_infer_fmt(
66+
constructor: Constructor,
67+
data: dict[str, list[str]],
68+
expected: str,
69+
expected_cudf: str,
70+
) -> None:
5171
if "cudf" in str(constructor): # pragma: no cover
52-
expected = "2020-01-01T12:34:56.000000000"
53-
else:
54-
expected = "2020-01-01 12:34:56"
72+
expected = expected_cudf
5573

5674
result = (
5775
nw.from_native(constructor(data))
@@ -63,11 +81,29 @@ def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
6381
assert str(result) == expected
6482

6583

66-
def test_to_datetime_series_infer_fmt(constructor_eager: ConstructorEager) -> None:
84+
@pytest.mark.parametrize(
85+
("data", "expected", "expected_cudf"),
86+
[
87+
(
88+
{"a": ["2020-01-01T12:34:56"]},
89+
"2020-01-01 12:34:56",
90+
"2020-01-01T12:34:56.000000000",
91+
),
92+
(
93+
{"a": ["2020-01-01T12:34"]},
94+
"2020-01-01 12:34:00",
95+
"2020-01-01T12:34:00.000000000",
96+
),
97+
],
98+
)
99+
def test_to_datetime_series_infer_fmt(
100+
constructor_eager: ConstructorEager,
101+
data: dict[str, list[str]],
102+
expected: str,
103+
expected_cudf: str,
104+
) -> None:
67105
if "cudf" in str(constructor_eager): # pragma: no cover
68-
expected = "2020-01-01T12:34:56.000000000"
69-
else:
70-
expected = "2020-01-01 12:34:56"
106+
expected = expected_cudf
71107

72108
result = (
73109
nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime()

0 commit comments

Comments
 (0)