From 26f65e7ed97902872ec87ce45312988f4795a2a8 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Wed, 4 Dec 2024 22:26:18 -0800 Subject: [PATCH 1/4] Handle military clock time (0800) in time standardizer. --- .../unit/transforms/test_standardizer.py | 34 +++++++++++++++++ .../sycamore/transforms/standardizer.py | 37 ++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py index 32da2332d..c246f11f9 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py @@ -5,6 +5,7 @@ DateTimeStandardizer, ignore_errors, ) +import pytest import unittest from datetime import date, datetime @@ -266,3 +267,36 @@ def test_ignore_errors_key_missing(self): key_path = ["nonExistentKey"] expected_output = {"event": {"coolKey": ""}} self.assertEqual(ignore_errors(doc, standardizer, key_path), expected_output) + + +@pytest.mark.parametrize( + "raw, want", + [ + ("March 17, 2023, 14.25 Local", "2023-03-17 14:25:00"), + ("March 17, 2023, 14.25", "2023-03-17 14:25:00"), + ("March 17, 2023 14:25:00", "2023-03-17 14:25:00"), + ("17 March 2023 14:25", "2023-03-17 14:25:00"), + ("2023-07-15 10.30.00", "2023-07-15 10:30:00"), + ("15/07/2023 10.30.00", "2023-07-15 10:30:00"), + ("2023-07-15 10.30.00 Local", "2023-07-15 10:30:00"), + ("2023-07-15 10.30.00PDT", "2023-07-15 10:30:00-07:00"), + ("2024/6/01 23:59:59 PDT", "2024-06-01 23:59:59-07:00"), + ("2024/12/04 15:25:39 PST", "2024-12-04 15:25:39-08:00"), + ("03/02/1995 0815 CST", "1995-03-02 08:15:00-06:00"), + ("03/02/1995 081500 CST", "1995-03-02 08:15:00-06:00"), + ("3/2/95 0815", "1995-03-02 08:15:00"), + ("1995-03-02 0815 CST", "1995-03-02 08:15:00-06:00"), + ("1995-03-02 081500 CST", "1995-03-02 08:15:00-06:00"), + ("4/30/1970 10:15:00 JST", "1970-04-30 10:15:00+09:00"), + ("1/2/2034 12:13:14 GMT", "2034-01-02 12:13:14+00:00"), + ("2034-01-02T12:13:14+00:00", "2034-01-02 12:13:14+00:00"), + ("April 1, 1999 1259", "fail"), + ], +) +def test_date_fixer(raw, want): + try: + dt = DateTimeStandardizer.fixer(raw) + s = dt.isoformat(sep=" ", timespec="seconds") + except ValueError: + s = "fail" + assert s == want diff --git a/lib/sycamore/sycamore/transforms/standardizer.py b/lib/sycamore/sycamore/transforms/standardizer.py index 780f02376..32c9af975 100644 --- a/lib/sycamore/sycamore/transforms/standardizer.py +++ b/lib/sycamore/sycamore/transforms/standardizer.py @@ -185,6 +185,9 @@ class DateTimeStandardizer(Standardizer): """ DEFAULT_FORMAT = "%B %d, %Y %H:%M:%S%Z" + clock_re = re.compile(r"\d:[0-5]\d") + year_re = re.compile(r"([12]\d\d\d-)|(/[12]\d\d\d)|(\d/[0-3]?\d/\d)") + digitpair_re = re.compile(r"([0-2]\d)([0-5]\d)(\d\d)?") @staticmethod def fixer(raw_dateTime: str) -> datetime: @@ -205,10 +208,11 @@ def fixer(raw_dateTime: str) -> datetime: """ assert raw_dateTime is not None, "raw_dateTime is None" try: - raw_dateTime = raw_dateTime.strip() + raw_dateTime = DateTimeStandardizer.preprocess(raw_dateTime) raw_dateTime = raw_dateTime.replace("Local", "") raw_dateTime = raw_dateTime.replace("local", "") raw_dateTime = raw_dateTime.replace(".", ":") + logging.error(f"FIXME {raw_dateTime}") parsed = dateparser.parse(raw_dateTime) if not parsed: raise ValueError(f"Invalid date format: {raw_dateTime}") @@ -222,6 +226,35 @@ def fixer(raw_dateTime: str) -> datetime: # Handle any other exceptions raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e + @staticmethod + def preprocess(raw: str) -> str: + # Fix up military clock time with just digits (0800) + raw = raw.strip() + tokens = raw.split() + saw_clock = 0 + saw_year = 0 + saw_digits = 0 + for token in tokens: + if DateTimeStandardizer.clock_re.search(token): + saw_clock += 1 + elif DateTimeStandardizer.year_re.search(token): + saw_year += 1 + elif DateTimeStandardizer.digitpair_re.fullmatch(token): + saw_digits += 1 + # If unsure there's exactly one military clock time, bail out. + # Note that numbers like 2024 could be times or years. + if (saw_clock > 0) or (saw_year == 0) or (saw_digits != 1): + return raw + pieces: list[str] = [] + for token in tokens: + if match := DateTimeStandardizer.digitpair_re.fullmatch(token): + clock = ":".join([x for x in match.groups() if x]) + before = token[: match.start(0)] + after = token[match.end(0) :] + token = before + clock + after + pieces.append(token) + return " ".join(pieces) + @staticmethod def standardize( doc: Document, @@ -305,7 +338,7 @@ def ignore_errors(doc: Document, standardizer: Standardizer, key_path: list[str] try: doc = standardizer.standardize(doc, key_path=key_path) except KeyError: - logger.warn(f"Key {key_path} not found in document: {doc}") + logger.warning(f"Key {key_path} not found in document: {doc}") except Exception as e: logger.error(e) return doc From e563ba620bf951426660cab2d6cd9be5ee5a3f60 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Wed, 4 Dec 2024 22:35:55 -0800 Subject: [PATCH 2/4] Add some fail cases. --- .../sycamore/tests/unit/transforms/test_standardizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py index c246f11f9..dc503e64c 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py @@ -290,6 +290,9 @@ def test_ignore_errors_key_missing(self): ("4/30/1970 10:15:00 JST", "1970-04-30 10:15:00+09:00"), ("1/2/2034 12:13:14 GMT", "2034-01-02 12:13:14+00:00"), ("2034-01-02T12:13:14+00:00", "2034-01-02 12:13:14+00:00"), + ("", "fail"), + ("wrongdate", "fail"), + ("2023123-07-15 10.30.00 Local", "fail"), ("April 1, 1999 1259", "fail"), ], ) From 62f2f67a24856e426a5db01f9896c572fb980588 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Wed, 4 Dec 2024 22:51:40 -0800 Subject: [PATCH 3/4] Add AM/PM cases. --- .../sycamore/tests/unit/transforms/test_standardizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py index dc503e64c..615d700dc 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py @@ -275,6 +275,8 @@ def test_ignore_errors_key_missing(self): ("March 17, 2023, 14.25 Local", "2023-03-17 14:25:00"), ("March 17, 2023, 14.25", "2023-03-17 14:25:00"), ("March 17, 2023 14:25:00", "2023-03-17 14:25:00"), + ("March 17, 2023 2:25PM", "2023-03-17 14:25:00"), + ("March 17, 2023 2:25AM", "2023-03-17 02:25:00"), ("17 March 2023 14:25", "2023-03-17 14:25:00"), ("2023-07-15 10.30.00", "2023-07-15 10:30:00"), ("15/07/2023 10.30.00", "2023-07-15 10:30:00"), From 9e77a01b0498e264c91ae466b7d1e796ef41b101 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Thu, 5 Dec 2024 11:12:34 -0800 Subject: [PATCH 4/4] Improvements based on PR comments. --- lib/sycamore/sycamore/transforms/standardizer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/standardizer.py b/lib/sycamore/sycamore/transforms/standardizer.py index 32c9af975..736c36ac7 100644 --- a/lib/sycamore/sycamore/transforms/standardizer.py +++ b/lib/sycamore/sycamore/transforms/standardizer.py @@ -185,6 +185,11 @@ class DateTimeStandardizer(Standardizer): """ DEFAULT_FORMAT = "%B %d, %Y %H:%M:%S%Z" + + # Regexes for military time stuff below. Example matching strings: + # clock: 8:00 12:30 23:59:59 + # year: 1970-04-30 1999-12 12/5/2024 12/2000 4/30/70 + # digitpair: 0800 235959 clock_re = re.compile(r"\d:[0-5]\d") year_re = re.compile(r"([12]\d\d\d-)|(/[12]\d\d\d)|(\d/[0-3]?\d/\d)") digitpair_re = re.compile(r"([0-2]\d)([0-5]\d)(\d\d)?") @@ -208,11 +213,10 @@ def fixer(raw_dateTime: str) -> datetime: """ assert raw_dateTime is not None, "raw_dateTime is None" try: - raw_dateTime = DateTimeStandardizer.preprocess(raw_dateTime) + raw_dateTime = DateTimeStandardizer.fix_military(raw_dateTime) raw_dateTime = raw_dateTime.replace("Local", "") raw_dateTime = raw_dateTime.replace("local", "") raw_dateTime = raw_dateTime.replace(".", ":") - logging.error(f"FIXME {raw_dateTime}") parsed = dateparser.parse(raw_dateTime) if not parsed: raise ValueError(f"Invalid date format: {raw_dateTime}") @@ -227,7 +231,7 @@ def fixer(raw_dateTime: str) -> datetime: raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e @staticmethod - def preprocess(raw: str) -> str: + def fix_military(raw: str) -> str: # Fix up military clock time with just digits (0800) raw = raw.strip() tokens = raw.split()