Skip to content

Commit

Permalink
Handle military clock time (0800) in time standardizer.
Browse files Browse the repository at this point in the history
  • Loading branch information
alexaryn committed Dec 5, 2024
1 parent 1f05347 commit 26f65e7
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 2 deletions.
34 changes: 34 additions & 0 deletions lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
DateTimeStandardizer,
ignore_errors,
)
import pytest
import unittest
from datetime import date, datetime

Expand Down Expand Up @@ -266,3 +267,36 @@ def test_ignore_errors_key_missing(self):
key_path = ["nonExistentKey"]
expected_output = {"event": {"coolKey": ""}}
self.assertEqual(ignore_errors(doc, standardizer, key_path), expected_output)


@pytest.mark.parametrize(
    "raw, want",
    [
        # Month-name dates, with "." or ":" separated clock times.
        ("March 17, 2023, 14.25 Local", "2023-03-17 14:25:00"),
        ("March 17, 2023, 14.25", "2023-03-17 14:25:00"),
        ("March 17, 2023 14:25:00", "2023-03-17 14:25:00"),
        ("17 March 2023 14:25", "2023-03-17 14:25:00"),
        # Numeric dates with dotted clock times, optionally tagged "Local".
        ("2023-07-15 10.30.00", "2023-07-15 10:30:00"),
        ("15/07/2023 10.30.00", "2023-07-15 10:30:00"),
        ("2023-07-15 10.30.00 Local", "2023-07-15 10:30:00"),
        # Timezone abbreviations are expected to surface as UTC offsets.
        ("2023-07-15 10.30.00PDT", "2023-07-15 10:30:00-07:00"),
        ("2024/6/01 23:59:59 PDT", "2024-06-01 23:59:59-07:00"),
        ("2024/12/04 15:25:39 PST", "2024-12-04 15:25:39-08:00"),
        # Digits-only military clock times (HHMM / HHMMSS).
        ("03/02/1995 0815 CST", "1995-03-02 08:15:00-06:00"),
        ("03/02/1995 081500 CST", "1995-03-02 08:15:00-06:00"),
        ("3/2/95 0815", "1995-03-02 08:15:00"),
        ("1995-03-02 0815 CST", "1995-03-02 08:15:00-06:00"),
        ("1995-03-02 081500 CST", "1995-03-02 08:15:00-06:00"),
        # Other zones and an ISO-8601 string with an explicit offset.
        ("4/30/1970 10:15:00 JST", "1970-04-30 10:15:00+09:00"),
        ("1/2/2034 12:13:14 GMT", "2034-01-02 12:13:14+00:00"),
        ("2034-01-02T12:13:14+00:00", "2034-01-02 12:13:14+00:00"),
        # Expected to be rejected with ValueError.
        ("April 1, 1999 1259", "fail"),
    ],
)
def test_date_fixer(raw, want):
    """
    Check DateTimeStandardizer.fixer against a table of raw date strings.

    Each case pairs an input string with the expected result rendered via
    ``isoformat(sep=" ", timespec="seconds")``.  The sentinel ``"fail"`` marks
    inputs for which a ``ValueError`` is expected instead of a datetime.
    """
    try:
        # Both the parse and the formatting stay inside the try so that a
        # ValueError from either step maps to the "fail" sentinel.
        got = DateTimeStandardizer.fixer(raw).isoformat(sep=" ", timespec="seconds")
    except ValueError:
        got = "fail"
    assert got == want
37 changes: 35 additions & 2 deletions lib/sycamore/sycamore/transforms/standardizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ class DateTimeStandardizer(Standardizer):
"""

DEFAULT_FORMAT = "%B %d, %Y %H:%M:%S%Z"
clock_re = re.compile(r"\d:[0-5]\d")
year_re = re.compile(r"([12]\d\d\d-)|(/[12]\d\d\d)|(\d/[0-3]?\d/\d)")
digitpair_re = re.compile(r"([0-2]\d)([0-5]\d)(\d\d)?")

@staticmethod
def fixer(raw_dateTime: str) -> datetime:
Expand All @@ -205,10 +208,11 @@ def fixer(raw_dateTime: str) -> datetime:
"""
assert raw_dateTime is not None, "raw_dateTime is None"
try:
raw_dateTime = raw_dateTime.strip()
raw_dateTime = DateTimeStandardizer.preprocess(raw_dateTime)
raw_dateTime = raw_dateTime.replace("Local", "")
raw_dateTime = raw_dateTime.replace("local", "")
raw_dateTime = raw_dateTime.replace(".", ":")
logging.error(f"FIXME {raw_dateTime}")
parsed = dateparser.parse(raw_dateTime)
if not parsed:
raise ValueError(f"Invalid date format: {raw_dateTime}")
Expand All @@ -222,6 +226,35 @@ def fixer(raw_dateTime: str) -> datetime:
# Handle any other exceptions
raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e

@staticmethod
def preprocess(raw: str) -> str:
    """
    Rewrite a digits-only military clock time (e.g. ``0800``) with colons.

    The input is stripped and split on whitespace, and every token is
    classified with the class-level patterns:

    * ``clock_re``     - the token already contains a colon-separated time;
    * ``year_re``      - the token contains a dash- or slash-delimited date;
    * ``digitpair_re`` - the whole token is HHMM or HHMMSS digits.

    The rewrite only happens when no token already looks like a clock time,
    at least one token looks like a date, and exactly one token is a
    digits-only time.  In every other case the stripped input is returned
    unchanged, because a number such as 2024 could be a time or a year.

    Args:
        raw: The raw date/time string.

    Returns:
        The stripped string, with the single digits-only time rewritten as
        ``HH:MM`` or ``HH:MM:SS`` and tokens re-joined by single spaces when
        the rewrite applies; otherwise the stripped string as-is.
    """
    raw = raw.strip()
    words = raw.split()
    patterns = DateTimeStandardizer

    # First matching category wins, mirroring an if/elif classification.
    n_clock = n_year = n_digits = 0
    for word in words:
        if patterns.clock_re.search(word):
            n_clock += 1
        elif patterns.year_re.search(word):
            n_year += 1
        elif patterns.digitpair_re.fullmatch(word):
            n_digits += 1

    # Only proceed when there is exactly one unambiguous military time.
    unambiguous = n_clock == 0 and n_year != 0 and n_digits == 1
    if not unambiguous:
        return raw

    def with_colons(word: str) -> str:
        # A full match spans the entire token, so the colon-joined digit
        # pairs replace the token outright; the optional seconds group is
        # None when absent and is skipped.
        hit = patterns.digitpair_re.fullmatch(word)
        if hit is None:
            return word
        return ":".join(pair for pair in hit.groups() if pair)

    return " ".join(with_colons(word) for word in words)

@staticmethod
def standardize(
doc: Document,
Expand Down Expand Up @@ -305,7 +338,7 @@ def ignore_errors(doc: Document, standardizer: Standardizer, key_path: list[str]
try:
doc = standardizer.standardize(doc, key_path=key_path)
except KeyError:
logger.warn(f"Key {key_path} not found in document: {doc}")
logger.warning(f"Key {key_path} not found in document: {doc}")
except Exception as e:
logger.error(e)
return doc

0 comments on commit 26f65e7

Please sign in to comment.