Skip to content

Commit

Permalink
Fixed problematic date parsing (#888)
Browse files Browse the repository at this point in the history
We had some trouble with datetime strings coming back from the server not parsing correctly, so I added somewhat more flexible parsing.
  • Loading branch information
whitead authored Feb 25, 2025
1 parent 0727ca8 commit 093cbb8
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 2 deletions.
5 changes: 3 additions & 2 deletions paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
encode_id,
format_bibtex,
get_citenames,
maybe_get_date,
)
from paperqa.version import __version__ as pqa_version

Expand Down Expand Up @@ -585,8 +586,8 @@ def populate_bibtex_key_citation( # noqa: PLR0912
"pages": data.get("pages"),
"month": (
None
if not data.get("publication_date")
else data["publication_date"].strftime("%b")
if not (maybe_date := maybe_get_date(data.get("publication_date")))
else maybe_date.strftime("%b")
),
"doi": data.get("doi"),
"url": data.get("doi_url"),
Expand Down
22 changes: 22 additions & 0 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,3 +557,25 @@ def citation_to_docname(citation: str) -> str:
if match is not None:
year = match.group(1)
return f"{author}{year}"


def maybe_get_date(date: str | datetime | None) -> datetime | None:
if not date:
return None
if isinstance(date, str):
# Try common date formats in sequence
formats = [
"%Y-%m-%dT%H:%M:%S%z", # ISO with timezone: 2023-01-31T14:30:00+0000
"%Y-%m-%d %H:%M:%S", # ISO with time: 2023-01-31 14:30:00
"%B %d, %Y", # Full month day, year: January 31, 2023
"%b %d, %Y", # Month day, year: Jan 31, 2023
"%Y-%m-%d", # ISO format: 2023-01-31
]

for fmt in formats:
try:
return datetime.strptime(date, fmt)
except ValueError:
continue
return None
return date
9 changes: 9 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from paperqa.utils import (
extract_score,
get_citenames,
maybe_get_date,
maybe_is_html,
maybe_is_text,
name_in_text,
Expand Down Expand Up @@ -1644,3 +1645,11 @@ def test_fallback_non_json(self, input_text: str) -> None:
)
def test_llm_parse_json_with_escaped_characters(self, input_text, expected_output):
assert llm_parse_json(input_text) == expected_output


def test_maybe_get_date():
    """Check maybe_get_date on each supported format, pass-through, and failures."""
    # ISO date and date-time strings
    assert maybe_get_date("2023-01-01") == datetime(2023, 1, 1)
    assert maybe_get_date("2023-01-31 14:30:00") == datetime(2023, 1, 31, 14, 30)

    # ISO date-time with a UTC offset parses to a timezone-aware datetime
    aware = maybe_get_date("2023-01-31T14:30:00+0000")
    assert aware is not None
    assert aware.tzinfo is not None
    assert aware.replace(tzinfo=None) == datetime(2023, 1, 31, 14, 30)

    # Month-name formats (full and abbreviated)
    assert maybe_get_date("January 31, 2023") == datetime(2023, 1, 31)
    assert maybe_get_date("Jan 31, 2023") == datetime(2023, 1, 31)

    # datetime instances are passed through unchanged
    assert maybe_get_date(datetime(2023, 1, 1)) == datetime(2023, 1, 1)

    # Unparseable or missing input yields None
    assert maybe_get_date("foo") is None
    assert maybe_get_date("") is None
    assert maybe_get_date(None) is None

0 comments on commit 093cbb8

Please sign in to comment.