Skip to content

Commit

Permalink
handle unexpected dates
Browse files: browse the repository at this point in the history
  • Loading branch information
Xavier Medrano authored and Xavier Medrano committed Mar 28, 2024
1 parent f050d27 commit 0db4f31
Showing 1 changed file with 22 additions and 1 deletion.
23 changes: 22 additions & 1 deletion lm20/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import mimetypes
import os
import cgi
import re

import dateutil.parser

Expand Down Expand Up @@ -132,7 +133,27 @@ def process_item(self, item, spider):

time_str = adapter["termDate"]
if time_str:
adapter["termDate"] = dateutil.parser.parse(time_str).date()
try:
adapter["termDate"] = dateutil.parser.parse(time_str).date()
except dateutil.parser._parser.ParserError:
try:
if re.search('[a-zA-Z]', time_str):
# Contains letters, but can't be parsed. ex: "on-going"
adapter["termDate"] = time_str.lower()
elif time_str.isnumeric():
# Unformatted date. ex: MMDDYYYY
formatted_str = f"{time_str[:2]}/{time_str[2:4]}/{time_str[4:]}"
adapter["termDate"] = dateutil.parser.parse(formatted_str).date()
else:
# Incorrectly formatted date. ex: MMDD/YY
formatted_str = f"{time_str[:2]}/{time_str[2:]}"
if formatted_str.count('/') == 2:
adapter["termDate"] = dateutil.parser.parse(formatted_str).date()
else:
raise dateutil.parser._parser.ParserError
except dateutil.parser._parser.ParserError:
print("Could not parse date from string:", time_str)
pass

return item

Expand Down

0 comments on commit 0db4f31

Please sign in to comment.