Skip to content

Commit

Permalink
feat: add caching for timezone offsets, significantly speeds up import
Browse files Browse the repository at this point in the history
This is different from PR scrapinghub#1181. That PR only makes import faster but
still incurs the cost on first usage. This one leverages an optional
cache.

closes scrapinghub#533
  • Loading branch information
tobymao committed Jan 30, 2025
1 parent 47acb88 commit a5d6b87
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ docs/_build

# Other
raw_data
*.pkl
41 changes: 38 additions & 3 deletions dateparser/timezone_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta, timezone, tzinfo

import os
import regex as re

from .timezones import timezone_info_list
Expand Down Expand Up @@ -85,7 +86,41 @@ def get_local_tz_offset():


# Accumulator for regex alternatives: it is handed to build_tz_offsets() and
# then joined with "|" to form the timezone search patterns.
_search_regex_parts = []
# NOTE(review): this page renders a diff without +/- markers. The three eager
# assignments below and the ``None`` placeholders further down look like the
# removed and added sides of the same change (the header reports 3 deletions)
# -- confirm against the actual file before relying on either form.
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Offset of the local timezone, computed once at import time.
local_tz_offset = get_local_tz_offset()

# Placeholders; they are assigned by _load_offsets() or unpacked from the
# pickle cache further down in this module.
_tz_offsets = None
_search_regex = None
_search_regex_ignorecase = None


def _load_offsets():
    """Build the timezone offset table and both search regexes.

    Rebinds the module globals ``_tz_offsets``, ``_search_regex`` and
    ``_search_regex_ignorecase``. The regexes are an alternation of every
    entry in ``_search_regex_parts``.
    """
    global _tz_offsets, _search_regex, _search_regex_ignorecase

    # The generator must be fully consumed before the parts are joined;
    # presumably build_tz_offsets() fills _search_regex_parts as it yields
    # -- verify against its definition.
    _tz_offsets = [*build_tz_offsets(_search_regex_parts)]

    alternation = "|".join(_search_regex_parts)
    _search_regex = re.compile(alternation)
    _search_regex_ignorecase = re.compile(alternation, re.IGNORECASE)


# Opt-in on-disk cache for the timezone offset table and compiled regexes.
#
# Setting the DATEPARSER_TZ_CACHE environment variable (any value) enables it;
# DATEPARSER_TZ_CACHE_PATH optionally overrides the cache file location, which
# defaults to ".dateparser_tz_cache.pkl" relative to the working directory.
# Without DATEPARSER_TZ_CACHE the data is simply built at import time.
if "DATEPARSER_TZ_CACHE" in os.environ:
    import pickle
    import warnings
    from pathlib import Path

    from dateparser import __version__

    path = Path(os.environ.get("DATEPARSER_TZ_CACHE_PATH", ".dateparser_tz_cache.pkl"))
    reload = True

    # SECURITY: pickle.load() can execute arbitrary code. The cache path comes
    # from the environment, so only point it at a location that untrusted
    # parties cannot write to.
    try:
        with open(path, mode="rb") as file:
            version, _tz_offsets, _search_regex, _search_regex_ignorecase = pickle.load(file)

        # A cache written by another dateparser version is treated as stale.
        if version == __version__:
            reload = False
    except Exception:
        # Deliberate best effort: a missing, truncated, corrupt or
        # incompatible cache file just means the data is rebuilt below.
        pass

    if reload:
        # Build first, so a failure here never leaves a truncated cache file
        # behind and the module is usable even if the cache cannot be written.
        _load_offsets()

        try:
            # Create the parent directory only when a write is actually needed.
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(path, mode="wb") as file:
                pickle.dump([__version__, _tz_offsets, _search_regex, _search_regex_ignorecase], file)
        except (OSError, pickle.PicklingError) as exc:
            # The cache is an optional optimisation: an unwritable location
            # must not make importing the package fail.
            warnings.warn(
                f"dateparser: could not write timezone cache to {path}: {exc}",
                RuntimeWarning,
            )
else:
    _load_offsets()

0 comments on commit a5d6b87

Please sign in to comment.