Skip to content

Commit

Permalink
feat: add caching for timezone offsets, significantly speeds up import
Browse files Browse the repository at this point in the history
This is different from PR scrapinghub#1181: it builds a cache at install time, which
can be distributed.

closes scrapinghub#533
  • Loading branch information
tobymao committed Feb 5, 2025
1 parent 47acb88 commit 08ce4e2
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 30 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ docs/_build

# Other
raw_data
*.pkl
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
include dateparser_data/dateparser_tz_cache.pkl
include dateparser_data/settings.py
include requirements.txt

Expand Down
42 changes: 12 additions & 30 deletions dateparser/timezone_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pickle
from datetime import datetime, timedelta, timezone, tzinfo
from pathlib import Path

import regex as re

from .timezones import timezone_info_list


class StaticTzInfo(tzinfo):
def __init__(self, name, offset):
Expand Down Expand Up @@ -54,38 +54,20 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset):
return datetime_obj - datetime_tz_offset + local_tz_offset


# Generator yielding one ``(timezone name, info dict)`` pair per timezone/pattern
# combination found in ``timezone_info_list``.
#
# ``search_regex_parts`` is a caller-supplied list that is appended to while the
# generator is consumed, so it is only fully populated once iteration finishes.
#
# NOTE(review): leading indentation is not visible in this view, so the nesting
# level of the "alternate patterns" loop relative to the per-timezone loop
# cannot be confirmed from here -- verify against the real file.
def build_tz_offsets(search_regex_parts):
# Build a single entry: the name is ``tz_obj[0]`` and the offset is
# ``tz_obj[1]`` interpreted as seconds. The pattern template is filled with the
# name via ``%`` formatting, then ``repl`` is substituted by ``replw``; with the
# default empty strings the substitution leaves the pattern unchanged.
def get_offset(tz_obj, regex, repl="", replw=""):
return (
tz_obj[0],
{
"regex": re.compile(
re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
),
"offset": timedelta(seconds=tz_obj[1]),
},
)

for tz_info in timezone_info_list:
for regex in tz_info["regex_patterns"]:
for tz_obj in tz_info["timezones"]:
# Record the raw name fragment, then emit the compiled entry for it.
search_regex_parts.append(tz_obj[0])
yield get_offset(tz_obj, regex)

# alternate patterns
# "replace" is optional; each (pattern, replacement) pair produces an extra
# search fragment and an extra compiled entry with the substitution applied.
for replace, replacewith in tz_info.get("replace", []):
search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)


def get_local_tz_offset():
    """Return the local-time-minus-UTC difference as a ``timedelta``.

    The difference is computed from two consecutive clock reads (naive local
    time, then UTC with its tzinfo stripped), and the seconds component is
    rounded to the nearest ten so the tiny gap between the reads is dropped.
    """
    # Read order matters for the sign of the sub-second drift: local first.
    local_now = datetime.now()
    utc_now_naive = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    raw_delta = local_now - utc_now_naive
    rounded_seconds = round(raw_delta.seconds, -1)
    return timedelta(days=raw_delta.days, seconds=rounded_seconds)


# Module-level lookup tables, built when this module is imported.
# ``_search_regex_parts`` starts empty and is filled by build_tz_offsets while
# the generator is exhausted by ``list(...)`` on the next line.
_search_regex_parts = []
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
# One alternation over every collected fragment, compiled both case-sensitively
# and case-insensitively.
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Local offset from UTC, computed once here rather than on every use.
local_tz_offset = get_local_tz_offset()

# Load the precomputed timezone tables from the pickle cache that lives in the
# ``dateparser_data`` directory beside this module's package directory.
# NOTE(review): unpickling executes whatever the file encodes, so this cache
# must only ever come from the trusted build/packaging step.
with Path(__file__).parent.parent.joinpath(
    "dateparser_data", "dateparser_tz_cache.pkl"
).open(mode="rb") as file:
    _tz_offsets, _search_regex, _search_regex_ignorecase = pickle.load(file)
46 changes: 46 additions & 0 deletions dateparser/timezones.py → dateparser_scripts/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
# As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
# As well as https://github.com/scrapinghub/dateparser/pull/4
# As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets
import pickle
import re
import sys
from datetime import timedelta
from pathlib import Path

timezone_info_list = [
{
Expand Down Expand Up @@ -467,3 +472,44 @@
],
},
]


# Generator yielding one ``(timezone name, info dict)`` pair per timezone/pattern
# combination found in ``timezone_info_list``.
#
# ``search_regex_parts`` is a caller-supplied list that is appended to while the
# generator is consumed, so it is only fully populated once iteration finishes.
#
# NOTE(review): leading indentation is not visible in this view, so the nesting
# level of the "alternate patterns" loop relative to the per-timezone loop
# cannot be confirmed from here -- verify against the real file.
def build_tz_offsets(search_regex_parts):
# Build a single entry: the name is ``tz_obj[0]`` and the offset is
# ``tz_obj[1]`` interpreted as seconds. The pattern template is filled with the
# name via ``%`` formatting, then ``repl`` is substituted by ``replw``; with the
# default empty strings the substitution leaves the pattern unchanged.
def get_offset(tz_obj, regex, repl="", replw=""):
return (
tz_obj[0],
{
"regex": re.compile(
re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
),
"offset": timedelta(seconds=tz_obj[1]),
},
)

for tz_info in timezone_info_list:
for regex in tz_info["regex_patterns"]:
for tz_obj in tz_info["timezones"]:
# Record the raw name fragment, then emit the compiled entry for it.
search_regex_parts.append(tz_obj[0])
yield get_offset(tz_obj, regex)

# alternate patterns
# "replace" is optional; each (pattern, replacement) pair produces an extra
# search fragment and an extra compiled entry with the substitution applied.
for replace, replacewith in tz_info.get("replace", []):
search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)



def main():
    """Build the timezone lookup tables and pickle them into the cache file.

    The cache is a 3-tuple ``(tz_offsets, search_regex,
    search_regex_ignorecase)`` written to
    ``dateparser_data/dateparser_tz_cache.pkl``. The path is resolved from this
    script's location (the directory containing the script's folder) rather
    than the current working directory, so the file always lands in the source
    tree regardless of where the script is invoked from.
    """
    search_regex_parts = []
    # build_tz_offsets fills search_regex_parts lazily while it is iterated,
    # so it has to be exhausted before the fragments are joined.
    tz_offsets = list(build_tz_offsets(search_regex_parts))
    joined_parts = "|".join(search_regex_parts)
    search_regex = re.compile(joined_parts)
    search_regex_ignorecase = re.compile(joined_parts, re.IGNORECASE)

    cache_path = (
        Path(__file__).resolve().parent.parent
        / "dateparser_data"
        / "dateparser_tz_cache.pkl"
    )
    with open(cache_path, mode="wb") as file:
        pickle.dump((tz_offsets, search_regex, search_regex_ignorecase), file)


if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
import re
import subprocess

from setuptools import find_packages, setup
from setuptools.command import develop, install

# Read the package version straight out of dateparser/__init__.py instead of
# importing the package. The context manager closes the file handle, which the
# bare ``open(...).read()`` form left to the garbage collector.
with open("dateparser/__init__.py") as init_file:
    __version__ = re.search(
        r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", init_file.read()
    ).group(1)


class PostDevelop(develop.develop):
    """``develop`` command that generates the timezone cache before running."""

    def run(self):
        """Build the timezone pickle cache, then run the standard develop step.

        Raises:
            subprocess.CalledProcessError: if the cache script exits non-zero.
        """
        import sys  # local import so this block does not depend on file-level imports

        # Run the script with the interpreter executing setup.py. The previous
        # command string "python 3 ..." passed a stray "3" as the script name,
        # so the cache was never generated and the failure was ignored.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
        develop.develop.run(self)


class PostInstall(install.install):
    """``install`` command that generates the timezone cache before running."""

    def run(self):
        """Build the timezone pickle cache, then run the standard install step.

        Raises:
            subprocess.CalledProcessError: if the cache script exits non-zero.
        """
        import sys  # local import so this block does not depend on file-level imports

        # Use the interpreter executing setup.py instead of whatever "python3"
        # resolves to in a shell (it may be a different environment, or missing
        # entirely), and fail loudly rather than ignoring the exit status.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
        install.install.run(self)


introduction = re.sub(
r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
"",
Expand Down Expand Up @@ -45,6 +60,7 @@
"fasttext": ["fasttext"],
"langdetect": ["langdetect"],
},
cmdclass={"develop": PostDevelop, "install": PostInstall},
license="BSD",
zip_safe=False,
keywords="dateparser",
Expand Down

0 comments on commit 08ce4e2

Please sign in to comment.