From f423124eaafb9c5844feaa1b1895e7e000065b5a Mon Sep 17 00:00:00 2001
From: tobymao
Date: Thu, 30 Jan 2025 10:35:14 -0800
Subject: [PATCH] feat: add caching for timezone offsets, significantly speeds
 up import

This is different from PR #1181: it builds a cache at install time which
can be distributed.

The timezone table and build_tz_offsets() move from dateparser/timezones.py
to dateparser_scripts/timezones.py. Running that script pickles the tuple
(tz_offsets, search_regex, search_regex_ignorecase) into
dateparser_data/dateparser_tz_cache.pkl, and dateparser/timezone_parser.py
now loads that pickle at import time instead of compiling the patterns.
setup.py runs the script from its develop and install commands.

Closes #533
---
Review notes (this section is not part of the commit message):
 * setup.py: both hooks now run the generator with sys.executable through
   subprocess.check_call and an argument list. Previously the develop hook
   executed the .py file directly through the shell, the install hook
   hard-coded "python3", and both ignored the exit status, so a failed
   cache build only surfaced later when timezone_parser opened the pickle.
 * setup.py: removed the leftover debug print from PostDevelop.run.
 * dateparser_scripts/timezones.py: fixed the tz_offets -> tz_offsets typo.
 * The number of added lines per hunk is unchanged, so the hunk headers and
   the diffstat below still hold; the post-image blob ids on the "index"
   lines of setup.py and dateparser_scripts/timezones.py predate these
   edits.
 * NOTE(review): the generator compiles with the stdlib "re" module while
   timezone_parser imports "regex as re"; confirm the loaded pattern
   objects behave as the callers expect.
 * NOTE(review): the cache is read with pickle.load, so it must only ever
   be the file produced by dateparser_scripts/timezones.py.

 .gitignore                    |  1 +
 MANIFEST.in                   |  1 +
 dateparser/timezone_parser.py | 42 +++++------------
 .../timezones.py              | 46 +++++++++++++++++++
 setup.py                      | 17 +++++++
 tests/test_timezone_parser.py |  1 +
 6 files changed, 78 insertions(+), 30 deletions(-)
 rename {dateparser => dateparser_scripts}/timezones.py (90%)

diff --git a/.gitignore b/.gitignore
index 669c9f718..7f82e3344 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,4 @@ docs/_build
 
 # Other
 raw_data
+*.pkl
diff --git a/MANIFEST.in b/MANIFEST.in
index ce4030d7c..13f227180 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,7 @@ include CONTRIBUTING.rst
 include HISTORY.rst
 include LICENSE
 include README.rst
+include dateparser_data/dateparser_tz_cache.pkl
 include dateparser_data/settings.py
 include requirements.txt
 
diff --git a/dateparser/timezone_parser.py b/dateparser/timezone_parser.py
index 0f879ddcc..139aa630a 100644
--- a/dateparser/timezone_parser.py
+++ b/dateparser/timezone_parser.py
@@ -1,9 +1,9 @@
+import pickle
 from datetime import datetime, timedelta, timezone, tzinfo
+from pathlib import Path
 
 import regex as re
 
-from .timezones import timezone_info_list
-
 
 class StaticTzInfo(tzinfo):
     def __init__(self, name, offset):
@@ -54,38 +54,20 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset):
     return datetime_obj - datetime_tz_offset + local_tz_offset
 
 
-def build_tz_offsets(search_regex_parts):
-    def get_offset(tz_obj, regex, repl="", replw=""):
-        return (
-            tz_obj[0],
-            {
-                "regex": re.compile(
-                    re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
-                ),
-                "offset": timedelta(seconds=tz_obj[1]),
-            },
-        )
-
-    for tz_info in timezone_info_list:
-        for regex in tz_info["regex_patterns"]:
-            for tz_obj in tz_info["timezones"]:
-                search_regex_parts.append(tz_obj[0])
-                yield get_offset(tz_obj, regex)
-
-                # alternate patterns
-                for replace, replacewith in tz_info.get("replace", []):
-                    search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
-                    yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
-
-
 def get_local_tz_offset():
     offset = datetime.now() - datetime.now(tz=timezone.utc).replace(tzinfo=None)
     offset = timedelta(days=offset.days, seconds=round(offset.seconds, -1))
     return offset
 
 
-_search_regex_parts = []
-_tz_offsets = list(build_tz_offsets(_search_regex_parts))
-_search_regex = re.compile("|".join(_search_regex_parts))
-_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
 local_tz_offset = get_local_tz_offset()
+
+with open(
+    Path(__file__).parent.parent.joinpath("dateparser_data", "dateparser_tz_cache.pkl"),
+    mode="rb",
+) as file:
+    (
+        _tz_offsets,
+        _search_regex,
+        _search_regex_ignorecase,
+    ) = pickle.load(file)
diff --git a/dateparser/timezones.py b/dateparser_scripts/timezones.py
similarity index 90%
rename from dateparser/timezones.py
rename to dateparser_scripts/timezones.py
index 9ac35dafd..8bde6dfd3 100644
--- a/dateparser/timezones.py
+++ b/dateparser_scripts/timezones.py
@@ -2,6 +2,11 @@
 # As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
 # As well as https://github.com/scrapinghub/dateparser/pull/4
 # As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets
+import pickle
+import re
+import sys
+from datetime import timedelta
+from pathlib import Path
 
 timezone_info_list = [
     {
@@ -467,3 +472,44 @@
         ],
     },
 ]
+
+
+def build_tz_offsets(search_regex_parts):
+    def get_offset(tz_obj, regex, repl="", replw=""):
+        return (
+            tz_obj[0],
+            {
+                "regex": re.compile(
+                    re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
+                ),
+                "offset": timedelta(seconds=tz_obj[1]),
+            },
+        )
+
+    for tz_info in timezone_info_list:
+        for regex in tz_info["regex_patterns"]:
+            for tz_obj in tz_info["timezones"]:
+                search_regex_parts.append(tz_obj[0])
+                yield get_offset(tz_obj, regex)
+
+                # alternate patterns
+                for replace, replacewith in tz_info.get("replace", []):
+                    search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
+                    yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
+
+
+
+def main():
+    search_regex_parts = []
+    tz_offsets = list(build_tz_offsets(search_regex_parts))
+    search_regex = re.compile("|".join(search_regex_parts))
+    search_regex_ignorecase = re.compile("|".join(search_regex_parts), re.IGNORECASE)
+
+    with open(Path("dateparser_data", "dateparser_tz_cache.pkl"), mode="wb") as file:
+        pickle.dump(
+            (tz_offsets, search_regex, search_regex_ignorecase),
+            file,
+        )
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index fce81acc4..965d68d90 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,27 @@
 import re
+import subprocess
+import sys
 
 from setuptools import find_packages, setup
+from setuptools.command import develop, install
 
 __version__ = re.search(
     r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", open("dateparser/__init__.py").read()
 ).group(1)
 
+
+class PostDevelop(develop.develop):
+    def run(self):
+        develop.develop.run(self)
+        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
+
+
+class PostInstall(install.install):
+    def run(self):
+        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
+        install.install.run(self)
+
+
 introduction = re.sub(
     r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
     "",
@@ -45,6 +61,7 @@
         "fasttext": ["fasttext"],
         "langdetect": ["langdetect"],
     },
+    cmdclass={"develop": PostDevelop, "install": PostInstall},
     license="BSD",
     zip_safe=False,
     keywords="dateparser",
diff --git a/tests/test_timezone_parser.py b/tests/test_timezone_parser.py
index b8d2d0d46..b5dccc97e 100644
--- a/tests/test_timezone_parser.py
+++ b/tests/test_timezone_parser.py
@@ -1,4 +1,5 @@
 import datetime as dt
+import pickle
 from datetime import datetime, timedelta
 from unittest import SkipTest
 from unittest.mock import Mock, patch