feat: add caching for timezone offsets, significantly speeds up import

tobymao · tobymao · commit 08ce4e2cc944 · 2025-02-05T10:55:50.000-08:00
This differs from PR scrapinghub#1181: it builds the cache at install time, so the cache can be distributed with the package. Closes scrapinghub#533.
diff --git a/.gitignore b/.gitignore
@@ -52,3 +52,4 @@ docs/_build
 
 # Other
 raw_data
+*.pkl
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -3,6 +3,7 @@ include CONTRIBUTING.rst
 include HISTORY.rst
 include LICENSE
 include README.rst
+include dateparser_data/dateparser_tz_cache.pkl
 include dateparser_data/settings.py
 include requirements.txt
 
diff --git a/dateparser/timezone_parser.py b/dateparser/timezone_parser.py
@@ -1,9 +1,9 @@
+import pickle
 from datetime import datetime, timedelta, timezone, tzinfo
+from pathlib import Path
 
 import regex as re
 
-from .timezones import timezone_info_list
-
 
 class StaticTzInfo(tzinfo):
     def __init__(self, name, offset):
@@ -54,38 +54,20 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset):
     return datetime_obj - datetime_tz_offset + local_tz_offset
 
 
-def build_tz_offsets(search_regex_parts):
-    def get_offset(tz_obj, regex, repl="", replw=""):
-        return (
-            tz_obj[0],
-            {
-                "regex": re.compile(
-                    re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
-                ),
-                "offset": timedelta(seconds=tz_obj[1]),
-            },
-        )
-
-    for tz_info in timezone_info_list:
-        for regex in tz_info["regex_patterns"]:
-            for tz_obj in tz_info["timezones"]:
-                search_regex_parts.append(tz_obj[0])
-                yield get_offset(tz_obj, regex)
-
-                # alternate patterns
-                for replace, replacewith in tz_info.get("replace", []):
-                    search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
-                    yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
-
-
 def get_local_tz_offset():
     offset = datetime.now() - datetime.now(tz=timezone.utc).replace(tzinfo=None)
     offset = timedelta(days=offset.days, seconds=round(offset.seconds, -1))
     return offset
 
 
-_search_regex_parts = []
-_tz_offsets = list(build_tz_offsets(_search_regex_parts))
-_search_regex = re.compile("|".join(_search_regex_parts))
-_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
 local_tz_offset = get_local_tz_offset()
+
+# Load the precomputed timezone tables instead of rebuilding them on import.
+# The cache is expected at <parent of this package>/dateparser_data/ and is
+# unpickled into the three module-level lookups used for timezone matching.
+# NOTE: pickle.load runs whatever the file encodes, so this file must only
+# ever be produced by the project's own cache-generation script.
+with (
+    Path(__file__).parent.parent / "dateparser_data" / "dateparser_tz_cache.pkl"
+).open(mode="rb") as file:
+    _tz_offsets, _search_regex, _search_regex_ignorecase = pickle.load(file)
diff --git a/dateparser_scripts/timezones.py b/dateparser_scripts/timezones.py
@@ -2,6 +2,11 @@
 # As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
 # As well as https://github.com/scrapinghub/dateparser/pull/4
 # As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets
+import pickle
+import re
+import sys
+from datetime import timedelta
+from pathlib import Path
 
 timezone_info_list = [
     {
@@ -467,3 +472,44 @@
         ],
     },
 ]
+
+
+def build_tz_offsets(search_regex_parts):
+    """Yield ``(name, info)`` pairs for every entry in ``timezone_info_list``.
+
+    ``info`` is a dict holding a case-insensitive compiled ``"regex"`` and
+    the zone's ``"offset"`` as a ``timedelta``. For each group, every regex
+    template is combined with every timezone; each ``"replace"`` rule of the
+    group additionally yields an alternate pattern for the same timezone.
+
+    Side effect: the timezone name (or its rewritten alternate spelling) is
+    appended to ``search_regex_parts`` immediately before the matching pair
+    is yielded, so the list is only complete once the generator is exhausted.
+    """
+
+    def get_offset(tz_obj, regex, repl="", replw=""):
+        # tz_obj is indexed as (name, offset in seconds).
+        name = tz_obj[0]
+        # With the default empty repl/replw the substitution is a no-op.
+        pattern_text = re.sub(repl, replw, regex % name)
+        info = {
+            "regex": re.compile(pattern_text, re.IGNORECASE),
+            "offset": timedelta(seconds=tz_obj[1]),
+        }
+        return name, info
+
+    for group in timezone_info_list:
+        for template in group["regex_patterns"]:
+            for zone in group["timezones"]:
+                search_regex_parts.append(zone[0])
+                yield get_offset(zone, template)
+
+                # Alternate spellings produced by the group's rewrite rules.
+                for old, new in group.get("replace", []):
+                    alternate_name = re.sub(old, new, zone[0])
+                    search_regex_parts.append(alternate_name)
+                    yield get_offset(zone, template, repl=old, replw=new)
+
+
+
+def main():
+    """Build the timezone lookup tables and pickle them to the cache file.
+
+    The pickle holds a 3-tuple: the list of ``(name, info)`` pairs from
+    ``build_tz_offsets``, a case-sensitive alternation regex over all
+    collected names, and the same alternation compiled with ``re.IGNORECASE``.
+
+    The cache is written to ``dateparser_data/dateparser_tz_cache.pkl`` next
+    to this script's parent directory, so the result does not depend on the
+    directory the script is launched from.
+    """
+    search_regex_parts = []
+    # Exhausting the generator is what fills search_regex_parts.
+    tz_offsets = list(build_tz_offsets(search_regex_parts))
+    alternation = "|".join(search_regex_parts)
+    search_regex = re.compile(alternation)
+    search_regex_ignorecase = re.compile(alternation, re.IGNORECASE)
+
+    # Anchor the output to the repository layout rather than the current
+    # working directory; a cwd-relative path silently writes elsewhere (or
+    # fails) when the script is not started from the project root.
+    project_root = Path(__file__).resolve().parent.parent
+    cache_path = project_root / "dateparser_data" / "dateparser_tz_cache.pkl"
+    with open(cache_path, mode="wb") as file:
+        pickle.dump(
+            (tz_offsets, search_regex, search_regex_ignorecase),
+            file,
+        )
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
@@ -1,11 +1,26 @@
 import re
+import subprocess
 
 from setuptools import find_packages, setup
+from setuptools.command import develop, install
 
 __version__ = re.search(
     r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", open("dateparser/__init__.py").read()
 ).group(1)
 
+
+class PostDevelop(develop.develop):
+    """``develop`` command that generates the timezone cache before running."""
+
+    def run(self):
+        """Run the cache-generation script, then the standard develop step.
+
+        Raises:
+            subprocess.CalledProcessError: if the script exits non-zero.
+        """
+        # Local import: this file's import block does not bring in sys.
+        import sys
+
+        # Run the script with the interpreter executing setup.py. A command
+        # string such as "python 3 <script>" hands "3" to the interpreter as
+        # the script path, so the cache would never be built. An argument
+        # list also avoids going through the shell.
+        # check_call aborts on failure instead of silently continuing
+        # without a cache file.
+        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
+        develop.develop.run(self)
+
+
+class PostInstall(install.install):
+    """``install`` command that generates the timezone cache before running."""
+
+    def run(self):
+        """Run the cache-generation script, then the standard install step.
+
+        Raises:
+            subprocess.CalledProcessError: if the script exits non-zero.
+        """
+        # Local import: this file's import block does not bring in sys.
+        import sys
+
+        # Use the interpreter executing setup.py rather than whatever
+        # "python3" resolves to on PATH (which may be missing or may be a
+        # different installation), and pass an argument list so no shell is
+        # involved.
+        # check_call aborts on failure instead of silently continuing
+        # without a cache file.
+        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
+        install.install.run(self)
+
+
 introduction = re.sub(
     r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
     "",
@@ -45,6 +60,7 @@
         "fasttext": ["fasttext"],
         "langdetect": ["langdetect"],
     },
+    cmdclass={"develop": PostDevelop, "install": PostInstall},
     license="BSD",
     zip_safe=False,
     keywords="dateparser",

Original file line number	Diff line number	Diff line change
`@@ -52,3 +52,4 @@ docs/_build`
`52`	`52`
`53`	`53`	`# Other`
`54`	`54`	`raw_data`
	`55`	`+*.pkl`