Skip to content

Commit f423124

Browse files
committed
feat: add caching for timezone offsets, significantly speeds up import
This is different from PR scrapinghub#1181: it builds a cache at install time, which can be distributed. Closes scrapinghub#533.
1 parent 47acb88 commit f423124

File tree

6 files changed

+78
-30
lines changed

6 files changed

+78
-30
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,4 @@ docs/_build
5252

5353
# Other
5454
raw_data
55+
*.pkl

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ include CONTRIBUTING.rst
33
include HISTORY.rst
44
include LICENSE
55
include README.rst
6+
include dateparser_data/dateparser_tz_cache.pkl
67
include dateparser_data/settings.py
78
include requirements.txt
89

dateparser/timezone_parser.py

Lines changed: 12 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1+
import pickle
12
from datetime import datetime, timedelta, timezone, tzinfo
3+
from pathlib import Path
24

35
import regex as re
46

5-
from .timezones import timezone_info_list
6-
77

88
class StaticTzInfo(tzinfo):
99
def __init__(self, name, offset):
@@ -54,38 +54,20 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset):
5454
return datetime_obj - datetime_tz_offset + local_tz_offset
5555

5656

57-
# Generator that walks ``timezone_info_list`` and yields one
# ``(name, {"regex": compiled_pattern, "offset": timedelta})`` tuple per
# pattern template / timezone entry, where ``name`` is ``tz_obj[0]`` and the
# offset is ``tz_obj[1]`` interpreted as seconds.
#
# Side effect: every yielded entry also appends its name (or its rewritten
# alternate name) to the caller-supplied ``search_regex_parts`` list. Because
# this is a generator, that list is only filled as the generator is consumed.
#
# In this module ``re`` is the third-party ``regex`` package imported as ``re``.
def build_tz_offsets(search_regex_parts):
58-
# Helper: fill the ``%s`` template ``regex`` with the timezone name, apply the
# optional ``repl`` -> ``replw`` substitution to the resulting pattern text
# (the defaults ``""``/``""`` leave it unchanged), and compile it
# case-insensitively.
def get_offset(tz_obj, regex, repl="", replw=""):
59-
return (
60-
tz_obj[0],
61-
{
62-
"regex": re.compile(
63-
re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
64-
),
65-
"offset": timedelta(seconds=tz_obj[1]),
66-
},
67-
)
68-
69-
for tz_info in timezone_info_list:
70-
for regex in tz_info["regex_patterns"]:
71-
for tz_obj in tz_info["timezones"]:
72-
search_regex_parts.append(tz_obj[0])
73-
yield get_offset(tz_obj, regex)
74-
75-
# Alternate spellings: each ("replace", "replacewith") pair from the optional
# "replace" key is applied both to the name added to ``search_regex_parts`` and
# to the compiled pattern.
# NOTE(review): indentation is not visible in this rendering, so it cannot be
# told here whether this loop runs once per ``tz_obj`` or once per ``regex``
# (reusing the last ``tz_obj``) -- confirm against the real file.
76-
for replace, replacewith in tz_info.get("replace", []):
77-
search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
78-
yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
79-
80-
8157
def get_local_tz_offset():
    """Return the offset of local wall-clock time from UTC as a ``timedelta``.

    The offset is measured by subtracting a naive copy of the current UTC time
    from the current naive local time. The two clock reads are not
    simultaneous, so the seconds component is rounded to the nearest ten and
    the microseconds are dropped.
    """
    local_now = datetime.now()
    utc_now_naive = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    raw_delta = local_now - utc_now_naive
    return timedelta(days=raw_delta.days, seconds=round(raw_delta.seconds, -1))
8561

8662

87-
_search_regex_parts = []
88-
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
89-
_search_regex = re.compile("|".join(_search_regex_parts))
90-
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
9163
local_tz_offset = get_local_tz_offset()
64+
65+
# Load the pre-built timezone lookup data at import time. The cache lives in
# ``dateparser_data`` next to this package's directory and holds a 3-tuple:
# the offsets list, the case-sensitive search pattern and the
# case-insensitive search pattern.
# NOTE(review): ``pickle.load`` can execute arbitrary code, so this file must
# only ever come from a trusted build/installation step.
_tz_cache_path = Path(__file__).parent.parent / "dateparser_data" / "dateparser_tz_cache.pkl"
with _tz_cache_path.open(mode="rb") as file:
    _tz_offsets, _search_regex, _search_regex_ignorecase = pickle.load(file)

dateparser/timezones.py renamed to dateparser_scripts/timezones.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
# As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
33
# As well as https://github.com/scrapinghub/dateparser/pull/4
44
# As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets
5+
import pickle
6+
import re
7+
import sys
8+
from datetime import timedelta
9+
from pathlib import Path
510

611
timezone_info_list = [
712
{
@@ -467,3 +472,44 @@
467472
],
468473
},
469474
]
475+
476+
477+
# Generator that walks ``timezone_info_list`` and yields one
# ``(name, {"regex": compiled_pattern, "offset": timedelta})`` tuple per
# pattern template / timezone entry, where ``name`` is ``tz_obj[0]`` and the
# offset is ``tz_obj[1]`` interpreted as seconds.
#
# Side effect: every yielded entry also appends its name (or its rewritten
# alternate name) to the caller-supplied ``search_regex_parts`` list. Because
# this is a generator, that list is only filled as the generator is consumed.
#
# In this script ``re`` is the standard-library module.
def build_tz_offsets(search_regex_parts):
478+
# Helper: fill the ``%s`` template ``regex`` with the timezone name, apply the
# optional ``repl`` -> ``replw`` substitution to the resulting pattern text
# (the defaults ``""``/``""`` leave it unchanged), and compile it
# case-insensitively.
def get_offset(tz_obj, regex, repl="", replw=""):
479+
return (
480+
tz_obj[0],
481+
{
482+
"regex": re.compile(
483+
re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE
484+
),
485+
"offset": timedelta(seconds=tz_obj[1]),
486+
},
487+
)
488+
489+
for tz_info in timezone_info_list:
490+
for regex in tz_info["regex_patterns"]:
491+
for tz_obj in tz_info["timezones"]:
492+
search_regex_parts.append(tz_obj[0])
493+
yield get_offset(tz_obj, regex)
494+
495+
# Alternate spellings: each ("replace", "replacewith") pair from the optional
# "replace" key is applied both to the name added to ``search_regex_parts`` and
# to the compiled pattern.
# NOTE(review): indentation is not visible in this rendering, so it cannot be
# told here whether this loop runs once per ``tz_obj`` or once per ``regex``
# (reusing the last ``tz_obj``) -- confirm against the real file.
496+
for replace, replacewith in tz_info.get("replace", []):
497+
search_regex_parts.append(re.sub(replace, replacewith, tz_obj[0]))
498+
yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)
499+
500+
501+
502+
def main():
    """Build the timezone lookup data and pickle it into the cache file.

    The pickled payload is the 3-tuple
    ``(tz_offsets, search_regex, search_regex_ignorecase)``:

    * ``tz_offsets`` -- the fully materialised output of ``build_tz_offsets``;
    * ``search_regex`` -- an alternation of every collected timezone name;
    * ``search_regex_ignorecase`` -- the same alternation, case-insensitive.

    The file is written to ``dateparser_data/dateparser_tz_cache.pkl`` under
    the directory that contains this script's own directory, so the result no
    longer depends on the current working directory.

    NOTE(review): standard-library ``re`` patterns are pickled as their
    pattern string plus flags and recompiled when unpickled -- confirm the
    cache actually removes the compilation cost it is meant to avoid.
    """
    search_regex_parts = []
    # Exhaust the generator before joining: it fills ``search_regex_parts``
    # as a side effect while it is being consumed.
    tz_offsets = list(build_tz_offsets(search_regex_parts))
    alternation = "|".join(search_regex_parts)
    search_regex = re.compile(alternation)
    search_regex_ignorecase = re.compile(alternation, re.IGNORECASE)

    # Anchor the output to this script's location instead of the working
    # directory of whoever launched it.
    cache_path = (
        Path(__file__).resolve().parent.parent
        / "dateparser_data"
        / "dateparser_tz_cache.pkl"
    )
    with open(cache_path, mode="wb") as file:
        pickle.dump(
            (tz_offsets, search_regex, search_regex_ignorecase),
            file,
        )
513+
514+
if __name__ == "__main__":
515+
main()

setup.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,27 @@
11
import re
2+
import subprocess
23

34
from setuptools import find_packages, setup
5+
from setuptools.command import develop, install
46

57
# Read the version string out of dateparser/__init__.py as text. The file is
# opened in a ``with`` block so the handle is closed deterministically, and
# with an explicit encoding so the result does not depend on the locale.
with open("dateparser/__init__.py", encoding="utf-8") as _init_file:
    __version__ = re.search(
        r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", _init_file.read()
    ).group(1)
810

11+
12+
class PostDevelop(develop.develop):
    """``develop`` command that also generates the timezone cache.

    Runs the stock ``develop`` command first, then executes
    ``dateparser_scripts/timezones.py`` (path relative to the current working
    directory) to write the pickle cache.
    """

    def run(self):
        """Run ``develop`` and then build the cache.

        Raises:
            subprocess.CalledProcessError: if the generator script exits with
                a non-zero status.
        """
        # Function-scope import keeps this class self-contained.
        import sys

        develop.develop.run(self)
        # Use the interpreter that is running setup.py. Launching the bare
        # script path through a shell only works when the file is executable
        # and has a shebang, and a failure would go unnoticed.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
17+
18+
19+
class PostInstall(install.install):
    """``install`` command that generates the timezone cache beforehand.

    Executes ``dateparser_scripts/timezones.py`` (path relative to the current
    working directory) and then runs the stock ``install`` command.
    """

    def run(self):
        """Build the cache, then run ``install``.

        Raises:
            subprocess.CalledProcessError: if the generator script exits with
                a non-zero status.
        """
        # Function-scope import keeps this class self-contained.
        import sys

        # Use the interpreter that is running setup.py rather than whatever
        # ``python3`` resolves to on PATH (which may be absent or a different
        # environment), and stop on failure instead of ignoring the status.
        # The cache is built before the install step -- presumably so the
        # generated file already exists when files are collected.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
        install.install.run(self)
23+
24+
925
introduction = re.sub(
1026
r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
1127
"",
@@ -45,6 +61,7 @@
4561
"fasttext": ["fasttext"],
4662
"langdetect": ["langdetect"],
4763
},
64+
cmdclass={"develop": PostDevelop, "install": PostInstall},
4865
license="BSD",
4966
zip_safe=False,
5067
keywords="dateparser",

tests/test_timezone_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import datetime as dt
2+
import pickle
23
from datetime import datetime, timedelta
34
from unittest import SkipTest
45
from unittest.mock import Mock, patch

0 commit comments

Comments
 (0)