Skip to content

Commit e31b55f

Browse files
authored
Merge pull request #457 from python-jsonschema/cache-refs
Cache remote refs when downloading, refactor cachedownloader
2 parents 761a2b2 + 037c2b2 commit e31b55f

11 files changed

+574
-231
lines changed

docs/faq.rst

+43
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,46 @@ To resolve, quote the boolean:
115115
116116
steps:
117117
- bash: echo "{{ parameters.myBoolean}}"
118+
119+
Caching
120+
-------
121+
122+
What data gets cached?
123+
~~~~~~~~~~~~~~~~~~~~~~
124+
125+
``check-jsonschema`` will cache all downloaded schemas by default.
126+
The schemas are stored in the ``downloads/`` directory in your cache dir, and any
127+
downloaded refs are stored in the ``refs/`` directory.
128+
129+
Where is the cache dir?
130+
~~~~~~~~~~~~~~~~~~~~~~~
131+
132+
``check-jsonschema`` detects an appropriate cache directory based on your
133+
platform and environment variables.
134+
135+
On Windows, the cache dir is ``%LOCALAPPDATA%/check_jsonschema/`` and falls back
136+
to ``%APPDATA%/check_jsonschema/`` if ``LOCALAPPDATA`` is unset.
137+
138+
On macOS, the cache dir is ``~/Library/Caches/check_jsonschema/``.
139+
140+
On Linux, the cache dir is ``$XDG_CACHE_HOME/check_jsonschema/`` and falls back
141+
to ``~/.cache/check_jsonschema/`` if ``XDG_CACHE_HOME`` is unset.
142+
143+
How does check-jsonschema decide what is a cache hit vs miss?
144+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145+
146+
``check-jsonschema`` checks for cache hits by comparing local file modification
147+
times to the ``Last-Modified`` header returned in the response to an HTTP GET
148+
request. If the local last modified time is older than the header, the rest of
149+
the response will be streamed and written to replace the file.
150+
151+
How do I clear the cache?
152+
~~~~~~~~~~~~~~~~~~~~~~~~~
153+
154+
There is no special command for clearing the cache. Simply find the cache
155+
directory based on the information above and remove it or any of its contents.
156+
157+
Can I disable caching?
158+
~~~~~~~~~~~~~~~~~~~~~~
159+
160+
Yes! Just use the ``--no-cache`` CLI option.

src/check_jsonschema/cachedownloader.py

+156-109
Original file line numberDiff line numberDiff line change
@@ -11,139 +11,186 @@
1111

1212
import requests
1313

14+
# strptime() format used to parse the HTTP ``Last-Modified`` response header
_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"
15+
16+
17+
def _base_cache_dir() -> str | None:
    """Return the platform-specific base directory for caches, or ``None``.

    * Windows: ``%LOCALAPPDATA%``, falling back to ``%APPDATA%``
    * macOS: ``~/Library/Caches``
    * any other platform (namely Linux): ``$XDG_CACHE_HOME``, falling back to
      ``~/.cache``

    An environment variable which is set to the empty string is treated the
    same as an unset one, so that it cannot silently disable caching.

    ``None`` is only possible on Windows, when neither variable has a value;
    that is fine, caching is simply skipped in that case.
    """
    sysname = platform.system()

    if sysname == "Windows":
        return os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") or None

    # macOS -> user caches dir
    if sysname == "Darwin":
        return os.path.expanduser("~/Library/Caches")

    # default for unknown platforms, namely linux behavior
    # use the XDG env var and default to ~/.cache/ when it is unset or empty
    return os.getenv("XDG_CACHE_HOME") or os.path.expanduser("~/.cache")
34+
35+
36+
def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
    """Return the ``check_jsonschema/<dirname>`` path inside the base cache dir.

    When no base cache dir is available, the falsy value produced by
    ``_base_cache_dir()`` is passed through unchanged.
    """
    base = _base_cache_dir()
    if not base:
        return base
    return os.path.join(base, "check_jsonschema", dirname)
41+
42+
43+
def _lastmod_from_response(response: requests.Response) -> float:
    """Parse the ``Last-Modified`` header of *response* into a POSIX timestamp.

    Returns ``0.0`` when the header is missing, malformed, or out of range.
    """
    import calendar

    try:
        parsed = time.strptime(response.headers["last-modified"], _LASTMOD_FMT)
        # HTTP dates are expressed in GMT, so convert with timegm();
        # time.mktime() would interpret the struct as *local* time and skew
        # the result by the machine's UTC offset
        return float(calendar.timegm(parsed))
    # OverflowError: time outside of platform-specific bounds
    # ValueError: malformed/unparseable
    # LookupError: no such header
    except (OverflowError, ValueError, LookupError):
        return 0.0
53+
54+
55+
def _get_request(
    file_url: str, *, response_ok: t.Callable[[requests.Response], bool]
) -> requests.Response:
    """Issue a streaming GET for *file_url*, retrying up to two times.

    A response is accepted, and returned, when it has an OK status and
    ``response_ok(response)`` is true.

    :raises FailedDownloadError: if the final attempt raises a
        ``requests.RequestException``, or if no attempt produced an accepted
        response
    """
    num_retries = 2
    r: requests.Response | None = None
    for _attempt in range(num_retries + 1):
        try:
            r = requests.get(file_url, stream=True)
        except requests.RequestException as e:
            if _attempt == num_retries:
                raise FailedDownloadError("encountered error during download") from e
            continue
        if r.ok and response_ok(r):
            return r
        # the response was rejected; because it was opened with stream=True its
        # body may be unread, so release the connection explicitly before
        # retrying (status_code stays readable after close)
        r.close()
    # an exception on the last attempt raises above, so falling out of the loop
    # means at least one response was received
    assert r is not None
    raise FailedDownloadError(
        f"got response with status={r.status_code}, retries exhausted"
    )
73+
74+
75+
def _atomic_write(dest: str, content: bytes) -> None:
    """Write *content* to *dest* without exposing a partially written file.

    The data is staged in a temporary file next to *dest* and then moved into
    place with ``os.replace``. This makes the download safe if run in parallel
    (parallel runs won't truncate a file which another run is reading).

    The staging file is always removed, including when writing fails.
    """
    # stage in the destination's own directory so that the final rename never
    # has to cross a filesystem boundary
    staging_dir = os.path.dirname(os.path.abspath(dest))
    fp = tempfile.NamedTemporaryFile(mode="wb", dir=staging_dir, delete=False)
    try:
        with fp:
            fp.write(content)
        try:
            os.replace(fp.name, dest)
        except PermissionError:
            # NOTE(review): presumably hit on Windows when another process
            # holds dest open -- fall back to overwriting in place, which is
            # what this function did before it used os.replace
            shutil.copy(fp.name, dest)
    finally:
        # after a successful replace the staging file no longer exists
        try:
            os.remove(fp.name)
        except FileNotFoundError:
            pass
84+
85+
86+
def _cache_hit(cachefile: str, response: requests.Response) -> bool:
    """Return whether *cachefile* is at least as new as the remote file.

    A missing (or otherwise unreadable) cache file is always a miss.
    """
    # ask for the mtime directly rather than checking for existence first, so
    # that a file removed in between (e.g. by a parallel run or a user clearing
    # the cache) is reported as a miss instead of raising
    try:
        local_mtime = os.path.getmtime(cachefile)
    except OSError:
        return False

    # compare mtime on any cached file against the remote last-modified time
    # it is considered a hit if the local file is at least as new as the remote file
    remote_mtime = _lastmod_from_response(response)
    return local_mtime >= remote_mtime
96+
1497

1598
class FailedDownloadError(Exception):
    """Raised when a download errors out or its retries are exhausted."""
17100

18101

19102
class CacheDownloader:
    """Download files over HTTP, keeping copies in an on-disk cache.

    A downloader is not tied to any particular URL; use ``bind()`` to pair it
    with one.
    """

    def __init__(self, cache_dir: str | None = None, disable_cache: bool = False):
        """
        :param cache_dir: directory name passed to ``_resolve_cache_dir`` to
            locate the cache; when ``None``, that function's default is used
        :param disable_cache: when true, ``open()`` always downloads and never
            touches the on-disk cache
        """
        if cache_dir is None:
            self._cache_dir = _resolve_cache_dir()
        else:
            self._cache_dir = _resolve_cache_dir(cache_dir)
        self._disable_cache = disable_cache

    def _download(
        self,
        file_url: str,
        filename: str,
        response_ok: t.Callable[[requests.Response], bool],
    ) -> str:
        """Ensure *file_url* is cached as *filename* and return the cached path.

        The file is only (re)written on a cache miss. Requires a cache dir.
        """
        assert self._cache_dir is not None
        os.makedirs(self._cache_dir, exist_ok=True)
        dest = os.path.join(self._cache_dir, filename)

        def check_response_for_download(r: requests.Response) -> bool:
            # if the response indicates a cache hit, treat it as valid
            # this ensures that we short-circuit any further evaluation immediately on
            # a hit
            if _cache_hit(dest, r):
                return True
            # we now know it's not a hit, so validate the content (forces download)
            return response_ok(r)

        response = _get_request(file_url, response_ok=check_response_for_download)
        try:
            # check to see if we have a file which matches the connection
            # only download if we do not (cache miss, vs hit)
            if not _cache_hit(dest, response):
                _atomic_write(dest, response.content)
        finally:
            # on a cache hit the streamed body is never read, so the connection
            # would otherwise not be released
            response.close()

        return dest

    @contextlib.contextmanager
    def open(
        self,
        file_url: str,
        filename: str,
        validate_response: t.Callable[[requests.Response], bool],
    ) -> t.Iterator[t.IO[bytes]]:
        """Yield a binary file object with the content of *file_url*.

        Without a cache dir, or with caching disabled, the content is downloaded
        and served from memory; otherwise it is read from the cached file.
        """
        if (not self._cache_dir) or self._disable_cache:
            yield io.BytesIO(
                _get_request(file_url, response_ok=validate_response).content
            )
        else:
            with open(
                self._download(file_url, filename, response_ok=validate_response), "rb"
            ) as fp:
                yield fp

    def bind(
        self,
        file_url: str,
        filename: str | None = None,
        validation_callback: t.Callable[[bytes], t.Any] | None = None,
    ) -> BoundCacheDownloader:
        """Return a ``BoundCacheDownloader`` tying this downloader to *file_url*."""
        return BoundCacheDownloader(
            file_url, filename, self, validation_callback=validation_callback
        )
163+
164+
165+
class BoundCacheDownloader:
    """A ``CacheDownloader`` paired with one URL, filename, and validator."""

    def __init__(
        self,
        file_url: str,
        filename: str | None,
        downloader: CacheDownloader,
        *,
        validation_callback: t.Callable[[bytes], t.Any] | None = None,
    ):
        # without an explicit filename, use the last "/"-separated URL segment
        if not filename:
            filename = file_url.rsplit("/", 1)[-1]

        self._file_url = file_url
        self._filename = filename
        self._downloader = downloader
        self._validation_callback = validation_callback

    @contextlib.contextmanager
    def open(self) -> t.Iterator[t.IO[bytes]]:
        """Yield the file object produced by the underlying downloader."""
        opened = self._downloader.open(
            self._file_url,
            self._filename,
            validate_response=self._validate_response,
        )
        with opened as stream:
            yield stream

    def _validate_response(self, response: requests.Response) -> bool:
        """Return whether the response content passes the validation callback.

        Content is rejected only when the callback raises ``ValueError``; with
        no callback, every response is accepted.
        """
        callback = self._validation_callback
        if callback:
            try:
                callback(response.content)
            except ValueError:
                return False
        return True

src/check_jsonschema/cli/main_command.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
300300
assert args.schema_path is not None
301301
return SchemaLoader(
302302
args.schema_path,
303-
args.cache_filename,
304-
args.disable_cache,
303+
cache_filename=args.cache_filename,
304+
disable_cache=args.disable_cache,
305305
base_uri=args.base_uri,
306306
validator_class=args.validator_class,
307307
)

src/check_jsonschema/schema_loader/main.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,16 @@ def get_validator(
5757

5858
class SchemaLoader(SchemaLoaderBase):
5959
validator_class: type[jsonschema.protocols.Validator] | None = None
60+
disable_cache: bool = True
6061

6162
def __init__(
6263
self,
6364
schemafile: str,
65+
*,
6466
cache_filename: str | None = None,
65-
disable_cache: bool = False,
6667
base_uri: str | None = None,
6768
validator_class: type[jsonschema.protocols.Validator] | None = None,
69+
disable_cache: bool = True,
6870
) -> None:
6971
# record input parameters (these are not to be modified)
7072
self.schemafile = schemafile
@@ -140,7 +142,7 @@ def get_validator(
140142
# reference resolution
141143
# with support for YAML, TOML, and other formats from the parsers
142144
reference_registry = make_reference_registry(
143-
self._parsers, retrieval_uri, schema
145+
self._parsers, retrieval_uri, schema, self.disable_cache
144146
)
145147

146148
if self.validator_class is None:
@@ -171,7 +173,7 @@ def get_validator(
171173

172174

173175
class BuiltinSchemaLoader(SchemaLoader):
174-
def __init__(self, schema_name: str, base_uri: str | None = None) -> None:
176+
def __init__(self, schema_name: str, *, base_uri: str | None = None) -> None:
175177
self.schema_name = schema_name
176178
self.base_uri = base_uri
177179
self._parsers = ParserSet()
@@ -187,7 +189,7 @@ def get_schema(self) -> dict[str, t.Any]:
187189

188190

189191
class MetaSchemaLoader(SchemaLoaderBase):
190-
def __init__(self, base_uri: str | None = None) -> None:
192+
def __init__(self, *, base_uri: str | None = None) -> None:
191193
if base_uri is not None:
192194
raise NotImplementedError(
193195
"'--base-uri' was used with '--metaschema'. "

0 commit comments

Comments
 (0)