Add tests for ref resolution on-disk caching

sirosen · sirosen · commit 18b7bac9ae3f · 2024-06-28T19:45:35.000-05:00
diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py
@@ -1,3 +1,4 @@
+import hashlib
 import json
 
 import pytest
@@ -35,6 +36,10 @@
 }
 
 
+def _md5(s):
+    return hashlib.md5(s.encode()).hexdigest()
+
+
 @pytest.mark.parametrize("check_passes", (True, False))
 @pytest.mark.parametrize("casename", ("case1", "case2"))
 def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp_path):
@@ -63,6 +68,88 @@ def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp
         assert result.exit_code == 1, output
 
 
+@pytest.mark.parametrize("casename", ("case1", "case2"))
+@pytest.mark.parametrize("disable_cache", (True, False))
+def test_remote_ref_resolution_cache_control(
+    run_line, tmp_path, cache_dir, casename, disable_cache
+):
+    main_schema_loc = "https://example.com/main.json"
+    responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"])
+
+    ref_locs = []
+    for name, subschema in CASES[casename]["other_schemas"].items():
+        other_schema_loc = f"https://example.com/{name}.json"
+        responses.add("GET", other_schema_loc, json=subschema)
+        ref_locs.append(other_schema_loc)
+
+    instance_path = tmp_path / "instance.json"
+    instance_path.write_text(json.dumps(CASES[casename]["passing_document"]))
+
+    # run the command
+    result = run_line(
+        ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)]
+        + (["--no-cache"] if disable_cache else [])
+    )
+    output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}"
+    assert result.exit_code == 0, output
+
+    cache_locs = []
+    for ref_loc in ref_locs:
+        cache_locs.append(cache_dir / "check_jsonschema" / "refs" / _md5(ref_loc))
+    assert cache_locs  # sanity check
+    if disable_cache:
+        for loc in cache_locs:
+            assert not loc.exists()
+    else:
+        for loc in cache_locs:
+            assert loc.exists()
+
+
+@pytest.mark.parametrize("casename", ("case1", "case2"))
+@pytest.mark.parametrize("check_passes", (True, False))
+def test_remote_ref_resolution_loads_from_cache(
+    run_line, tmp_path, cache_dir, casename, check_passes
+):
+    main_schema_loc = "https://example.com/main.json"
+    responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"])
+
+    # ensure the ref cache dir exists
+    ref_cache_dir = cache_loc = cache_dir / "check_jsonschema" / "refs"
+    ref_cache_dir.mkdir(parents=True)
+
+    ref_locs = []
+    cache_locs = []
+    for name, subschema in CASES[casename]["other_schemas"].items():
+        other_schema_loc = f"https://example.com/{name}.json"
+        # intentionally populate the HTTP location with "bad data"
+        responses.add("GET", other_schema_loc, json="{}")
+        ref_locs.append(other_schema_loc)
+
+        # but populate the cache with "good data"
+        cache_loc = ref_cache_dir / _md5(other_schema_loc)
+        cache_locs.append(cache_loc)
+        cache_loc.write_text(json.dumps(subschema))
+
+    instance_path = tmp_path / "instance.json"
+    instance_path.write_text(
+        json.dumps(
+            CASES[casename]["passing_document"]
+            if check_passes
+            else CASES[casename]["failing_document"]
+        )
+    )
+
+    # run the command
+    result = run_line(
+        ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)]
+    )
+    output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}"
+    if check_passes:
+        assert result.exit_code == 0, output
+    else:
+        assert result.exit_code == 1, output
+
+
 # this test ensures that `$id` is preferred for the base URI over
 # the retrieval URI
 @pytest.mark.parametrize("check_passes", (True, False))