From c4814c6c873d5279c7d01e78c9051aaec4622547 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Fri, 18 Oct 2024 10:58:54 -0500 Subject: [PATCH 01/12] Rename `--format-regex` and update docs Docs cover the expected usage, which is not implemented in this initial change. The rename and alias is all that's covered here. --- CHANGELOG.rst | 3 +++ docs/usage.rst | 16 ++++++++---- src/check_jsonschema/checker.py | 2 +- src/check_jsonschema/cli/main_command.py | 27 ++++++++++++++------- src/check_jsonschema/cli/parse_result.py | 23 +++++++++++++++--- src/check_jsonschema/formats/__init__.py | 31 +----------------------- src/check_jsonschema/regex_variants.py | 31 ++++++++++++++++++++++++ 7 files changed, 85 insertions(+), 48 deletions(-) create mode 100644 src/check_jsonschema/regex_variants.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ddd3a5414..2c6c5f18a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,9 @@ Unreleased ---------- .. vendor-insert-here +- Rename ``--format-regex`` to ``--regex-variant`` and convert + ``--format-regex`` to a deprecated alias. + It will be removed in a future release. - Update vendored schemas (2024-12-22) - Drop support for Python 3.8 diff --git a/docs/usage.rst b/docs/usage.rst index 56a5bb12a..206975a2b 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -183,11 +183,11 @@ Example usage: # disables all three of time, date-time, and iri --disable-formats time,date-time --disable-formats iri -``--format-regex`` +``--regex-variant`` ~~~~~~~~~~~~~~~~~~ -Set a mode for handling of the ``"regex"`` value for ``"format"``. The modes are as -follows: +Set a mode for handling of the ``"regex"`` value for ``"format"`` and the mode +for ``"pattern"`` interpretation. The modes are as follows: .. list-table:: Regex Options :widths: 15 30 @@ -196,9 +196,15 @@ follows: * - mode - description * - default - - Require the regex to be valid in ECMAScript regex syntax. + - Use ECMAScript regex syntax. * - python - - Require the regex to be valid in Python regex syntax. + - Use Python regex syntax. + +.. note:: + + This only controls the regex mode used for ``format`` and ``pattern``. + ``patternProperties`` is not currently controlled, and always uses the + Python engine. Other Options -------------- diff --git a/src/check_jsonschema/checker.py b/src/check_jsonschema/checker.py index 63d42d4e6..13591fb8d 100644 --- a/src/check_jsonschema/checker.py +++ b/src/check_jsonschema/checker.py @@ -28,7 +28,7 @@ def __init__( instance_loader: InstanceLoader, reporter: Reporter, *, - format_opts: FormatOptions | None = None, + format_opts: FormatOptions, traceback_mode: str = "short", fill_defaults: bool = False, ) -> None: diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 8e15e0620..a84e29584 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -68,10 +68,10 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: date, date-time, email, ipv4, ipv6, regex, uuid \b -For the "regex" format, there are multiple modes which can be specified with -'--format-regex': - default | check that the string is a valid ECMAScript regex - python | check that the string is a valid python regex +For handling of regexes, there are multiple modes which can be specified with +'--regex-variant': + default | use ECMAScript regex syntax (via regress) + python | use python regex syntax \b The '--builtin-schema' flag supports the following schema names: @@ -138,11 +138,18 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: ) @click.option( "--format-regex", + hidden=True, + help="Legacy name for `--regex-variant`.", + default=None, + type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False), +) +@click.option( + "--regex-variant", help=( - "Set the mode of format validation for regexes. " - "If `--disable-formats regex` is used, this option has no effect." + "Name of which regex dialect should be used for format checking " + "and 'pattern' matching." ), - default=RegexVariantName.default.value, + default=None, type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False), ) @click.option( @@ -230,7 +237,8 @@ def main( no_cache: bool, cache_filename: str | None, disable_formats: tuple[list[str], ...], - format_regex: t.Literal["python", "default"], + format_regex: t.Literal["python", "default"] | None, + regex_variant: t.Literal["python", "default"] | None, default_filetype: t.Literal["json", "yaml", "toml", "json5"], traceback_mode: t.Literal["full", "short"], data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None, @@ -243,6 +251,8 @@ def main( ) -> None: args = ParseResult() + args.set_regex_variant(regex_variant, legacy_opt=format_regex) + args.set_schema(schemafile, builtin_schema, check_metaschema) args.set_validator(validator_class) @@ -257,7 +267,6 @@ def main( else: args.disable_formats = normalized_disable_formats - args.format_regex = RegexVariantName(format_regex) args.disable_cache = no_cache args.default_filetype = default_filetype args.fill_defaults = fill_defaults diff --git a/src/check_jsonschema/cli/parse_result.py b/src/check_jsonschema/cli/parse_result.py index a317378f9..dd27fb2fd 100644 --- a/src/check_jsonschema/cli/parse_result.py +++ b/src/check_jsonschema/cli/parse_result.py @@ -1,14 +1,21 @@ from __future__ import annotations import enum +import sys import typing as t import click import jsonschema -from ..formats import FormatOptions, RegexVariantName +from ..formats import FormatOptions +from ..regex_variants import RegexVariantName from ..transforms import Transform +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + class SchemaLoadingMode(enum.Enum): filepath = "filepath" @@ -36,12 +43,22 @@ def __init__(self) -> None: # regex format options self.disable_all_formats: bool = False self.disable_formats: tuple[str, ...] = () - self.format_regex: RegexVariantName = RegexVariantName.default + self.regex_variant: RegexVariantName = RegexVariantName.default # error and output controls self.verbosity: int = 1 self.traceback_mode: str = "short" self.output_format: str = "text" + def set_regex_variant( + self, + variant_opt: Literal["python", "default"] | None, + *, + legacy_opt: Literal["python", "default"] | None = None, + ) -> None: + variant_name: Literal["python", "default"] | None = variant_opt or legacy_opt + if variant_name: + self.regex_variant = RegexVariantName(variant_name) + def set_schema( self, schemafile: str | None, builtin_schema: str | None, check_metaschema: bool ) -> None: @@ -83,6 +100,6 @@ def set_validator( def format_opts(self) -> FormatOptions: return FormatOptions( enabled=not self.disable_all_formats, - regex_variant=self.format_regex, + regex_variant=self.regex_variant, disabled_formats=self.disable_formats, ) diff --git a/src/check_jsonschema/formats/__init__.py b/src/check_jsonschema/formats/__init__.py index 8202d9a00..a5a04990f 100644 --- a/src/check_jsonschema/formats/__init__.py +++ b/src/check_jsonschema/formats/__init__.py @@ -1,14 +1,11 @@ from __future__ import annotations import copy -import enum -import re -import typing as t import jsonschema import jsonschema.validators -import regress +from ..regex_variants import RegexImplementation, RegexVariantName from .implementations import validate_rfc3339, validate_time # all known format strings except for a selection from draft3 which have either @@ -39,32 +36,6 @@ ) -class RegexVariantName(enum.Enum): - default = "default" - python = "python" - - -class RegexImplementation: - def __init__(self, variant: RegexVariantName) -> None: - self.variant = variant - - def check_format(self, instance: t.Any) -> bool: - if not isinstance(instance, str): - return True - - try: - if self.variant == RegexVariantName.default: - regress.Regex(instance) - else: - re.compile(instance) - # something is wrong with RegressError getting into the published types - # needs investigation... for now, ignore the error - except (regress.RegressError, re.error): # type: ignore[attr-defined] - return False - - return True - - class FormatOptions: def __init__( self, diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py new file mode 100644 index 000000000..6e41d91ef --- /dev/null +++ b/src/check_jsonschema/regex_variants.py @@ -0,0 +1,31 @@ +import enum +import re +import typing as t + +import regress + + +class RegexVariantName(enum.Enum): + default = "default" + python = "python" + + +class RegexImplementation: + def __init__(self, variant: RegexVariantName) -> None: + self.variant = variant + + def check_format(self, instance: t.Any) -> bool: + if not isinstance(instance, str): + return True + + try: + if self.variant == RegexVariantName.default: + regress.Regex(instance) + else: + re.compile(instance) + # something is wrong with RegressError getting into the published types + # needs investigation... for now, ignore the error + except (regress.RegressError, re.error): # type: ignore[attr-defined] + return False + + return True From 802c5e427c7d679fa550d5ac02c5dad727deff25 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sun, 20 Oct 2024 13:31:18 -0500 Subject: [PATCH 02/12] Test that `--regex-variant` controls regex format --- tests/acceptance/test_format_regex_opts.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/acceptance/test_format_regex_opts.py b/tests/acceptance/test_format_regex_opts.py index 1f0486170..deb4e0fe7 100644 --- a/tests/acceptance/test_format_regex_opts.py +++ b/tests/acceptance/test_format_regex_opts.py @@ -1,6 +1,6 @@ # test on a JavaScript regex which is not a valid python regex -# `--format-regex=default` should accept it -# `--format-regex=python` should reject it +# `--regex-variant=default` should accept it +# `--regex-variant=python` should reject it # # check these options against documents with invalid and valid python regexes to confirm # that they are behaving as expected @@ -43,6 +43,10 @@ ("--disable-formats", "regex"), ("--format-regex", "default"), ("--format-regex", "python"), + ("--regex-variant", "python"), + ("--regex-variant", "default"), + ("--regex-variant", "default", "--format-regex", "python"), + ("--regex-variant", "python", "--format-regex", "default"), ] ) def regexopts(request): @@ -108,7 +112,10 @@ def test_regex_format_js_specific(run_line, tmp_path, regexopts): doc = tmp_path / "doc.json" doc.write_text(json.dumps(JS_REGEX_DOCUMENT)) - expect_ok = regexopts != ("--format-regex", "python") + expect_ok = regexopts[:2] not in ( + ("--format-regex", "python"), + ("--regex-variant", "python"), + ) res = run_line( [ From 819d834f490fc6ac5235ec8dc400d8ddfa8589ec Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 2 Nov 2024 11:10:59 -0500 Subject: [PATCH 03/12] Instrument `regress`-backed `pattern` evaluation When an alternate RegexImplementation is selected, it will be passed down to the code which builds a Validator. The Validator is extended with the keyword validator provided by the RegexImplementation. Because this uses the `extend()` interface, a test which subclassed a validator broke -- this is documented in `jsonschema` as unsupported usage, so the test simply had to be updated to use supported interfaces. --- src/check_jsonschema/checker.py | 7 +++- src/check_jsonschema/cli/main_command.py | 4 +- src/check_jsonschema/cli/parse_result.py | 4 +- src/check_jsonschema/formats/__init__.py | 9 ++--- src/check_jsonschema/regex_variants.py | 30 +++++++++++++++ src/check_jsonschema/schema_loader/main.py | 20 +++++++++- .../acceptance/test_custom_validator_class.py | 38 +++++++++++-------- 7 files changed, 86 insertions(+), 26 deletions(-) diff --git a/src/check_jsonschema/checker.py b/src/check_jsonschema/checker.py index 13591fb8d..6d5153efc 100644 --- a/src/check_jsonschema/checker.py +++ b/src/check_jsonschema/checker.py @@ -11,6 +11,7 @@ from .formats import FormatOptions from .instance_loader import InstanceLoader from .parsers import ParseError +from .regex_variants import RegexImplementation from .reporter import Reporter from .result import CheckResult from .schema_loader import SchemaLoaderBase, SchemaParseError, UnsupportedUrlScheme @@ -29,6 +30,7 @@ def __init__( reporter: Reporter, *, format_opts: FormatOptions, + regex_impl: RegexImplementation, traceback_mode: str = "short", fill_defaults: bool = False, ) -> None: @@ -36,7 +38,8 @@ def __init__( self._instance_loader = instance_loader self._reporter = reporter - self._format_opts = format_opts if format_opts is not None else FormatOptions() + self._format_opts = format_opts + self._regex_impl = regex_impl self._traceback_mode = traceback_mode self._fill_defaults = fill_defaults @@ -51,7 +54,7 @@ def get_validator( ) -> jsonschema.protocols.Validator: try: return self._schema_loader.get_validator( - path, doc, self._format_opts, self._fill_defaults + path, doc, self._format_opts, self._regex_impl, self._fill_defaults ) except SchemaParseError as e: self._fail("Error: schemafile could not be parsed as JSON", e) diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index a84e29584..1b5db9bc7 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -9,9 +9,10 @@ from ..catalog import CUSTOM_SCHEMA_NAMES, SCHEMA_CATALOG from ..checker import SchemaChecker -from ..formats import KNOWN_FORMATS, RegexVariantName +from ..formats import KNOWN_FORMATS from ..instance_loader import InstanceLoader from ..parsers import SUPPORTED_FILE_FORMATS +from ..regex_variants import RegexImplementation, RegexVariantName from ..reporter import REPORTER_BY_NAME, Reporter from ..schema_loader import ( BuiltinSchemaLoader, @@ -327,6 +328,7 @@ def build_checker(args: ParseResult) -> SchemaChecker: instance_loader, reporter, format_opts=args.format_opts, + regex_impl=RegexImplementation(args.regex_variant), traceback_mode=args.traceback_mode, fill_defaults=args.fill_defaults, ) diff --git a/src/check_jsonschema/cli/parse_result.py b/src/check_jsonschema/cli/parse_result.py index dd27fb2fd..bf76806c3 100644 --- a/src/check_jsonschema/cli/parse_result.py +++ b/src/check_jsonschema/cli/parse_result.py @@ -8,7 +8,7 @@ import jsonschema from ..formats import FormatOptions -from ..regex_variants import RegexVariantName +from ..regex_variants import RegexImplementation, RegexVariantName from ..transforms import Transform if sys.version_info >= (3, 8): @@ -99,7 +99,7 @@ def set_validator( @property def format_opts(self) -> FormatOptions: return FormatOptions( + regex_impl=RegexImplementation(self.regex_variant), enabled=not self.disable_all_formats, - regex_variant=self.regex_variant, disabled_formats=self.disable_formats, ) diff --git a/src/check_jsonschema/formats/__init__.py b/src/check_jsonschema/formats/__init__.py index a5a04990f..971daac74 100644 --- a/src/check_jsonschema/formats/__init__.py +++ b/src/check_jsonschema/formats/__init__.py @@ -5,7 +5,7 @@ import jsonschema import jsonschema.validators -from ..regex_variants import RegexImplementation, RegexVariantName +from ..regex_variants import RegexImplementation from .implementations import validate_rfc3339, validate_time # all known format strings except for a selection from draft3 which have either @@ -40,12 +40,12 @@ class FormatOptions: def __init__( self, *, + regex_impl: RegexImplementation, enabled: bool = True, - regex_variant: RegexVariantName = RegexVariantName.default, disabled_formats: tuple[str, ...] = (), ) -> None: self.enabled = enabled - self.regex_variant = regex_variant + self.regex_impl = regex_impl self.disabled_formats = disabled_formats @@ -72,8 +72,7 @@ def make_format_checker( # replace the regex check del checker.checkers["regex"] - regex_impl = RegexImplementation(opts.regex_variant) - checker.checks("regex")(regex_impl.check_format) + checker.checks("regex")(opts.regex_impl.check_format) checker.checks("date-time")(validate_rfc3339) checker.checks("time")(validate_time) diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index 6e41d91ef..aa2f8e75a 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -2,6 +2,7 @@ import re import typing as t +import jsonschema import regress @@ -29,3 +30,32 @@ def check_format(self, instance: t.Any) -> bool: return False return True + + def pattern_keyword( + self, validator: t.Any, pattern: str, instance: str, schema: t.Any + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "string"): + return + + if self.variant == RegexVariantName.default: + try: + regress_pattern = regress.Regex(pattern) + except regress.RegressError: # type: ignore[attr-defined] + yield jsonschema.ValidationError( + f"pattern {pattern!r} failed to compile" + ) + if not regress_pattern.find(instance): + yield jsonschema.ValidationError( + f"{instance!r} does not match {pattern!r}" + ) + else: + try: + re_pattern = re.compile(pattern) + except re.error: + yield jsonschema.ValidationError( + f"pattern {pattern!r} failed to compile" + ) + if not re_pattern.search(instance): + yield jsonschema.ValidationError( + f"{instance!r} does not match {pattern!r}" + ) diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 099107455..40a29f210 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -11,6 +11,7 @@ from ..builtin_schemas import get_builtin_schema from ..formats import FormatOptions, make_format_checker from ..parsers import ParserSet +from ..regex_variants import RegexImplementation from ..utils import is_url_ish from .errors import UnsupportedUrlScheme from .readers import HttpSchemaReader, LocalSchemaReader, StdinSchemaReader @@ -45,12 +46,23 @@ def set_defaults_then_validate( ) +def _extend_with_pattern_implementation( + validator_class: type[jsonschema.protocols.Validator], + regex_impl: RegexImplementation, +) -> type[jsonschema.Validator]: + return jsonschema.validators.extend( + validator_class, + {"pattern": regex_impl.pattern_keyword}, + ) + + class SchemaLoaderBase: def get_validator( self, path: pathlib.Path | str, instance_doc: dict[str, t.Any], format_opts: FormatOptions, + regex_impl: RegexImplementation, fill_defaults: bool, ) -> jsonschema.protocols.Validator: raise NotImplementedError @@ -124,14 +136,16 @@ def get_validator( path: pathlib.Path | str, instance_doc: dict[str, t.Any], format_opts: FormatOptions, + regex_impl: RegexImplementation, fill_defaults: bool, ) -> jsonschema.protocols.Validator: - return self._get_validator(format_opts, fill_defaults) + return self._get_validator(format_opts, regex_impl, fill_defaults) @functools.lru_cache def _get_validator( self, format_opts: FormatOptions, + regex_impl: RegexImplementation, fill_defaults: bool, ) -> jsonschema.protocols.Validator: retrieval_uri = self.get_schema_retrieval_uri() @@ -168,6 +182,9 @@ def _get_validator( if fill_defaults: validator_cls = _extend_with_default(validator_cls) + # set the regex variant for 'pattern' keywords + validator_cls = _extend_with_pattern_implementation(validator_cls, regex_impl) + # now that we know it's safe to try to create the validator instance, do it validator = validator_cls( schema, @@ -206,6 +223,7 @@ def get_validator( path: pathlib.Path | str, instance_doc: dict[str, t.Any], format_opts: FormatOptions, + regex_impl: RegexImplementation, fill_defaults: bool, ) -> jsonschema.protocols.Validator: schema_validator = jsonschema.validators.validator_for(instance_doc) diff --git a/tests/acceptance/test_custom_validator_class.py b/tests/acceptance/test_custom_validator_class.py index 170f4524b..9504b963b 100644 --- a/tests/acceptance/test_custom_validator_class.py +++ b/tests/acceptance/test_custom_validator_class.py @@ -66,24 +66,32 @@ def _foo_module(mock_module): """\ import jsonschema -class MyValidator: - def __init__(self, schema, *args, **kwargs): - self.schema = schema - self.real_validator = jsonschema.validators.Draft7Validator( - schema, *args, **kwargs - ) - - def iter_errors(self, data, *args, **kwargs): - yield from self.real_validator.iter_errors(data, *args, **kwargs) - for event in data["events"]: - if "Occult" in event["title"]: + +def check_occult_properties(validator, properties, instance, schema): + if not validator.is_type(instance, "object"): + return + + for property, subschema in properties.items(): + if property in instance: + if property == "title" and "Occult" in instance["title"]: yield jsonschema.exceptions.ValidationError( "Error! Occult event detected! Run!", - validator=None, + validator=validator, validator_value=None, - instance=event, - schema=self.schema, + instance=instance, + schema=schema, ) + yield from validator.descend( + instance[property], + subschema, + path=property, + schema_path=property, + ) + +MyValidator = jsonschema.validators.extend( + jsonschema.validators.Draft7Validator, + {"properties": check_occult_properties}, +) """, ) @@ -115,7 +123,7 @@ def test_custom_validator_class_can_detect_custom_conditions(run_line, tmp_path) str(doc), ], ) - assert result.exit_code == 1 # fail + assert result.exit_code == 1, result.stdout # fail assert "Occult event detected" in result.stdout, result.stdout From 20327258a39fab08c331ac95852f327c14194ae0 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 2 Nov 2024 11:32:36 -0500 Subject: [PATCH 04/12] Optimize regex implementations & fix tool configs Change RegexImplementation to dispatch between variants at init time, so that each call to the relevant methods is faster and does not re-check the variant. Minor tool config fixes: - flake8 ignore vs extend-ignore (whoops) - tox.ini depends needed to run in parallel --- .flake8 | 2 +- src/check_jsonschema/regex_variants.py | 84 +++++++++++++++++--------- tox.ini | 4 ++ 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/.flake8 b/.flake8 index c56dba3ab..fc708eaf8 100644 --- a/.flake8 +++ b/.flake8 @@ -2,4 +2,4 @@ exclude = .git,.tox,__pycache__,dist,.venv*,docs,build max-line-length = 90 # black related: W503/W504 conflict, black causes E203 -ignore = W503,W504,E203,B019 +extend-ignore = W503,W504,E203,B019 diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index aa2f8e75a..95bae1188 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -11,24 +11,68 @@ class RegexVariantName(enum.Enum): python = "python" +class _ConcreteImplementation(t.Protocol): + def check_format(self, instance: t.Any) -> bool: ... + + def pattern_keyword( + self, validator: t.Any, pattern: str, instance: str, schema: t.Any + ) -> t.Iterator[jsonschema.ValidationError]: ... + + class RegexImplementation: + """ + A high-level interface for getting at the different possible + implementations of regex behaviors. + """ + + _real_implementation: _ConcreteImplementation + def __init__(self, variant: RegexVariantName) -> None: self.variant = variant + if self.variant == RegexVariantName.default: + self._real_implementation = _RegressImplementation() + else: + self._real_implementation = _PythonImplementation() + + self.check_format = self._real_implementation.check_format + self.pattern_keyword = self._real_implementation.pattern_keyword + + +class _RegressImplementation: def check_format(self, instance: t.Any) -> bool: if not isinstance(instance, str): return True - try: - if self.variant == RegexVariantName.default: - regress.Regex(instance) - else: - re.compile(instance) + regress.Regex(instance) # something is wrong with RegressError getting into the published types # needs investigation... for now, ignore the error - except (regress.RegressError, re.error): # type: ignore[attr-defined] + except regress.RegressError: # type: ignore[attr-defined] return False + return True + def pattern_keyword( + self, validator: t.Any, pattern: str, instance: str, schema: t.Any + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "string"): + return + + try: + regress_pattern = regress.Regex(pattern) + except regress.RegressError: # type: ignore[attr-defined] + yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") + if not regress_pattern.find(instance): + yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + + +class _PythonImplementation: + def check_format(self, instance: t.Any) -> bool: + if not isinstance(instance, str): + return True + try: + re.compile(instance) + except re.error: + return False return True def pattern_keyword( @@ -37,25 +81,9 @@ def pattern_keyword( if not validator.is_type(instance, "string"): return - if self.variant == RegexVariantName.default: - try: - regress_pattern = regress.Regex(pattern) - except regress.RegressError: # type: ignore[attr-defined] - yield jsonschema.ValidationError( - f"pattern {pattern!r} failed to compile" - ) - if not regress_pattern.find(instance): - yield jsonschema.ValidationError( - f"{instance!r} does not match {pattern!r}" - ) - else: - try: - re_pattern = re.compile(pattern) - except re.error: - yield jsonschema.ValidationError( - f"pattern {pattern!r} failed to compile" - ) - if not re_pattern.search(instance): - yield jsonschema.ValidationError( - f"{instance!r} does not match {pattern!r}" - ) + try: + re_pattern = re.compile(pattern) + except re.error: + yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") + if not re_pattern.search(instance): + yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") diff --git a/tox.ini b/tox.ini index c3a64592a..6edec916f 100644 --- a/tox.ini +++ b/tox.ini @@ -30,12 +30,14 @@ deps = format: jsonschema[format] commands = coverage run -m pytest {posargs:--junitxml={envdir}/pytest.xml} +depends = cov_clean [testenv:cov_clean] description = "erase coverage data to prepare for a new run" deps = coverage skip_install = true commands = coverage erase +depends = [testenv:cov] description = "combine and report coverage data" @@ -43,6 +45,7 @@ deps = coverage skip_install = true commands_pre = - coverage combine commands = coverage report --skip-covered +depends = py{,38,39,310,311,312}{,-mindeps,-format,-json5,-pyjson5,-disable_orjson} [testenv:mypy] description = "check type annotations with mypy" @@ -51,6 +54,7 @@ deps = mypy types-requests click commands = mypy src/ {posargs} +depends = [testenv:pyright] description = "check type annotations with pyright" From 509fc7bcb657ce29db5b7e426707698df9c3bac2 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 2 Nov 2024 12:06:25 -0500 Subject: [PATCH 05/12] Set format_checker when checking schemas This allows the `check_schema` call to succeed when it requires that the regex implementation is regress. Patterns which can compile as regress patterns are thereby supported. Add initial acceptance tests for regress 'pattern' support, which revealed this issue. --- src/check_jsonschema/schema_loader/main.py | 2 +- tests/acceptance/test_example_files.py | 44 ++++++++++++++++--- .../negative/unicode_pattern/instance.json | 4 ++ .../negative/unicode_pattern/schema.json | 20 +++++++++ .../positive/unicode_pattern/instance.json | 4 ++ .../positive/unicode_pattern/schema.json | 20 +++++++++ 6 files changed, 87 insertions(+), 7 deletions(-) create mode 100644 tests/example-files/explicit-schema/negative/unicode_pattern/instance.json create mode 100644 tests/example-files/explicit-schema/negative/unicode_pattern/schema.json create mode 100644 tests/example-files/explicit-schema/positive/unicode_pattern/instance.json create mode 100644 tests/example-files/explicit-schema/positive/unicode_pattern/schema.json diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 40a29f210..6e817088c 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -167,7 +167,7 @@ def _get_validator( if self.validator_class is None: # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema) - validator_cls.check_schema(schema) + validator_cls.check_schema(schema, format_checker=format_checker) else: # for a user-provided validator class, don't check_schema # on the grounds that it might *not* be valid but the user wants to use diff --git a/tests/acceptance/test_example_files.py b/tests/acceptance/test_example_files.py index 057f07a5a..10411741f 100644 --- a/tests/acceptance/test_example_files.py +++ b/tests/acceptance/test_example_files.py @@ -63,7 +63,7 @@ def test_hook_positive_examples(case_name, run_line): hook_id = POSITIVE_HOOK_CASES[case_name] ret = run_line(HOOK_CONFIG[hook_id] + [rcase.path] + rcase.add_args) - assert ret.exit_code == 0, _format_cli_result(rcase, ret) + assert ret.exit_code == 0, _format_cli_result(ret, rcase) @pytest.mark.parametrize("case_name", NEGATIVE_HOOK_CASES.keys()) @@ -72,7 +72,7 @@ def test_hook_negative_examples(case_name, run_line): hook_id = NEGATIVE_HOOK_CASES[case_name] ret = run_line(HOOK_CONFIG[hook_id] + [rcase.path] + rcase.add_args) - assert ret.exit_code == 1, _format_cli_result(rcase, ret) + assert ret.exit_code == 1, _format_cli_result(ret, rcase) @pytest.mark.parametrize("case_name", _get_explicit_cases("positive")) @@ -102,7 +102,37 @@ def test_explicit_positive_examples(case_name, run_line): str(instance), ] ) - assert ret.exit_code == 0 + assert ret.exit_code == 0, _format_cli_result(ret) + + +@pytest.mark.parametrize("case_name", _get_explicit_cases("negative")) +def test_explicit_negative_examples(case_name, run_line): + _check_file_format_skip(case_name) + casedir = EXAMPLE_EXPLICIT_FILES / "negative" / case_name + + instance = casedir / "instance.json" + if not instance.exists(): + instance = casedir / "instance.yaml" + if not instance.exists(): + instance = casedir / "instance.toml" + if not instance.exists(): + raise Exception("could not find an instance file for test case") + + schema = casedir / "schema.json" + if not schema.exists(): + schema = casedir / "schema.yaml" + if not schema.exists(): + raise Exception("could not find a schema file for test case") + + ret = run_line( + [ + "check-jsonschema", + "--schemafile", + str(schema), + str(instance), + ] + ) + assert ret.exit_code == 1, _format_cli_result(ret) def _check_file_format_skip(case_name): @@ -166,10 +196,12 @@ def _package_is_installed(pkg: str) -> bool: return True -def _format_cli_result(rcase: ResolvedCase, result) -> str: +def _format_cli_result(result, rcase: ResolvedCase | None = None) -> str: + prefix = "" + if rcase is not None: + prefix = f"config.add_args={rcase.add_args}\n" return ( - "\n" - f"config.add_args={rcase.add_args}\n" + f"\n{prefix}" f"{result.exit_code=}\n" f"result.stdout={result.output}\n" f"{result.stderr=}" diff --git a/tests/example-files/explicit-schema/negative/unicode_pattern/instance.json b/tests/example-files/explicit-schema/negative/unicode_pattern/instance.json new file mode 100644 index 000000000..0bce573bf --- /dev/null +++ b/tests/example-files/explicit-schema/negative/unicode_pattern/instance.json @@ -0,0 +1,4 @@ +{ + "key": "foo 1", + "value": "bar 2" +} diff --git a/tests/example-files/explicit-schema/negative/unicode_pattern/schema.json b/tests/example-files/explicit-schema/negative/unicode_pattern/schema.json new file mode 100644 index 000000000..3511f41b2 --- /dev/null +++ b/tests/example-files/explicit-schema/negative/unicode_pattern/schema.json @@ -0,0 +1,20 @@ +{ + "additionalProperties": false, + "properties": { + "key": { + "description": "some key", + "maxLength": 128, + "minLength": 1, + "pattern": "^\\p{L}\\p{Z}\\p{N}$", + "type": "string" + }, + "value": { + "description": "some value", + "maxLength": 256, + "minLength": 0, + "pattern": "^\\p{L}\\p{Z}\\p{N}$", + "type": "string" + } + }, + "type": "object" +} diff --git a/tests/example-files/explicit-schema/positive/unicode_pattern/instance.json b/tests/example-files/explicit-schema/positive/unicode_pattern/instance.json new file mode 100644 index 000000000..6766d3091 --- /dev/null +++ b/tests/example-files/explicit-schema/positive/unicode_pattern/instance.json @@ -0,0 +1,4 @@ +{ + "key": "a 1", + "value": "b 2" +} diff --git a/tests/example-files/explicit-schema/positive/unicode_pattern/schema.json b/tests/example-files/explicit-schema/positive/unicode_pattern/schema.json new file mode 100644 index 000000000..3511f41b2 --- /dev/null +++ b/tests/example-files/explicit-schema/positive/unicode_pattern/schema.json @@ -0,0 +1,20 @@ +{ + "additionalProperties": false, + "properties": { + "key": { + "description": "some key", + "maxLength": 128, + "minLength": 1, + "pattern": "^\\p{L}\\p{Z}\\p{N}$", + "type": "string" + }, + "value": { + "description": "some value", + "maxLength": 256, + "minLength": 0, + "pattern": "^\\p{L}\\p{Z}\\p{N}$", + "type": "string" + } + }, + "type": "object" +} From 5cf5ae14fd09aca3011ec3b2c8a24bb1a9476468 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Mon, 23 Dec 2024 13:56:03 -0600 Subject: [PATCH 06/12] Add a nonunicode regex mode This is a variant of the regress engine usage in which the `u` flag is not set. It is needed for usages which rely on unicode escapes *not* being interpreted. In particular, the azure pipelines schema won't pass metaschema validation (let alone apply correctly) if `pattern` interpretation uses unicode-mode regexes. --- .pre-commit-hooks.yaml | 2 +- docs/usage.rst | 2 ++ pyproject.toml | 2 +- src/check_jsonschema/catalog.py | 7 ++++- src/check_jsonschema/cli/main_command.py | 9 +++--- src/check_jsonschema/cli/parse_result.py | 8 +++-- src/check_jsonschema/regex_variants.py | 37 ++++++++++++++++++++---- 7 files changed, 51 insertions(+), 16 deletions(-) diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml index 42abd1d61..6f0e0e09c 100644 --- a/.pre-commit-hooks.yaml +++ b/.pre-commit-hooks.yaml @@ -24,7 +24,7 @@ - id: check-azure-pipelines name: Validate Azure Pipelines description: 'Validate Azure Pipelines config against the schema provided by Microsoft' - entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines + entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines --regex-variant nonunicode language: python files: ^(\.)?azure-pipelines\.(yml|yaml)$ types: [yaml] diff --git a/docs/usage.rst b/docs/usage.rst index 206975a2b..e9216986e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -197,6 +197,8 @@ for ``"pattern"`` interpretation. The modes are as follows: - description * - default - Use ECMAScript regex syntax. + * - nonunicode + - Use ECMAScript regex syntax, but without unicode escapes enabled. * - python - Use Python regex syntax. diff --git a/pyproject.toml b/pyproject.toml index 575c0d524..a14d23d47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ 'tomli>=2.0;python_version<"3.11"', "ruamel.yaml==0.18.6", "jsonschema>=4.18.0,<5.0", - "regress>=0.4.0", + "regress>=2024.11.1", "requests<3.0", "click>=8,<9", ] diff --git a/src/check_jsonschema/catalog.py b/src/check_jsonschema/catalog.py index 0ac915c23..cebbd78a5 100644 --- a/src/check_jsonschema/catalog.py +++ b/src/check_jsonschema/catalog.py @@ -31,7 +31,12 @@ def _githubusercontent_url(owner: str, repo: str, ref: str, path: str) -> str: "Validate Azure Pipelines config against the schema provided " "by Microsoft" ), - "add_args": ["--data-transform", "azure-pipelines"], + "add_args": [ + "--data-transform", + "azure-pipelines", + "--regex-variant", + "nonunicode", + ], "files": r"^(\.)?azure-pipelines\.(yml|yaml)$", "types": "yaml", }, diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py index 1b5db9bc7..9e93ff1ff 100644 --- a/src/check_jsonschema/cli/main_command.py +++ b/src/check_jsonschema/cli/main_command.py @@ -71,8 +71,9 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: \b For handling of regexes, there are multiple modes which can be specified with '--regex-variant': - default | use ECMAScript regex syntax (via regress) - python | use python regex syntax + default | use ECMAScript regex syntax (via regress) + nonunicode | use ECMAScript regex syntax, but in non-unicode mode (via regress) + python | use python regex syntax \b The '--builtin-schema' flag supports the following schema names: @@ -238,8 +239,8 @@ def main( no_cache: bool, cache_filename: str | None, disable_formats: tuple[list[str], ...], - format_regex: t.Literal["python", "default"] | None, - regex_variant: t.Literal["python", "default"] | None, + format_regex: t.Literal["python", "nonunicode", "default"] | None, + regex_variant: t.Literal["python", "nonunicode", "default"] | None, default_filetype: t.Literal["json", "yaml", "toml", "json5"], traceback_mode: t.Literal["full", "short"], data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None, diff --git a/src/check_jsonschema/cli/parse_result.py b/src/check_jsonschema/cli/parse_result.py index bf76806c3..a32c93932 100644 --- a/src/check_jsonschema/cli/parse_result.py +++ b/src/check_jsonschema/cli/parse_result.py @@ -51,11 +51,13 @@ def __init__(self) -> None: def set_regex_variant( self, - variant_opt: Literal["python", "default"] | None, + variant_opt: Literal["python", "nonunicode", "default"] | None, *, - legacy_opt: Literal["python", "default"] | None = None, + legacy_opt: Literal["python", "nonunicode", "default"] | None = None, ) -> None: - variant_name: Literal["python", "default"] | None = variant_opt or legacy_opt + variant_name: Literal["python", "nonunicode", "default"] | None = ( + variant_opt or legacy_opt + ) if variant_name: self.regex_variant = RegexVariantName(variant_name) diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index 95bae1188..276230d10 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -8,6 +8,7 @@ class RegexVariantName(enum.Enum): default = "default" + nonunicode = "nonunicode" python = "python" @@ -31,7 +32,9 @@ def __init__(self, variant: RegexVariantName) -> None: self.variant = variant if self.variant == RegexVariantName.default: - self._real_implementation = _RegressImplementation() + self._real_implementation = _UnicodeRegressImplementation() + elif self.variant == RegexVariantName.nonunicode: + self._real_implementation = _NonunicodeRegressImplementation() else: self._real_implementation = _PythonImplementation() @@ -39,15 +42,37 @@ def __init__(self, variant: RegexVariantName) -> None: self.pattern_keyword = self._real_implementation.pattern_keyword -class _RegressImplementation: +class _UnicodeRegressImplementation: + def check_format(self, instance: t.Any) -> bool: + if not isinstance(instance, str): + return True + try: + regress.Regex(instance, flags="u") + except regress.RegressError: + return False + return True + + def pattern_keyword( + self, validator: t.Any, pattern: str, instance: str, schema: t.Any + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "string"): + return + + try: + regress_pattern = regress.Regex(pattern, flags="u") + except regress.RegressError: + yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") + if not regress_pattern.find(instance): + yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + + +class _NonunicodeRegressImplementation: def check_format(self, instance: t.Any) -> bool: if not isinstance(instance, str): return True try: regress.Regex(instance) - # something is wrong with RegressError getting into the published types - # needs investigation... for now, ignore the error - except regress.RegressError: # type: ignore[attr-defined] + except regress.RegressError: return False return True @@ -59,7 +84,7 @@ def pattern_keyword( try: regress_pattern = regress.Regex(pattern) - except regress.RegressError: # type: ignore[attr-defined] + except regress.RegressError: yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") if not regress_pattern.find(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") From 6d99fd8c017f13b1840e3f7a9c0a74ce79b23f4f Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Mon, 23 Dec 2024 16:39:32 -0600 Subject: [PATCH 07/12] Support patternProperties in regex variants --- docs/usage.rst | 9 +-- src/check_jsonschema/regex_variants.py | 73 ++++++++++++++++++++++ src/check_jsonschema/schema_loader/main.py | 5 +- 3 files changed, 79 insertions(+), 8 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index e9216986e..7de004e75 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -187,7 +187,8 @@ Example usage: ~~~~~~~~~~~~~~~~~~ Set a mode for handling of the ``"regex"`` value for ``"format"`` and the mode -for ``"pattern"`` interpretation. The modes are as follows: +for ``"pattern"`` and ``"patternProperties"`` interpretation. +The modes are as follows: .. list-table:: Regex Options :widths: 15 30 @@ -202,12 +203,6 @@ for ``"pattern"`` interpretation. The modes are as follows: * - python - Use Python regex syntax. -.. note:: - - This only controls the regex mode used for ``format`` and ``pattern``. - ``patternProperties`` is not currently controlled, and always uses the - Python engine. - Other Options -------------- diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index 276230d10..6e4f70fdf 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -19,6 +19,14 @@ def pattern_keyword( self, validator: t.Any, pattern: str, instance: str, schema: t.Any ) -> t.Iterator[jsonschema.ValidationError]: ... + def patternProperties_keyword( + self, + validator: t.Any, + patternProperties: dict[str, t.Any], + instance: dict[str, t.Any], + schema: t.Any, + ) -> t.Iterator[jsonschema.ValidationError]: ... + class RegexImplementation: """ @@ -40,6 +48,9 @@ def __init__(self, variant: RegexVariantName) -> None: self.check_format = self._real_implementation.check_format self.pattern_keyword = self._real_implementation.pattern_keyword + self.patternProperties_keyword = ( + self._real_implementation.patternProperties_keyword + ) class _UnicodeRegressImplementation: @@ -65,6 +76,27 @@ def pattern_keyword( if not regress_pattern.find(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + def patternProperties_keyword( + self, + validator: t.Any, + patternProperties: dict[str, t.Any], + instance: dict[str, t.Any], + schema: t.Any, + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "object"): + return + + for pattern, subschema in patternProperties.items(): + regress_pattern = regress.Regex(pattern, flags="u") + for k, v in instance.items(): + if regress_pattern.find(k): + yield from validator.descend( + v, + subschema, + path=k, + schema_path=pattern, + ) + class _NonunicodeRegressImplementation: def check_format(self, instance: t.Any) -> bool: @@ -89,6 +121,27 @@ def pattern_keyword( if not regress_pattern.find(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + def patternProperties_keyword( + self, + validator: t.Any, + patternProperties: dict[str, t.Any], + instance: dict[str, t.Any], + schema: t.Any, + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "object"): + return + + for pattern, subschema in patternProperties.items(): + regress_pattern = regress.Regex(pattern) + for k, v in instance.items(): + if regress_pattern.find(k): + yield from validator.descend( + v, + subschema, + path=k, + schema_path=pattern, + ) + class _PythonImplementation: def check_format(self, instance: t.Any) -> bool: @@ -112,3 +165,23 @@ def pattern_keyword( yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") if not re_pattern.search(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + + def patternProperties_keyword( + self, + validator: t.Any, + patternProperties: dict[str, t.Any], + instance: dict[str, t.Any], + schema: t.Any, + ) -> t.Iterator[jsonschema.ValidationError]: + if not validator.is_type(instance, "object"): + return + + for pattern, subschema in patternProperties.items(): + for k, v in instance.items(): + if re.search(pattern, k): + yield from validator.descend( + v, + subschema, + path=k, + schema_path=pattern, + ) diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 6e817088c..f48de1cab 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -52,7 +52,10 @@ def _extend_with_pattern_implementation( ) -> type[jsonschema.Validator]: return jsonschema.validators.extend( validator_class, - {"pattern": regex_impl.pattern_keyword}, + { + "pattern": regex_impl.pattern_keyword, + "patternProperties": regex_impl.patternProperties_keyword, + }, ) From a1c3eb977187b4287beadc68e6aa0d2f976e3695 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Wed, 1 Jan 2025 12:46:39 -0600 Subject: [PATCH 08/12] Update changelog for new regex mode --- CHANGELOG.rst | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2c6c5f18a..dc23d3d6f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,12 +9,21 @@ Unreleased ---------- .. vendor-insert-here +- Update vendored schemas (2024-12-22) +- Drop support for Python 3.8 - Rename ``--format-regex`` to ``--regex-variant`` and convert ``--format-regex`` to a deprecated alias. It will be removed in a future release. - -- Update vendored schemas (2024-12-22) -- Drop support for Python 3.8 +- Regular expression interpretation in ``"pattern"``, ``"patternProperties"``, and + ``"format": "regex"`` usages now uses unicode-mode JS regular expressions by + default. (:issue:`353`) + + - Use ``--regex-variant nonunicode`` to get non-unicode JS regular + expressions, the default behavior from previous versions. + - Custom validators may be impacted by the new regular expression + features. Validators are now always modified with the ``jsonschema`` + library's ``extend()`` API to control the ``pattern`` and + ``patternProperties`` keywords. 0.30.0 ------ From 1435f8a0ac8d7a491460cf9d502bc6173212dd5c Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Wed, 1 Jan 2025 13:37:47 -0600 Subject: [PATCH 09/12] Refine concrete regex implementations Primarily, make the regress forms inherit and share most of their code. --- src/check_jsonschema/regex_variants.py | 83 +++++++------------------- 1 file changed, 22 insertions(+), 61 deletions(-) diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index 6e4f70fdf..bebbc9a82 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -12,69 +12,35 @@ class RegexVariantName(enum.Enum): python = "python" -class _ConcreteImplementation(t.Protocol): - def check_format(self, instance: t.Any) -> bool: ... - - def pattern_keyword( - self, validator: t.Any, pattern: str, instance: str, schema: t.Any - ) -> t.Iterator[jsonschema.ValidationError]: ... - - def patternProperties_keyword( - self, - validator: t.Any, - patternProperties: dict[str, t.Any], - instance: dict[str, t.Any], - schema: t.Any, - ) -> t.Iterator[jsonschema.ValidationError]: ... - - class RegexImplementation: """ A high-level interface for getting at the different possible implementations of regex behaviors. """ - _real_implementation: _ConcreteImplementation + _concrete: "_ConcreteImplementation" def __init__(self, variant: RegexVariantName) -> None: self.variant = variant if self.variant == RegexVariantName.default: - self._real_implementation = _UnicodeRegressImplementation() + self._concrete = _RegressImplementation() elif self.variant == RegexVariantName.nonunicode: - self._real_implementation = _NonunicodeRegressImplementation() + self._concrete = _NonunicodeRegressImplementation() else: - self._real_implementation = _PythonImplementation() + self._concrete = _PythonImplementation() - self.check_format = self._real_implementation.check_format - self.pattern_keyword = self._real_implementation.pattern_keyword - self.patternProperties_keyword = ( - self._real_implementation.patternProperties_keyword - ) + self.check_format = self._concrete.check_format + self.pattern_keyword = self._concrete.pattern_keyword + self.patternProperties_keyword = self._concrete.patternProperties_keyword -class _UnicodeRegressImplementation: - def check_format(self, instance: t.Any) -> bool: - if not isinstance(instance, str): - return True - try: - regress.Regex(instance, flags="u") - except regress.RegressError: - return False - return True +class _ConcreteImplementation(t.Protocol): + def check_format(self, instance: t.Any) -> bool: ... def pattern_keyword( self, validator: t.Any, pattern: str, instance: str, schema: t.Any - ) -> t.Iterator[jsonschema.ValidationError]: - if not validator.is_type(instance, "string"): - return - - try: - regress_pattern = regress.Regex(pattern, flags="u") - except regress.RegressError: - yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") - if not regress_pattern.find(instance): - yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") + ) -> t.Iterator[jsonschema.ValidationError]: ... def patternProperties_keyword( self, @@ -82,28 +48,18 @@ def patternProperties_keyword( patternProperties: dict[str, t.Any], instance: dict[str, t.Any], schema: t.Any, - ) -> t.Iterator[jsonschema.ValidationError]: - if not validator.is_type(instance, "object"): - return + ) -> t.Iterator[jsonschema.ValidationError]: ... - for pattern, subschema in patternProperties.items(): - regress_pattern = regress.Regex(pattern, flags="u") - for k, v in instance.items(): - if regress_pattern.find(k): - yield from validator.descend( - v, - subschema, - path=k, - schema_path=pattern, - ) +class _RegressImplementation: + def _compile_pattern(self, pattern: str) -> regress.Regex: + return regress.Regex(pattern, flags="u") -class _NonunicodeRegressImplementation: def check_format(self, instance: t.Any) -> bool: if not isinstance(instance, str): return True try: - regress.Regex(instance) + self._compile_pattern(instance) except regress.RegressError: return False return True @@ -115,7 +71,7 @@ def pattern_keyword( return try: - regress_pattern = regress.Regex(pattern) + regress_pattern = self._compile_pattern(pattern) except regress.RegressError: yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") if not regress_pattern.find(instance): @@ -132,7 +88,7 @@ def patternProperties_keyword( return for pattern, subschema in patternProperties.items(): - regress_pattern = regress.Regex(pattern) + regress_pattern = self._compile_pattern(pattern) for k, v in instance.items(): if regress_pattern.find(k): yield from validator.descend( @@ -143,6 +99,11 @@ def patternProperties_keyword( ) +class _NonunicodeRegressImplementation(_RegressImplementation): + def _compile_pattern(self, pattern: str) -> regress.Regex: + return regress.Regex(pattern) + + class _PythonImplementation: def check_format(self, instance: t.Any) -> bool: if not isinstance(instance, str): From e9aa64e85e2a47abfc522e6bdd39096204bc2db5 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Wed, 1 Jan 2025 13:53:17 -0600 Subject: [PATCH 10/12] Minor linting fix --- src/check_jsonschema/cli/parse_result.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/check_jsonschema/cli/parse_result.py b/src/check_jsonschema/cli/parse_result.py index a32c93932..bfd9065b1 100644 --- a/src/check_jsonschema/cli/parse_result.py +++ b/src/check_jsonschema/cli/parse_result.py @@ -1,7 +1,6 @@ from __future__ import annotations import enum -import sys import typing as t import click @@ -11,11 +10,6 @@ from ..regex_variants import RegexImplementation, RegexVariantName from ..transforms import Transform -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - class SchemaLoadingMode(enum.Enum): filepath = "filepath" @@ -51,11 +45,11 @@ def __init__(self) -> None: def set_regex_variant( self, - variant_opt: Literal["python", "nonunicode", "default"] | None, + variant_opt: t.Literal["python", "nonunicode", "default"] | None, *, - legacy_opt: Literal["python", "nonunicode", "default"] | None = None, + legacy_opt: t.Literal["python", "nonunicode", "default"] | None = None, ) -> None: - variant_name: Literal["python", "nonunicode", "default"] | None = ( + variant_name: t.Literal["python", "nonunicode", "default"] | None = ( variant_opt or legacy_opt ) if variant_name: From cc12d98f7d413970c05134aac9d5357f426920fa Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Tue, 7 Jan 2025 20:53:21 -0600 Subject: [PATCH 11/12] Use regex variant when checking schema validity Rather than relying directly on `validator_cls.check_schema`, reimplement this functionality in-tree here to allow for customizations to the validator class before it is run on the user's schema. A new test uses an invalid regex under the regress unicode engine which is valid in the python engine to ensure that consistent checking is applied. Manual testing revealed that the `_fail()` message production for SchemaError was showing error information twice, which is fixed without a new test to guarantee the new behavior. --- src/check_jsonschema/checker.py | 2 +- src/check_jsonschema/regex_variants.py | 10 ++---- src/check_jsonschema/schema_loader/main.py | 33 ++++++++++++++++++- tests/acceptance/test_invalid_schema_files.py | 12 +++++++ 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/check_jsonschema/checker.py b/src/check_jsonschema/checker.py index 6d5153efc..c6cd852eb 100644 --- a/src/check_jsonschema/checker.py +++ b/src/check_jsonschema/checker.py @@ -59,7 +59,7 @@ def get_validator( except SchemaParseError as e: self._fail("Error: schemafile could not be parsed as JSON", e) except jsonschema.SchemaError as e: - self._fail(f"Error: schemafile was not valid: {e}\n", e) + self._fail("Error: schemafile was not valid\n", e) except UnsupportedUrlScheme as e: self._fail(f"Error: {e}\n", e) except Exception as e: diff --git a/src/check_jsonschema/regex_variants.py b/src/check_jsonschema/regex_variants.py index bebbc9a82..b76527867 100644 --- a/src/check_jsonschema/regex_variants.py +++ b/src/check_jsonschema/regex_variants.py @@ -70,10 +70,7 @@ def pattern_keyword( if not validator.is_type(instance, "string"): return - try: - regress_pattern = self._compile_pattern(pattern) - except regress.RegressError: - yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") + regress_pattern = self._compile_pattern(pattern) if not regress_pattern.find(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") @@ -120,10 +117,7 @@ def pattern_keyword( if not validator.is_type(instance, "string"): return - try: - re_pattern = re.compile(pattern) - except re.error: - yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile") + re_pattern = re.compile(pattern) if not re_pattern.search(instance): yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}") diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index f48de1cab..19f9cdcef 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -170,7 +170,12 @@ def _get_validator( if self.validator_class is None: # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema) - validator_cls.check_schema(schema, format_checker=format_checker) + _check_schema( + validator_cls, + schema, + format_checker=format_checker, + regex_impl=regex_impl, + ) else: # for a user-provided validator class, don't check_schema # on the grounds that it might *not* be valid but the user wants to use @@ -197,6 +202,32 @@ def _get_validator( return t.cast(jsonschema.protocols.Validator, validator) +def _check_schema( + validator_cls: type[jsonschema.protocols.Validator], + schema: dict[str, t.Any], + *, + format_checker: jsonschema.FormatChecker | None, + regex_impl: RegexImplementation, +) -> None: + """A variant definition of Validator.check_schema which uses the regex + implementation and format checker specified.""" + schema_validator_cls = jsonschema.validators.validator_for( + validator_cls.META_SCHEMA, default=validator_cls + ) + schema_validator_cls = _extend_with_pattern_implementation( + schema_validator_cls, regex_impl + ) + + if format_checker is None: + format_checker = schema_validator_cls.FORMAT_CHECKER + + schema_validator = schema_validator_cls( + validator_cls.META_SCHEMA, format_checker=format_checker + ) + for error in schema_validator.iter_errors(schema): + raise jsonschema.exceptions.SchemaError.create_from(error) + + class BuiltinSchemaLoader(SchemaLoader): def __init__(self, schema_name: str, *, base_uri: str | None = None) -> None: self.schema_name = schema_name diff --git a/tests/acceptance/test_invalid_schema_files.py b/tests/acceptance/test_invalid_schema_files.py index c4cf62c72..6f6b6b73f 100644 --- a/tests/acceptance/test_invalid_schema_files.py +++ b/tests/acceptance/test_invalid_schema_files.py @@ -29,3 +29,15 @@ def test_checker_invalid_schemafile_scheme(run_line, tmp_path): res = run_line(["check-jsonschema", "--schemafile", f"ftp://{foo}", str(bar)]) assert res.exit_code == 1 assert "only supports http, https" in res.stderr + + +def test_checker_invalid_schemafile_due_to_bad_regex(run_line, tmp_path): + foo = tmp_path / "foo.json" + bar = tmp_path / "bar.json" + # too many backslash escapes -- not a valid Unicode-mode regex + foo.write_text(r'{"properties": {"foo": {"pattern": "\\\\p{N}"}}}') + bar.write_text("{}") + + res = run_line(["check-jsonschema", "--schemafile", str(foo), str(bar)]) + assert res.exit_code == 1 + assert "schemafile was not valid" in res.stderr From 4414601317bd7bfa69e07bef638bb267bdbb1ccf Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Tue, 7 Jan 2025 22:13:51 -0600 Subject: [PATCH 12/12] Always enable metaschema format checking In the implementation prior to this change, passing `--disable-formats` would impact not only the "actual" schema validator, but also the validator built to evaluate the schema against its metaschema. As a result, `--disable-formats *` and similar would enable schemas to run which previously should have been caught as invalid. Furthermore, the customized format checker which had extensions for date and time evaluation added was used, and any other customizations to format checking would implicitly be shared with the metaschema check. To resolve, refactor format checker building to allow it to be used more directly for the metaschema check, and add test cases to confirm that a bad regex in a `pattern` is always rejected, even when `--disable-formats regex` or similar is used. --- src/check_jsonschema/formats/__init__.py | 24 ++++++++++---- src/check_jsonschema/schema_loader/main.py | 33 +++++++++++-------- tests/acceptance/test_invalid_schema_files.py | 16 +++++++-- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/src/check_jsonschema/formats/__init__.py b/src/check_jsonschema/formats/__init__.py index 971daac74..2308c4313 100644 --- a/src/check_jsonschema/formats/__init__.py +++ b/src/check_jsonschema/formats/__init__.py @@ -66,13 +66,10 @@ def make_format_checker( if not opts.enabled: return None - # copy the base checker - base_checker = get_base_format_checker(schema_dialect) - checker = copy.deepcopy(base_checker) + # customize around regex checking first + checker = format_checker_for_regex_impl(opts.regex_impl) - # replace the regex check - del checker.checkers["regex"] - checker.checks("regex")(opts.regex_impl.check_format) + # add other custom format checks checker.checks("date-time")(validate_rfc3339) checker.checks("time")(validate_time) @@ -83,3 +80,18 @@ def make_format_checker( del checker.checkers[checkname] return checker + + +def format_checker_for_regex_impl( + regex_impl: RegexImplementation, schema_dialect: str | None = None +) -> jsonschema.FormatChecker: + # convert to a schema-derived format checker, and copy it + # for safe modification + base_checker = get_base_format_checker(schema_dialect) + checker = copy.deepcopy(base_checker) + + # replace the regex check + del checker.checkers["regex"] + checker.checks("regex")(regex_impl.check_format) + + return checker diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 19f9cdcef..e056389a9 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -9,7 +9,7 @@ import jsonschema from ..builtin_schemas import get_builtin_schema -from ..formats import FormatOptions, make_format_checker +from ..formats import FormatOptions, format_checker_for_regex_impl, make_format_checker from ..parsers import ParserSet from ..regex_variants import RegexImplementation from ..utils import is_url_ish @@ -153,10 +153,7 @@ def _get_validator( ) -> jsonschema.protocols.Validator: retrieval_uri = self.get_schema_retrieval_uri() schema = self.get_schema() - - schema_dialect = schema.get("$schema") - if schema_dialect is not None and not isinstance(schema_dialect, str): - schema_dialect = None + schema_dialect = _dialect_of_schema(schema) # format checker (which may be None) format_checker = make_format_checker(format_opts, schema_dialect) @@ -170,12 +167,8 @@ def _get_validator( if self.validator_class is None: # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema) - _check_schema( - validator_cls, - schema, - format_checker=format_checker, - regex_impl=regex_impl, - ) + + _check_schema(validator_cls, schema, regex_impl=regex_impl) else: # for a user-provided validator class, don't check_schema # on the grounds that it might *not* be valid but the user wants to use @@ -206,11 +199,11 @@ def _check_schema( validator_cls: type[jsonschema.protocols.Validator], schema: dict[str, t.Any], *, - format_checker: jsonschema.FormatChecker | None, regex_impl: RegexImplementation, ) -> None: """A variant definition of Validator.check_schema which uses the regex implementation and format checker specified.""" + # construct the metaschema validator class (with customized regex impl) schema_validator_cls = jsonschema.validators.validator_for( validator_cls.META_SCHEMA, default=validator_cls ) @@ -218,9 +211,11 @@ def _check_schema( schema_validator_cls, regex_impl ) - if format_checker is None: - format_checker = schema_validator_cls.FORMAT_CHECKER + # construct a specialized format checker (again, customized regex impl) + metaschema_dialect = _dialect_of_schema(validator_cls.META_SCHEMA) + format_checker = format_checker_for_regex_impl(regex_impl, metaschema_dialect) + # now, construct and apply the actual validator schema_validator = schema_validator_cls( validator_cls.META_SCHEMA, format_checker=format_checker ) @@ -228,6 +223,16 @@ def _check_schema( raise jsonschema.exceptions.SchemaError.create_from(error) +def _dialect_of_schema(schema: dict[str, t.Any] | bool) -> str | None: + if not isinstance(schema, dict): + return None + + schema_dialect = schema.get("$schema") + if schema_dialect is not None and not isinstance(schema_dialect, str): + schema_dialect = None + return schema_dialect + + class BuiltinSchemaLoader(SchemaLoader): def __init__(self, schema_name: str, *, base_uri: str | None = None) -> None: self.schema_name = schema_name diff --git a/tests/acceptance/test_invalid_schema_files.py b/tests/acceptance/test_invalid_schema_files.py index 6f6b6b73f..71efda024 100644 --- a/tests/acceptance/test_invalid_schema_files.py +++ b/tests/acceptance/test_invalid_schema_files.py @@ -1,3 +1,6 @@ +import pytest + + def test_checker_non_json_schemafile(run_line, tmp_path): foo = tmp_path / "foo.json" bar = tmp_path / "bar.json" @@ -31,13 +34,22 @@ def test_checker_invalid_schemafile_scheme(run_line, tmp_path): assert "only supports http, https" in res.stderr -def test_checker_invalid_schemafile_due_to_bad_regex(run_line, tmp_path): +@pytest.mark.parametrize( + "add_args", + [ + pytest.param([], id="noargs"), + # ensure that this works even when regex checking is disabled + pytest.param(["--disable-formats", "*"], id="all-formats-disabled"), + pytest.param(["--disable-formats", "regex"], id="regex-format-disabled"), + ], +) +def test_checker_invalid_schemafile_due_to_bad_regex(run_line, tmp_path, add_args): foo = tmp_path / "foo.json" bar = tmp_path / "bar.json" # too many backslash escapes -- not a valid Unicode-mode regex foo.write_text(r'{"properties": {"foo": {"pattern": "\\\\p{N}"}}}') bar.write_text("{}") - res = run_line(["check-jsonschema", "--schemafile", str(foo), str(bar)]) + res = run_line(["check-jsonschema", "--schemafile", str(foo), str(bar), *add_args]) assert res.exit_code == 1 assert "schemafile was not valid" in res.stderr