Skip to content

Use regress to implement JS regex usage for pattern and patternProperties + use unicode mode regexes by default #511

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 8, 2025
Merged
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
exclude = .git,.tox,__pycache__,dist,.venv*,docs,build
max-line-length = 90
# black related: W503/W504 conflict, black causes E203
ignore = W503,W504,E203,B019
extend-ignore = W503,W504,E203,B019
2 changes: 1 addition & 1 deletion .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
- id: check-azure-pipelines
name: Validate Azure Pipelines
description: 'Validate Azure Pipelines config against the schema provided by Microsoft'
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines --regex-variant nonunicode
language: python
files: ^(\.)?azure-pipelines\.(yml|yaml)$
types: [yaml]
Expand Down
14 changes: 13 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,21 @@ Unreleased
----------

.. vendor-insert-here

- Update vendored schemas (2024-12-22)
- Drop support for Python 3.8
- Rename ``--format-regex`` to ``--regex-variant`` and convert
``--format-regex`` to a deprecated alias.
It will be removed in a future release.
- Regular expression interpretation in ``"pattern"``, ``"patternProperties"``, and
``"format": "regex"`` usages now uses unicode-mode JS regular expressions by
default. (:issue:`353`)

- Use ``--regex-variant nonunicode`` to get non-unicode JS regular
expressions, the default behavior from previous versions.
- Custom validators may be impacted by the new regular expression
features. Validators are now always modified with the ``jsonschema``
library's ``extend()`` API to control the ``pattern`` and
``patternProperties`` keywords.

0.30.0
------
Expand Down
13 changes: 8 additions & 5 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,12 @@ Example usage:
# disables all three of time, date-time, and iri
--disable-formats time,date-time --disable-formats iri

``--format-regex``
``--regex-variant``
~~~~~~~~~~~~~~~~~~~

Set a mode for handling of the ``"regex"`` value for ``"format"``. The modes are as
follows:
Set a mode for handling of the ``"regex"`` value for ``"format"`` and the mode
for ``"pattern"`` and ``"patternProperties"`` interpretation.
The modes are as follows:

.. list-table:: Regex Options
:widths: 15 30
Expand All @@ -196,9 +197,11 @@ follows:
* - mode
- description
* - default
- Require the regex to be valid in ECMAScript regex syntax.
- Use ECMAScript regex syntax in unicode mode.
* - nonunicode
- Use ECMAScript regex syntax, but without unicode escapes enabled.
* - python
- Require the regex to be valid in Python regex syntax.
- Use Python regex syntax.

Other Options
--------------
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies = [
'tomli>=2.0;python_version<"3.11"',
"ruamel.yaml==0.18.6",
"jsonschema>=4.18.0,<5.0",
"regress>=0.4.0",
"regress>=2024.11.1",
"requests<3.0",
"click>=8,<9",
]
Expand Down
7 changes: 6 additions & 1 deletion src/check_jsonschema/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ def _githubusercontent_url(owner: str, repo: str, ref: str, path: str) -> str:
"Validate Azure Pipelines config against the schema provided "
"by Microsoft"
),
"add_args": ["--data-transform", "azure-pipelines"],
"add_args": [
"--data-transform",
"azure-pipelines",
"--regex-variant",
"nonunicode",
],
"files": r"^(\.)?azure-pipelines\.(yml|yaml)$",
"types": "yaml",
},
Expand Down
11 changes: 7 additions & 4 deletions src/check_jsonschema/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .formats import FormatOptions
from .instance_loader import InstanceLoader
from .parsers import ParseError
from .regex_variants import RegexImplementation
from .reporter import Reporter
from .result import CheckResult
from .schema_loader import SchemaLoaderBase, SchemaParseError, UnsupportedUrlScheme
Expand All @@ -28,15 +29,17 @@ def __init__(
instance_loader: InstanceLoader,
reporter: Reporter,
*,
format_opts: FormatOptions | None = None,
format_opts: FormatOptions,
regex_impl: RegexImplementation,
traceback_mode: str = "short",
fill_defaults: bool = False,
) -> None:
self._schema_loader = schema_loader
self._instance_loader = instance_loader
self._reporter = reporter

self._format_opts = format_opts if format_opts is not None else FormatOptions()
self._format_opts = format_opts
self._regex_impl = regex_impl
self._traceback_mode = traceback_mode
self._fill_defaults = fill_defaults

Expand All @@ -51,12 +54,12 @@ def get_validator(
) -> jsonschema.protocols.Validator:
try:
return self._schema_loader.get_validator(
path, doc, self._format_opts, self._fill_defaults
path, doc, self._format_opts, self._regex_impl, self._fill_defaults
)
except SchemaParseError as e:
self._fail("Error: schemafile could not be parsed as JSON", e)
except jsonschema.SchemaError as e:
self._fail(f"Error: schemafile was not valid: {e}\n", e)
self._fail("Error: schemafile was not valid\n", e)
except UnsupportedUrlScheme as e:
self._fail(f"Error: {e}\n", e)
except Exception as e:
Expand Down
32 changes: 22 additions & 10 deletions src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@

from ..catalog import CUSTOM_SCHEMA_NAMES, SCHEMA_CATALOG
from ..checker import SchemaChecker
from ..formats import KNOWN_FORMATS, RegexVariantName
from ..formats import KNOWN_FORMATS
from ..instance_loader import InstanceLoader
from ..parsers import SUPPORTED_FILE_FORMATS
from ..regex_variants import RegexImplementation, RegexVariantName
from ..reporter import REPORTER_BY_NAME, Reporter
from ..schema_loader import (
BuiltinSchemaLoader,
Expand Down Expand Up @@ -68,10 +69,11 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
date, date-time, email, ipv4, ipv6, regex, uuid

\b
For the "regex" format, there are multiple modes which can be specified with
'--format-regex':
default | check that the string is a valid ECMAScript regex
python | check that the string is a valid python regex
For handling of regexes, there are multiple modes which can be specified with
'--regex-variant':
default | use ECMAScript regex syntax (via regress)
nonunicode | use ECMAScript regex syntax, but in non-unicode mode (via regress)
python | use python regex syntax

\b
The '--builtin-schema' flag supports the following schema names:
Expand Down Expand Up @@ -138,11 +140,18 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
)
@click.option(
"--format-regex",
hidden=True,
help="Legacy name for `--regex-variant`.",
default=None,
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
)
@click.option(
"--regex-variant",
help=(
"Set the mode of format validation for regexes. "
"If `--disable-formats regex` is used, this option has no effect."
"Name of which regex dialect should be used for format checking "
"and 'pattern' matching."
),
default=RegexVariantName.default.value,
default=None,
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
)
@click.option(
Expand Down Expand Up @@ -230,7 +239,8 @@ def main(
no_cache: bool,
cache_filename: str | None,
disable_formats: tuple[list[str], ...],
format_regex: t.Literal["python", "default"],
format_regex: t.Literal["python", "nonunicode", "default"] | None,
regex_variant: t.Literal["python", "nonunicode", "default"] | None,
default_filetype: t.Literal["json", "yaml", "toml", "json5"],
traceback_mode: t.Literal["full", "short"],
data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None,
Expand All @@ -243,6 +253,8 @@ def main(
) -> None:
args = ParseResult()

args.set_regex_variant(regex_variant, legacy_opt=format_regex)

args.set_schema(schemafile, builtin_schema, check_metaschema)
args.set_validator(validator_class)

Expand All @@ -257,7 +269,6 @@ def main(
else:
args.disable_formats = normalized_disable_formats

args.format_regex = RegexVariantName(format_regex)
args.disable_cache = no_cache
args.default_filetype = default_filetype
args.fill_defaults = fill_defaults
Expand Down Expand Up @@ -318,6 +329,7 @@ def build_checker(args: ParseResult) -> SchemaChecker:
instance_loader,
reporter,
format_opts=args.format_opts,
regex_impl=RegexImplementation(args.regex_variant),
traceback_mode=args.traceback_mode,
fill_defaults=args.fill_defaults,
)
Expand Down
19 changes: 16 additions & 3 deletions src/check_jsonschema/cli/parse_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import click
import jsonschema

from ..formats import FormatOptions, RegexVariantName
from ..formats import FormatOptions
from ..regex_variants import RegexImplementation, RegexVariantName
from ..transforms import Transform


Expand Down Expand Up @@ -36,12 +37,24 @@ def __init__(self) -> None:
# regex format options
self.disable_all_formats: bool = False
self.disable_formats: tuple[str, ...] = ()
self.format_regex: RegexVariantName = RegexVariantName.default
self.regex_variant: RegexVariantName = RegexVariantName.default
# error and output controls
self.verbosity: int = 1
self.traceback_mode: str = "short"
self.output_format: str = "text"

def set_regex_variant(
    self,
    variant_opt: t.Literal["python", "nonunicode", "default"] | None,
    *,
    legacy_opt: t.Literal["python", "nonunicode", "default"] | None = None,
) -> None:
    """
    Store the selected regex variant on this object.

    ``variant_opt`` takes precedence. ``legacy_opt`` is only consulted when
    ``variant_opt`` is unset (``None`` or otherwise falsy). If neither value
    is set, ``self.regex_variant`` is left unchanged.
    """
    # the first option which was actually provided wins
    for candidate in (variant_opt, legacy_opt):
        if candidate:
            self.regex_variant = RegexVariantName(candidate)
            return

def set_schema(
self, schemafile: str | None, builtin_schema: str | None, check_metaschema: bool
) -> None:
Expand Down Expand Up @@ -82,7 +95,7 @@ def set_validator(
@property
def format_opts(self) -> FormatOptions:
return FormatOptions(
regex_impl=RegexImplementation(self.regex_variant),
enabled=not self.disable_all_formats,
regex_variant=self.format_regex,
disabled_formats=self.disable_formats,
)
60 changes: 21 additions & 39 deletions src/check_jsonschema/formats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from __future__ import annotations

import copy
import enum
import re
import typing as t

import jsonschema
import jsonschema.validators
import regress

from ..regex_variants import RegexImplementation
from .implementations import validate_rfc3339, validate_time

# all known format strings except for a selection from draft3 which have either
Expand Down Expand Up @@ -39,42 +36,16 @@
)


class RegexVariantName(enum.Enum):
    """Names of the regex variants which can be selected."""

    # each member's value is the same string as its name
    default = "default"
    python = "python"


class RegexImplementation:
    """
    Format check for regex strings, parametrized by a regex variant.

    With the ``default`` variant, strings are compiled with ``regress.Regex``;
    with any other variant they are compiled with ``re.compile``.
    """

    def __init__(self, variant: RegexVariantName) -> None:
        self.variant = variant

    def check_format(self, instance: t.Any) -> bool:
        """
        Return False if ``instance`` is a string which fails to compile as a
        regex under the configured variant, and True otherwise.

        Non-string instances always pass.
        """
        if isinstance(instance, str):
            use_regress = self.variant == RegexVariantName.default
            compile_regex = regress.Regex if use_regress else re.compile
            try:
                compile_regex(instance)
            # something is wrong with RegressError getting into the published
            # types; needs investigation... for now, ignore the error
            except (regress.RegressError, re.error):  # type: ignore[attr-defined]
                return False
        return True


class FormatOptions:
def __init__(
self,
*,
regex_impl: RegexImplementation,
enabled: bool = True,
regex_variant: RegexVariantName = RegexVariantName.default,
disabled_formats: tuple[str, ...] = (),
) -> None:
self.enabled = enabled
self.regex_variant = regex_variant
self.regex_impl = regex_impl
self.disabled_formats = disabled_formats


Expand All @@ -95,14 +66,10 @@ def make_format_checker(
if not opts.enabled:
return None

# copy the base checker
base_checker = get_base_format_checker(schema_dialect)
checker = copy.deepcopy(base_checker)
# customize around regex checking first
checker = format_checker_for_regex_impl(opts.regex_impl)

# replace the regex check
del checker.checkers["regex"]
regex_impl = RegexImplementation(opts.regex_variant)
checker.checks("regex")(regex_impl.check_format)
# add other custom format checks
checker.checks("date-time")(validate_rfc3339)
checker.checks("time")(validate_time)

Expand All @@ -113,3 +80,18 @@ def make_format_checker(
del checker.checkers[checkname]

return checker


def format_checker_for_regex_impl(
    regex_impl: RegexImplementation, schema_dialect: str | None = None
) -> jsonschema.FormatChecker:
    """
    Build a format checker whose "regex" check is ``regex_impl.check_format``.

    The base checker for ``schema_dialect`` is deep-copied before it is
    modified, so the base checker itself is not altered.
    """
    # work on a private copy of the schema-derived base checker
    checker = copy.deepcopy(get_base_format_checker(schema_dialect))

    # drop the existing "regex" check, then register the replacement
    del checker.checkers["regex"]
    register_regex_check = checker.checks("regex")
    register_regex_check(regex_impl.check_format)

    return checker
Loading
Loading