Skip to content

Commit 8da1fef

Browse files
authored
Merge pull request #511 from python-jsonschema/use-regress-for-patterns
Use `regress` to implement JS regex usage for `pattern` and `patternProperties` + use unicode mode regexes by default
2 parents 28714ff + 4414601 commit 8da1fef

21 files changed

+449
-97
lines changed

.flake8

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
exclude = .git,.tox,__pycache__,dist,.venv*,docs,build
33
max-line-length = 90
44
# black related: W503/W504 conflict, black causes E203
5-
ignore = W503,W504,E203,B019
5+
extend-ignore = W503,W504,E203,B019

.pre-commit-hooks.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
- id: check-azure-pipelines
2525
name: Validate Azure Pipelines
2626
description: 'Validate Azure Pipelines config against the schema provided by Microsoft'
27-
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines
27+
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines --regex-variant nonunicode
2828
language: python
2929
files: ^(\.)?azure-pipelines\.(yml|yaml)$
3030
types: [yaml]

CHANGELOG.rst

+13-1
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,21 @@ Unreleased
99
----------
1010

1111
.. vendor-insert-here
12-
1312
- Update vendored schemas (2024-12-22)
1413
- Drop support for Python 3.8
14+
- Rename ``--format-regex`` to ``--regex-variant`` and convert
15+
``--format-regex`` to a deprecated alias.
16+
The deprecated alias will be removed in a future release.
17+
- Regular expression interpretation in ``"pattern"``, ``"patternProperties"``, and
18+
``"format": "regex"`` usages now uses unicode-mode JS regular expressions by
19+
default. (:issue:`353`)
20+
21+
- Use ``--regex-variant nonunicode`` to get non-unicode JS regular
22+
expressions, the default behavior from previous versions.
23+
- Custom validators may be impacted by the new regular expression
24+
features. Validators are now always modified with the ``jsonschema``
25+
library's ``extend()`` API to control the ``pattern`` and
26+
``patternProperties`` keywords.
1527

1628
0.30.0
1729
------

docs/usage.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -183,11 +183,12 @@ Example usage:
183183
# disables all three of time, date-time, and iri
184184
--disable-formats time,date-time --disable-formats iri
185185
186-
``--format-regex``
186+
``--regex-variant``
187187
~~~~~~~~~~~~~~~~~~~
188188

189-
Set a mode for handling of the ``"regex"`` value for ``"format"``. The modes are as
190-
follows:
189+
Set a mode for handling of the ``"regex"`` value for ``"format"`` and the mode
190+
for ``"pattern"`` and ``"patternProperties"`` interpretation.
191+
The modes are as follows:
191192

192193
.. list-table:: Regex Options
193194
:widths: 15 30
@@ -196,9 +197,11 @@ follows:
196197
* - mode
197198
- description
198199
* - default
199-
- Require the regex to be valid in ECMAScript regex syntax.
200+
- Use ECMAScript regex syntax in unicode mode.
201+
* - nonunicode
202+
- Use ECMAScript regex syntax in non-unicode mode (unicode escapes are not enabled).
200203
* - python
201-
- Require the regex to be valid in Python regex syntax.
204+
- Use Python regex syntax.
202205

203206
Other Options
204207
--------------

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ dependencies = [
2020
'tomli>=2.0;python_version<"3.11"',
2121
"ruamel.yaml==0.18.7",
2222
"jsonschema>=4.18.0,<5.0",
23-
"regress>=0.4.0",
23+
"regress>=2024.11.1",
2424
"requests<3.0",
2525
"click>=8,<9",
2626
]

src/check_jsonschema/catalog.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ def _githubusercontent_url(owner: str, repo: str, ref: str, path: str) -> str:
3131
"Validate Azure Pipelines config against the schema provided "
3232
"by Microsoft"
3333
),
34-
"add_args": ["--data-transform", "azure-pipelines"],
34+
"add_args": [
35+
"--data-transform",
36+
"azure-pipelines",
37+
"--regex-variant",
38+
"nonunicode",
39+
],
3540
"files": r"^(\.)?azure-pipelines\.(yml|yaml)$",
3641
"types": "yaml",
3742
},

src/check_jsonschema/checker.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .formats import FormatOptions
1212
from .instance_loader import InstanceLoader
1313
from .parsers import ParseError
14+
from .regex_variants import RegexImplementation
1415
from .reporter import Reporter
1516
from .result import CheckResult
1617
from .schema_loader import SchemaLoaderBase, SchemaParseError, UnsupportedUrlScheme
@@ -28,15 +29,17 @@ def __init__(
2829
instance_loader: InstanceLoader,
2930
reporter: Reporter,
3031
*,
31-
format_opts: FormatOptions | None = None,
32+
format_opts: FormatOptions,
33+
regex_impl: RegexImplementation,
3234
traceback_mode: str = "short",
3335
fill_defaults: bool = False,
3436
) -> None:
3537
self._schema_loader = schema_loader
3638
self._instance_loader = instance_loader
3739
self._reporter = reporter
3840

39-
self._format_opts = format_opts if format_opts is not None else FormatOptions()
41+
self._format_opts = format_opts
42+
self._regex_impl = regex_impl
4043
self._traceback_mode = traceback_mode
4144
self._fill_defaults = fill_defaults
4245

@@ -51,12 +54,12 @@ def get_validator(
5154
) -> jsonschema.protocols.Validator:
5255
try:
5356
return self._schema_loader.get_validator(
54-
path, doc, self._format_opts, self._fill_defaults
57+
path, doc, self._format_opts, self._regex_impl, self._fill_defaults
5558
)
5659
except SchemaParseError as e:
5760
self._fail("Error: schemafile could not be parsed as JSON", e)
5861
except jsonschema.SchemaError as e:
59-
self._fail(f"Error: schemafile was not valid: {e}\n", e)
62+
self._fail("Error: schemafile was not valid\n", e)
6063
except UnsupportedUrlScheme as e:
6164
self._fail(f"Error: {e}\n", e)
6265
except Exception as e:

src/check_jsonschema/cli/main_command.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99

1010
from ..catalog import CUSTOM_SCHEMA_NAMES, SCHEMA_CATALOG
1111
from ..checker import SchemaChecker
12-
from ..formats import KNOWN_FORMATS, RegexVariantName
12+
from ..formats import KNOWN_FORMATS
1313
from ..instance_loader import InstanceLoader
1414
from ..parsers import SUPPORTED_FILE_FORMATS
15+
from ..regex_variants import RegexImplementation, RegexVariantName
1516
from ..reporter import REPORTER_BY_NAME, Reporter
1617
from ..schema_loader import (
1718
BuiltinSchemaLoader,
@@ -68,10 +69,11 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
6869
date, date-time, email, ipv4, ipv6, regex, uuid
6970
7071
\b
71-
For the "regex" format, there are multiple modes which can be specified with
72-
'--format-regex':
73-
default | check that the string is a valid ECMAScript regex
74-
python | check that the string is a valid python regex
72+
For handling of regexes, there are multiple modes which can be specified with
73+
'--regex-variant':
74+
default | use ECMAScript regex syntax (via regress)
75+
nonunicode | use ECMAScript regex syntax, but in non-unicode mode (via regress)
76+
python | use python regex syntax
7577
7678
\b
7779
The '--builtin-schema' flag supports the following schema names:
@@ -138,11 +140,18 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
138140
)
139141
@click.option(
140142
"--format-regex",
143+
hidden=True,
144+
help="Legacy name for `--regex-variant`.",
145+
default=None,
146+
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
147+
)
148+
@click.option(
149+
"--regex-variant",
141150
help=(
142-
"Set the mode of format validation for regexes. "
143-
"If `--disable-formats regex` is used, this option has no effect."
151+
"Name of which regex dialect should be used for format checking "
152+
"and 'pattern' matching."
144153
),
145-
default=RegexVariantName.default.value,
154+
default=None,
146155
type=click.Choice([x.value for x in RegexVariantName], case_sensitive=False),
147156
)
148157
@click.option(
@@ -230,7 +239,8 @@ def main(
230239
no_cache: bool,
231240
cache_filename: str | None,
232241
disable_formats: tuple[list[str], ...],
233-
format_regex: t.Literal["python", "default"],
242+
format_regex: t.Literal["python", "nonunicode", "default"] | None,
243+
regex_variant: t.Literal["python", "nonunicode", "default"] | None,
234244
default_filetype: t.Literal["json", "yaml", "toml", "json5"],
235245
traceback_mode: t.Literal["full", "short"],
236246
data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None,
@@ -243,6 +253,8 @@ def main(
243253
) -> None:
244254
args = ParseResult()
245255

256+
args.set_regex_variant(regex_variant, legacy_opt=format_regex)
257+
246258
args.set_schema(schemafile, builtin_schema, check_metaschema)
247259
args.set_validator(validator_class)
248260

@@ -257,7 +269,6 @@ def main(
257269
else:
258270
args.disable_formats = normalized_disable_formats
259271

260-
args.format_regex = RegexVariantName(format_regex)
261272
args.disable_cache = no_cache
262273
args.default_filetype = default_filetype
263274
args.fill_defaults = fill_defaults
@@ -318,6 +329,7 @@ def build_checker(args: ParseResult) -> SchemaChecker:
318329
instance_loader,
319330
reporter,
320331
format_opts=args.format_opts,
332+
regex_impl=RegexImplementation(args.regex_variant),
321333
traceback_mode=args.traceback_mode,
322334
fill_defaults=args.fill_defaults,
323335
)

src/check_jsonschema/cli/parse_result.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import click
77
import jsonschema
88

9-
from ..formats import FormatOptions, RegexVariantName
9+
from ..formats import FormatOptions
10+
from ..regex_variants import RegexImplementation, RegexVariantName
1011
from ..transforms import Transform
1112

1213

@@ -36,12 +37,24 @@ def __init__(self) -> None:
3637
# regex format options
3738
self.disable_all_formats: bool = False
3839
self.disable_formats: tuple[str, ...] = ()
39-
self.format_regex: RegexVariantName = RegexVariantName.default
40+
self.regex_variant: RegexVariantName = RegexVariantName.default
4041
# error and output controls
4142
self.verbosity: int = 1
4243
self.traceback_mode: str = "short"
4344
self.output_format: str = "text"
4445

46+
def set_regex_variant(
47+
self,
48+
variant_opt: t.Literal["python", "nonunicode", "default"] | None,
49+
*,
50+
legacy_opt: t.Literal["python", "nonunicode", "default"] | None = None,
51+
) -> None:
52+
variant_name: t.Literal["python", "nonunicode", "default"] | None = (
53+
variant_opt or legacy_opt
54+
)
55+
if variant_name:
56+
self.regex_variant = RegexVariantName(variant_name)
57+
4558
def set_schema(
4659
self, schemafile: str | None, builtin_schema: str | None, check_metaschema: bool
4760
) -> None:
@@ -82,7 +95,7 @@ def set_validator(
8295
@property
8396
def format_opts(self) -> FormatOptions:
8497
return FormatOptions(
98+
regex_impl=RegexImplementation(self.regex_variant),
8599
enabled=not self.disable_all_formats,
86-
regex_variant=self.format_regex,
87100
disabled_formats=self.disable_formats,
88101
)
+21-39
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
from __future__ import annotations
22

33
import copy
4-
import enum
5-
import re
6-
import typing as t
74

85
import jsonschema
96
import jsonschema.validators
10-
import regress
117

8+
from ..regex_variants import RegexImplementation
129
from .implementations import validate_rfc3339, validate_time
1310

1411
# all known format strings except for a selection from draft3 which have either
@@ -39,42 +36,16 @@
3936
)
4037

4138

42-
class RegexVariantName(enum.Enum):
43-
default = "default"
44-
python = "python"
45-
46-
47-
class RegexImplementation:
48-
def __init__(self, variant: RegexVariantName) -> None:
49-
self.variant = variant
50-
51-
def check_format(self, instance: t.Any) -> bool:
52-
if not isinstance(instance, str):
53-
return True
54-
55-
try:
56-
if self.variant == RegexVariantName.default:
57-
regress.Regex(instance)
58-
else:
59-
re.compile(instance)
60-
# something is wrong with RegressError getting into the published types
61-
# needs investigation... for now, ignore the error
62-
except (regress.RegressError, re.error): # type: ignore[attr-defined]
63-
return False
64-
65-
return True
66-
67-
6839
class FormatOptions:
6940
def __init__(
7041
self,
7142
*,
43+
regex_impl: RegexImplementation,
7244
enabled: bool = True,
73-
regex_variant: RegexVariantName = RegexVariantName.default,
7445
disabled_formats: tuple[str, ...] = (),
7546
) -> None:
7647
self.enabled = enabled
77-
self.regex_variant = regex_variant
48+
self.regex_impl = regex_impl
7849
self.disabled_formats = disabled_formats
7950

8051

@@ -95,14 +66,10 @@ def make_format_checker(
9566
if not opts.enabled:
9667
return None
9768

98-
# copy the base checker
99-
base_checker = get_base_format_checker(schema_dialect)
100-
checker = copy.deepcopy(base_checker)
69+
# customize around regex checking first
70+
checker = format_checker_for_regex_impl(opts.regex_impl)
10171

102-
# replace the regex check
103-
del checker.checkers["regex"]
104-
regex_impl = RegexImplementation(opts.regex_variant)
105-
checker.checks("regex")(regex_impl.check_format)
72+
# add other custom format checks
10673
checker.checks("date-time")(validate_rfc3339)
10774
checker.checks("time")(validate_time)
10875

@@ -113,3 +80,18 @@ def make_format_checker(
11380
del checker.checkers[checkname]
11481

11582
return checker
83+
84+
85+
def format_checker_for_regex_impl(
86+
regex_impl: RegexImplementation, schema_dialect: str | None = None
87+
) -> jsonschema.FormatChecker:
88+
# convert to a schema-derived format checker, and copy it
89+
# for safe modification
90+
base_checker = get_base_format_checker(schema_dialect)
91+
checker = copy.deepcopy(base_checker)
92+
93+
# replace the regex check
94+
del checker.checkers["regex"]
95+
checker.checks("regex")(regex_impl.check_format)
96+
97+
return checker

0 commit comments

Comments
 (0)