Skip to content

Commit 7e09eaf

Browse files
committed
Test for compliance in strict and lax mode
1 parent e98453d commit 7e09eaf

File tree

8 files changed

+299
-84
lines changed

8 files changed

+299
-84
lines changed

.github/workflows/tests-no-regex.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Runs the test suite in the hatch `no-regex` environment on every push and
# pull request.
name: test-no-regex
on:
  - push
  - pull_request

jobs:
  # NOTE(review): the job id is `lint` although the job only runs tests; it looks
  # copied from a lint workflow. Confirm before renaming, since the id is what
  # shows up as the status check name.
  lint:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository together with its git submodules.
      - uses: actions/checkout@v3
        with:
          submodules: true
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade hatch
      # Run the `test` script inside the `no-regex` hatch environment.
      - run: hatch -e no-regex run test

jsonpath/function_extensions/match.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22

33
try:
44
import regex as re
5+
6+
REGEX_AVAILABLE = True
57
except ImportError:
68
import re # type: ignore
79

10+
REGEX_AVAILABLE = False
11+
812
from jsonpath.function_extensions import ExpressionType
913
from jsonpath.function_extensions import FilterFunction
1014

@@ -19,9 +23,15 @@ class Match(FilterFunction):
1923

2024
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ matches _pattern_, or `False` otherwise.

    The whole of _string_ must match (`re.fullmatch`). Arguments of the wrong
    type and patterns that fail to compile yield `False` rather than raising.
    """
    # XXX: re.fullmatch caches compiled patterns internally, but `map_re` is not
    # cached.
    # `map_re` is only applied when the third-party `regex` package was imported.
    if REGEX_AVAILABLE:
        try:
            pattern = map_re(pattern)
        except TypeError:
            return False

    try:
        found = re.fullmatch(pattern, string)
    except (TypeError, re.error):
        return False

    return found is not None

jsonpath/function_extensions/search.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22

33
try:
44
import regex as re
5+
6+
REGEX_AVAILABLE = True
57
except ImportError:
68
import re # type: ignore
79

10+
REGEX_AVAILABLE = False
11+
812
from jsonpath.function_extensions import ExpressionType
913
from jsonpath.function_extensions import FilterFunction
1014

@@ -19,9 +23,15 @@ class Search(FilterFunction):
1923

2024
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ contains _pattern_, or `False` otherwise.

    The pattern may match anywhere in _string_ (`re.search`). Arguments of the
    wrong type and patterns that fail to compile yield `False` rather than
    raising.
    """
    # XXX: re.search caches compiled patterns internally, but `map_re` is not
    # cached.
    # `map_re` is only applied when the third-party `regex` package was imported.
    if REGEX_AVAILABLE:
        try:
            pattern = map_re(pattern)
        except TypeError:
            return False

    try:
        found = re.search(pattern, string)
    except (TypeError, re.error):
        return False

    return found is not None

jsonpath/lex.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,11 @@ def compile_rules(self) -> Pattern[str]:
140140
(TOKEN_RE_PATTERN, self.re_pattern),
141141
(TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
142142
(TOKEN_DOT_PROPERTY, self.dot_property_pattern),
143-
(TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
144-
(TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
143+
(
144+
TOKEN_FLOAT,
145+
r"(:?-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)",
146+
),
147+
(TOKEN_INT, r"-?[0-9]+(?:[eE]\+?[0-9]+)?"),
145148
(TOKEN_DDOT, r"\.\."),
146149
(TOKEN_DOT, r"\."),
147150
(TOKEN_AND, self.logical_and_pattern),
@@ -202,8 +205,11 @@ def compile_strict_rules(self) -> Pattern[str]:
202205
(TOKEN_SINGLE_QUOTE_STRING, self.single_quote_pattern),
203206
(TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
204207
(TOKEN_DOT_PROPERTY, self.dot_property_pattern),
205-
(TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
206-
(TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
208+
(
209+
TOKEN_FLOAT,
210+
r"(:?-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)",
211+
),
212+
(TOKEN_INT, r"-?[0-9]+(?:[eE]\+?[0-9]+)?"),
207213
(TOKEN_DDOT, r"\.\."),
208214
(TOKEN_DOT, r"\."),
209215
(TOKEN_AND, r"&&"),
@@ -288,19 +294,6 @@ def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
288294
value=match.group("G_SQUOTE"),
289295
index=match.start("G_SQUOTE"),
290296
)
291-
elif kind == TOKEN_INT:
292-
if match.group("G_EXP") and match.group("G_EXP")[1] == "-":
293-
yield _token(
294-
kind=TOKEN_FLOAT,
295-
value=match.group(),
296-
index=match.start(),
297-
)
298-
else:
299-
yield _token(
300-
kind=TOKEN_INT,
301-
value=match.group(),
302-
index=match.start(),
303-
)
304297
elif kind == TOKEN_RE_PATTERN:
305298
yield _token(
306299
kind=TOKEN_RE_PATTERN,

jsonpath/parse.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
from .token import TOKEN_WHITESPACE
105105
from .token import TOKEN_WILD
106106
from .token import Token
107+
from .unescape import unescape_string
107108

108109
if TYPE_CHECKING:
109110
from .env import JSONPathEnvironment
@@ -623,11 +624,23 @@ def parse_string_literal(self, stream: TokenStream) -> BaseExpression:
623624
return StringLiteral(value=self._decode_string_literal(stream.next()))
624625

625626
def parse_integer_literal(self, stream: TokenStream) -> BaseExpression:
    """Parse the next token in `stream` as an integer literal.

    In strict mode, an integer part with a superfluous leading zero (`01`,
    `-01`, `00`) is rejected. A lone zero is accepted with or without a sign
    or exponent (`0`, `-0`, `0e2`).

    Raises:
        JSONPathSyntaxError: In strict mode, if the literal has a leading zero.
    """
    token = stream.next()
    value = token.value

    if self.env.strict:
        # Look only at the digits: skip an optional sign, and require the
        # character after a leading "0" to be another digit so that an
        # exponent marker (as in "0e2") is not mistaken for a leading zero.
        digits = value[1:] if value.startswith("-") else value
        if len(digits) > 1 and digits[0] == "0" and digits[1] in "0123456789":
            raise JSONPathSyntaxError("invalid integer literal", token=token)

    # Convert to float first to handle scientific notation.
    return IntegerLiteral(value=int(float(value)))
628635

629636
def parse_float_literal(self, stream: TokenStream) -> BaseExpression:
    """Parse the next token in `stream` as a float literal.

    An integer part with a superfluous leading zero (`00.5`, `-00.5`, `01e-1`)
    is rejected. A lone zero integer part is accepted, with or without a sign,
    fraction or exponent (`0.5`, `-0.5`, `0e-1`).

    Raises:
        JSONPathSyntaxError: If the integer part has a leading zero.
    """
    token = stream.next()
    value = token.value

    # Look only at the digits of the integer part: skip an optional sign, and
    # require the character after a leading "0" to be another digit so that
    # "0.5" and "0e-1" (zero followed by "." or an exponent) stay valid.
    digits = value[1:] if value.startswith("-") else value
    if len(digits) > 1 and digits[0] == "0" and digits[1] in "0123456789":
        raise JSONPathSyntaxError("invalid float literal", token=token)

    return FloatLiteral(value=float(value))
631644

632645
def parse_prefix_expression(self, stream: TokenStream) -> BaseExpression:
633646
token = stream.next()
@@ -839,11 +852,19 @@ def parse_filter_expression(
839852
return left
840853

841854
def _decode_string_literal(self, token: Token) -> str:
855+
if self.env.strict:
856+
return unescape_string(
857+
token.value,
858+
token,
859+
"'" if token.kind == TOKEN_SINGLE_QUOTE_STRING else '"',
860+
)
861+
842862
if self.env.unicode_escape:
843863
if token.kind == TOKEN_SINGLE_QUOTE_STRING:
844864
value = token.value.replace('"', '\\"').replace("\\'", "'")
845865
else:
846866
value = token.value
867+
847868
try:
848869
rv = json.loads(f'"{value}"')
849870
assert isinstance(rv, str)

jsonpath/unescape.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
r"""Replace `\uXXXX` escape sequences with Unicode code points."""
2+
3+
from typing import List
4+
from typing import Tuple
5+
6+
from .exceptions import JSONPathSyntaxError
7+
from .token import Token
8+
9+
10+
def unescape_string(value: str, token: Token, quote: str) -> str:
    """Return `value` with escape sequences replaced with Unicode code points.

    Args:
        value: The contents of a string literal, without surrounding quotes.
        token: The token `value` came from, used for error reporting.
        quote: The quote character delimiting the literal. It is the only
            quote character that may appear escaped.

    Raises:
        JSONPathSyntaxError: If `value` contains an invalid or incomplete
            escape sequence, or a raw character at or below U+001F.
    """
    pieces: List[str] = []
    position = 0

    while position < len(value):
        char = value[position]
        if char != "\\":
            # Called for validation only: raises on raw control characters.
            _string_from_codepoint(ord(char), token)
            pieces.append(char)
        else:
            # Decode starting at the character after the backslash. The
            # returned position is the last character the escape consumed.
            decoded, position = _decode_escape_sequence(
                value, position + 1, token, quote
            )
            pieces.append(decoded)
        position += 1

    return "".join(pieces)
26+
27+
28+
# Single-character escapes (the character following a backslash) mapped to the
# character they stand for. The escaped quote is handled separately because it
# depends on which quote delimits the string literal.
_SIMPLE_ESCAPES = {
    "\\": "\\",
    "/": "/",
    "b": "\x08",
    "f": "\x0c",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}


def _decode_escape_sequence(
    value: str, index: int, token: Token, quote: str
) -> Tuple[str, int]:
    r"""Decode the escape sequence whose backslash precedes `value[index]`.

    Args:
        value: The string literal contents being unescaped.
        index: Offset in `value` of the character following the backslash.
        token: The string token, used for error reporting.
        quote: The quote character delimiting the literal; only this quote
            may be escaped.

    Returns:
        A `(string, index)` tuple of the decoded character and the offset of
        the last character consumed by the escape sequence.

    Raises:
        JSONPathSyntaxError: If the backslash is the last character of
            `value`, or the escape sequence is unknown or malformed.
    """
    try:
        ch = value[index]
    except IndexError as err:
        raise JSONPathSyntaxError("incomplete escape sequence", token=token) from err

    if ch == quote:
        return quote, index

    simple = _SIMPLE_ESCAPES.get(ch)
    if simple is not None:
        return simple, index

    if ch == "u":
        codepoint, index = _decode_hex_char(value, index, token)
        # An escaped code point may be any scalar value, including the
        # controls U+0000-U+001F (e.g. `\u000a`). Only *raw* control
        # characters are invalid inside a string literal, so no range check
        # is applied here.
        return chr(codepoint), index

    raise JSONPathSyntaxError(
        f"unknown escape sequence at index {token.index + index - 1}",
        token=token,
    )
60+
61+
62+
def _decode_hex_char(value: str, index: int, token: Token) -> Tuple[int, int]:
    r"""Decode a `\uXXXX` escape, or a `\uXXXX\uXXXX` surrogate pair.

    Args:
        value: The string literal contents being unescaped.
        index: Offset in `value` of the `u` that follows the backslash.
        token: The string token, used for error reporting.

    Returns:
        A `(codepoint, index)` tuple, where `index` is the offset of the last
        character consumed by the escape sequence.

    Raises:
        JSONPathSyntaxError: If the sequence is truncated, contains non-hex
            digits, starts with a low surrogate, or is a high surrogate that
            is not immediately followed by an escaped low surrogate.
    """
    length = len(value)

    # Four hex digits must follow the 'u'.
    if index + 4 >= length:
        raise JSONPathSyntaxError(
            f"incomplete escape sequence at index {token.index + index - 1}",
            token=token,
        )

    index += 1  # move past 'u'
    codepoint = _parse_hex_digits(value[index : index + 4], token)

    if _is_low_surrogate(codepoint):
        # `index` is now at the first hex digit, so the backslash that starts
        # this escape is two characters back, as in the other messages here.
        raise JSONPathSyntaxError(
            f"unexpected low surrogate at index {token.index + index - 2}",
            token=token,
        )

    if not _is_high_surrogate(codepoint):
        # Plain escape: the last consumed character is the fourth hex digit.
        return (codepoint, index + 3)

    # A high surrogate must be followed by `\uXXXX`: backslash at index + 4,
    # 'u' at index + 5 and four hex digits at index + 6 .. index + 9.
    if not (
        index + 9 < length and value[index + 4] == "\\" and value[index + 5] == "u"
    ):
        raise JSONPathSyntaxError(
            f"incomplete escape sequence at index {token.index + index - 2}",
            token=token,
        )

    low_surrogate = _parse_hex_digits(value[index + 6 : index + 10], token)

    if not _is_low_surrogate(low_surrogate):
        raise JSONPathSyntaxError(
            f"unexpected codepoint at index {token.index + index + 4}",
            token=token,
        )

    # Combine the low ten bits of each surrogate into one code point at or
    # above U+10000.
    codepoint = 0x10000 + (((codepoint & 0x03FF) << 10) | (low_surrogate & 0x03FF))

    return (codepoint, index + 9)
103+
104+
105+
def _parse_hex_digits(digits: str, token: Token) -> int:
    r"""Return the integer value of the hexadecimal string `digits`.

    Only the ASCII characters `0-9`, `a-f` and `A-F` are accepted. `int(x, 16)`
    is deliberately not used, as it also accepts signs, whitespace,
    underscores and a `0x` prefix.

    Args:
        digits: The hex digits of a `\uXXXX` escape sequence.
        token: The string token, used for error reporting.

    Raises:
        JSONPathSyntaxError: If any character is not an ASCII hex digit.
    """
    codepoint = 0
    # Iterate over characters rather than encoded bytes, so that a string
    # containing a lone surrogate is reported as a syntax error instead of
    # raising UnicodeEncodeError from `str.encode`.
    for ch in digits:
        if "0" <= ch <= "9":
            nibble = ord(ch) - ord("0")
        elif "A" <= ch <= "F":
            nibble = ord(ch) - ord("A") + 10
        elif "a" <= ch <= "f":
            nibble = ord(ch) - ord("a") + 10
        else:
            raise JSONPathSyntaxError(
                "invalid \\uXXXX escape sequence",
                token=token,
            )
        codepoint = (codepoint << 4) | nibble
    return codepoint
121+
122+
123+
def _string_from_codepoint(codepoint: int, token: Token) -> str:
    """Return the character for `codepoint`, rejecting control characters.

    Raises:
        JSONPathSyntaxError: If `codepoint` is U+001F or lower.
    """
    if codepoint > 0x1F:
        return chr(codepoint)
    raise JSONPathSyntaxError("invalid character", token=token)
127+
128+
129+
def _is_high_surrogate(codepoint: int) -> bool:
    """Return `True` if `codepoint` is in the range U+D800 to U+DBFF inclusive."""
    return 0xD800 <= codepoint <= 0xDBFF
131+
132+
133+
def _is_low_surrogate(codepoint: int) -> bool:
    """Return `True` if `codepoint` is in the range U+DC00 to U+DFFF inclusive."""
    return 0xDC00 <= codepoint <= 0xDFFF

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,4 +192,5 @@ convention = "google"
192192
"jsonpath/__init__.py" = ["D104"]
193193
"jsonpath/selectors.py" = ["D102"]
194194
"jsonpath/filter.py" = ["D102", "PLW1641"]
195+
"jsonpath/unescape.py" = ["PLR2004"]
195196
"tests/*" = ["D100", "D101", "D104", "D103"]

0 commit comments

Comments
 (0)