class EmailImplementation:
    """Format-check callbacks for the ``email`` and ``idn-email`` formats.

    Each callback follows the same rule: an instance that is not a string
    passes unconditionally (returns ``True``); a string is handed to the
    matching RFC validator and that validator's result is returned.
    """

    def check_format_email(self, instance: t.Any) -> bool:
        """Check *instance* against the ``email`` format via ``validate_rfc5321``."""
        # Short-circuit: the validator is only consulted for str instances.
        return not isinstance(instance, str) or validate_rfc5321(instance)

    def check_format_idn_email(self, instance: t.Any) -> bool:
        """Check *instance* against ``idn-email`` via ``validate_rfc6531``."""
        # Short-circuit: the validator is only consulted for str instances.
        return not isinstance(instance, str) or validate_rfc6531(instance)
# ([!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*|"([]!#-[^-~ \t]|(\\[\t -~]))+")
# @
# ([!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*|\[[\t -Z^-~]*])
#
# [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-] == Alphanumeric characters and most special characters except [ (),.:;<>@\[\]\t]
# [a-zA-Z0-9 !#$%&'()*+,./:;<=>?@\[\]^_`{|}~\t-] == All printable characters except for " and \
# [\t -~] == All printable characters
# [a-zA-Z0-9 !"#$%&'()*+,./:;<=>?@^_`{|}~\t-] == All printable characters except for the following characters []\
#
# The two outer groups are named ("local" and "domain") because validate()
# reads them back by name to apply the length limits.
RFC5321_REGEX = re.compile(
    r"""
    ^
    (?P<local>
    [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
    |
    "(?:[a-zA-Z0-9 !#$%&'()*+,./:;<=>?@\[\]^_`{|}~\t-]|\\[\t -~])+"
    )
    @
    (?P<domain>
    [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
    |
    \[[a-zA-Z0-9 !"#$%&'()*+,./:;<=>?@^_`{|}~\t-]*\]
    )
    $
    """,
    re.VERBOSE | re.ASCII,
)


def validate(email_str: object) -> bool:
    """Validate a string as a RFC5321 email address.

    Args:
        email_str: The candidate value. Anything that is not a ``str`` is
            rejected.

    Returns:
        ``True`` when the whole string matches ``RFC5321_REGEX`` and the
        local part is at most 64 characters, the domain at most 253
        characters, and every dot-separated domain label at most 63
        characters; ``False`` otherwise.
    """
    if not isinstance(email_str, str):
        return False
    # fullmatch, not match: "$" also matches just before a trailing "\n",
    # which would let "user@example.com\n" through.
    match = RFC5321_REGEX.fullmatch(email_str)
    if match is None:
        return False
    local, domain = match.group("local", "domain")
    # The pattern only admits ASCII, so len() equals the octet count.
    # Local part is limited to 64 octets, the domain name to 253 octets.
    if len(local) > 64 or len(domain) > 253:
        return False
    # DNS labels are limited to 63 octets each.
    return all(len(label) <= 63 for label in domain.split("."))


if __name__ == "__main__":
    # Micro-benchmark, only run when the module is executed as a script.
    import timeit

    N = 100_000
    tests = (("basic", "user@example.com"),)

    print("benchmarking")
    for name, val in tests:
        all_times = timeit.repeat(
            f"validate({val!r})", globals=globals(), repeat=3, number=N
        )
        print(f"{name} (valid={validate(val)}): {int(min(all_times) / N * 10**9)}ns")
# The outer local-part and domain groups are named so validate() can apply
# length limits to them. IGNORECASE is required because the character classes
# only spell out lowercase ASCII letters and hex digits.
RFC6531_REGEX = re.compile(
    r"""
    ^
    # local part
    (?P<local>
        ([0-9a-z!#$%&'*+-\/=?^_`\{|\}~\u0080-\U0010FFFF]+(\.[0-9a-z!#$%&'*+-\/=?^_`\{|\}~\u0080-\U0010FFFF]+)*)
        |
        # quoted string
        "([\x20-\x21\x23-\x5B\x5D-\x7E\u0080-\U0010FFFF]|\\[\x20-\x7E])*"
    )
    @
    # Domain/address
    (?P<domain>
        # Address literal
        (\[(
            # IPv4
            (\d{1,3}(\.\d{1,3}){3})
            |
            # IPv6
            (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){7})
            |
            (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?)
            |
            (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){5}:\d{1,3}(\.\d{1,3}){3})
            |
            (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3}:)?\d{1,3}(\.\d{1,3}){3})
            |
            # General address
            ([a-z0-9-]*[a-z0-9]:[\x21-\x5A\x5E-\x7E]+)
        )\])
        |
        # Domain
        ((?!.{256,})(([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))(\.([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))*)
    )
    $
    """,
    re.VERBOSE | re.UNICODE | re.IGNORECASE,
)


def validate(email_str: object) -> bool:
    """Validate a string as a RFC6531 email address.

    Args:
        email_str: The candidate value. Anything that is not a ``str`` is
            rejected.

    Returns:
        ``True`` when the whole string matches ``RFC6531_REGEX`` and the
        local part is at most 64 characters, the domain at most 253
        characters, and every dot-separated domain label at most 63
        characters; ``False`` otherwise. Always a real ``bool``, never a
        match object.
    """
    if not isinstance(email_str, str):
        return False
    # fullmatch, not match: "$" also matches just before a trailing "\n",
    # which would let "user@example.com\n" through.
    match = RFC6531_REGEX.fullmatch(email_str)
    if match is None:
        return False
    local, domain = match.group("local", "domain")
    # Lengths are counted in code points, which never exceeds the octet
    # count, so these limits can only reject addresses that are over the
    # 64 / 253 / 63 octet limits; they never reject one that is within them.
    if len(local) > 64 or len(domain) > 253:
        return False
    return all(len(label) <= 63 for label in domain.split("."))


if __name__ == "__main__":
    # Micro-benchmark, only run when the module is executed as a script.
    import timeit

    N = 100_000
    tests = (("basic", "user@example.com"),)

    print("benchmarking")
    for name, val in tests:
        all_times = timeit.repeat(
            f"validate({val!r})", globals=globals(), repeat=3, number=N
        )
        print(f"{name} (valid={validate(val)}): {int(min(all_times) / N * 10**9)}ns")
# Addresses that the RFC 6531 validator is expected to accept.
# Every entry is its own tuple element: adjacent string literals without a
# comma concatenate silently and would merge two cases into one.
@pytest.mark.parametrize(
    "emailstr",
    (
        r"simple@example.com",
        r"very.common@example.com",
        r"FirstName.LastName@EasierReading.org",
        r"x@example.com",
        r"long.email-address-with-hyphens@and.subdomains.example.com",
        r"user.name+tag+sorting@example.com",
        r"name/surname@example.com",
        r"admin@example",
        r"example@s.example",
        r'" "@example.org',
        r'"john..doe"@example.org',
        r"mailhost!username@example.org",
        r'"very.(),:;<>[]\".VERY.\"very@\\ \"very\".unusual"@strange.example.com',
        r"user%example.com@example.org",
        r"user-@example.org",
        r"postmaster@[123.123.123.123]",
        r"postmaster@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
        r"_test@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
        r"I❤️CHOCOLATE@example.com",
        r"用户@例子.广告",
        r"ಬೆಂಬಲ@ಡೇಟಾಮೇಲ್.ಭಾರತ",
        r"अजय@डाटा.भारत",
        r"квіточка@пошта.укр",
        r"χρήστης@παράδειγμα.ελ",
        r"Dörte@Sörensen.example.com",
        r"коля@пример.рф",
    ),
)
def test_simple_positive_cases(emailstr):
    """Each listed address must validate as an RFC 6531 email address."""
    assert validate(emailstr)