Skip to content

Add --format-email to perform full validation on "email" and "idn-email" formats #460

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Unreleased
----------

.. vendor-insert-here
- Add ``--format-email`` option to allow full validation of email/idn-email formats

0.29.0
------
Expand Down
25 changes: 24 additions & 1 deletion src/check_jsonschema/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
import jsonschema.validators
import regress

from .implementations import validate_rfc3339, validate_time
from .implementations import (
validate_rfc3339,
validate_rfc5321,
validate_rfc6531,
validate_time,
)

# all known format strings except for a selection from draft3 which have either
# been renamed or removed:
Expand Down Expand Up @@ -39,6 +44,21 @@
)


class EmailImplementation:
    """Format checks for the "email" and "idn-email" formats.

    Both checks only apply to strings: any non-string instance is reported
    as valid, and strings are handed to the matching validator function.
    """

    def __init__(self) -> None:
        """Create the checker; it holds no state."""

    def check_format_email(self, instance: t.Any) -> bool:
        """Return True for non-strings, else the result of validate_rfc5321."""
        return validate_rfc5321(instance) if isinstance(instance, str) else True

    def check_format_idn_email(self, instance: t.Any) -> bool:
        """Return True for non-strings, else the result of validate_rfc6531."""
        return validate_rfc6531(instance) if isinstance(instance, str) else True


class RegexVariantName(enum.Enum):
default = "default"
python = "python"
Expand Down Expand Up @@ -101,7 +121,10 @@ def make_format_checker(

# replace the regex check
del checker.checkers["regex"]
email_impl = EmailImplementation()
regex_impl = RegexImplementation(opts.regex_variant)
checker.checks("email")(email_impl.check_format_email)
checker.checks("idn-email")(email_impl.check_format_idn_email)
checker.checks("regex")(regex_impl.check_format)
checker.checks("date-time")(validate_rfc3339)
checker.checks("time")(validate_time)
Expand Down
4 changes: 3 additions & 1 deletion src/check_jsonschema/formats/implementations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .iso8601_time import validate as validate_time
from .rfc3339 import validate as validate_rfc3339
from .rfc5321 import validate as validate_rfc5321
from .rfc6531 import validate as validate_rfc6531

__all__ = ("validate_rfc3339", "validate_time")
__all__ = ("validate_rfc3339", "validate_rfc5321", "validate_rfc6531", "validate_time")
63 changes: 63 additions & 0 deletions src/check_jsonschema/formats/implementations/rfc5321.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import re

# ([!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*|"([]!#-[^-~ \t]|(\\[\t -~]))+")
# @
# ([!#-'*+/-9=?A-Z^-~-]+(\.[!#-'*+/-9=?A-Z^-~-]+)*|\[[\t -Z^-~]*])
#
# [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-] == Alphanumeric characters and most special characters except [ (),.:;<>@\[\]\t]
# [a-zA-Z0-9 !#$%&'()*+,./:;<=>?@\[\]^_`{|}~\t-] == All printable characters except for " and \
# [\t -~] == All printable characters
# [a-zA-Z0-9 !"#$%&'()*+,./:;<=>?@^_`{|}~\t-] == All printable characters except for the following characters []\
RFC5321_REGEX = re.compile(
r"""
^
(?P<local>
[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
|
"(?:[a-zA-Z0-9 !#$%&'()*+,./:;<=>?@\[\]^_`{|}~\t-]|\\[\t -~])+"
)
@
(?P<domain>
[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
|
\[[a-zA-Z0-9 !"#$%&'()*+,./:;<=>?@^_`{|}~\t-]*\]
)
$
""",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm finding this regex a bit difficult to read. In particular, I'm seeing some unconventional range expressions in the character classes, like /-9, -Z, and ^-~.
These are valid, but they aren't the way character classes are typically written. Perhaps other people have an expert and intuitive knowledge of what chr(ord(" ")+1) will be, but I definitely don't.

Can these be rewritten such that the suite of characters matched is more obvious to a reader? For example, rather than [/-9], I would much rather see [/0-9]. The fact that the lowercase letters are captured with ^-~ caught me particularly off-guard.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair. I'm not the original author of these regexes either (links in PR description). I'll see what I can do about cleaning up those character classes though.

Copy link
Author

@trzejos trzejos Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sirosen

I had some time this week to revisit this, and reworked the simpler RFC5321 validation. I also added in some length checks as well and tried validating it against the examples in this wikipedia page: https://en.wikipedia.org/wiki/Email_address#Examples

It validated/invalidated as expected except for a couple cases:

  • I❤️[email protected] was incorrectly found invalid, likely because of UTF-8
    • This is actually the correct behavior. UTF-8 email addresses should only be allowed in the idn-email format
  • i.like.underscores@but_they_are_not_allowed_in_this_part was incorrectly found valid, because the regex allows underscores in the domain part.

Let me know if the regex is easier to understand now, and whether you think we need to handle non-ASCII strings like that UTF-8 one.

I have not revisited the idn-email validator yet

re.VERBOSE | re.ASCII,
)


# A host-name label: letters and digits, with hyphens allowed only in the
# interior (the "sub-domain" rule of RFC 5321 section 4.1.2).
_LDH_LABEL = re.compile(r"[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?")


def validate(email_str: object) -> bool:
    """Validate a string as a RFC5321 email address.

    The address must match ``RFC5321_REGEX`` in full and additionally
    satisfy these limits:

    * the local part is at most 64 characters,
    * the domain is at most 253 characters,
    * every dot-separated part of the domain is at most 63 characters,
    * unless the domain is a bracketed address literal, every label consists
      only of letters, digits and interior hyphens.

    :param email_str: the value to check; anything that is not a ``str`` is
        rejected
    :returns: ``True`` if the value is an acceptable address, else ``False``
    """
    if not isinstance(email_str, str):
        return False
    # fullmatch rather than match: the "$" anchor in the pattern also matches
    # just before a trailing newline, which would accept a newline-terminated
    # address.
    match = RFC5321_REGEX.fullmatch(email_str)
    if match is None:
        return False
    local, domain = match.group("local", "domain")
    # Local part of email address is limited to 64 octets
    if len(local) > 64:
        return False
    # Domain names are limited to 253 octets
    if len(domain) > 253:
        return False
    # Address literals ("[...]") are not host names, so the label syntax
    # check does not apply to them; the length check is kept for all parts.
    is_literal = domain.startswith("[")
    for domain_part in domain.split("."):
        # DNS Labels are limited to 63 octets
        if len(domain_part) > 63:
            return False
        # The pattern's domain class is broader than host-name syntax (it
        # admits "_" among others), so labels are re-checked here.
        if not is_literal and _LDH_LABEL.fullmatch(domain_part) is None:
            return False
    return True


if __name__ == "__main__":
    # Micro-benchmark: time validate() on each sample and report the best
    # per-call duration in nanoseconds.
    import timeit

    N = 100_000
    tests = (("basic", "[email protected]"),)

    print("benchmarking")
    for name, val in tests:
        statement = f"validate({val!r})"
        timings = timeit.repeat(statement, globals=globals(), repeat=3, number=N)
        fastest_ns = int(min(timings) / N * 10**9)
        print(f"{name} (valid={validate(val)}): {fastest_ns}ns")
61 changes: 61 additions & 0 deletions src/check_jsonschema/formats/implementations/rfc6531.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re

# Notes on the pattern below:
# - The hyphen in the local-part character class is escaped. Written as
#   "+-/" it is parsed as a range from "+" to "/", which also admits ","
#   and "." and therefore accepts consecutive dots in an unquoted local part.
# - re.IGNORECASE is required because the classes only list lower-case ASCII
#   letters; without it any upper-case ASCII letter is rejected.
# - The pattern ends in \Z rather than $, because $ also matches just before
#   a trailing newline.
# - The "local" and "domain" groups are named so validate() can apply length
#   limits; group numbering is otherwise unchanged.
RFC6531_REGEX = re.compile(
    r"""
    ^
    # local part
    (?P<local>
        ([0-9a-z!#$%&'*+\-\/=?^_`\{|\}~\u0080-\U0010FFFF]+(\.[0-9a-z!#$%&'*+\-\/=?^_`\{|\}~\u0080-\U0010FFFF]+)*)
        |
        # quoted string
        "([\x20-\x21\x23-\x5B\x5D-\x7E\u0080-\U0010FFFF]|\\[\x20-\x7E])*"
    )
    @
    # Domain/address
    (?P<domain>
        # Address literal
        (\[(
            # IPv4
            (\d{1,3}(\.\d{1,3}){3})
            |
            # IPv6
            (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){7})
            |
            (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,5})?)
            |
            (IPv6:[0-9a-f]{1,4}(:[0-9a-f]{1,4}){5}:\d{1,3}(\.\d{1,3}){3})
            |
            (IPv6:([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3})?::([0-9a-f]{1,4}(:[0-9a-f]{1,4}){0,3}:)?\d{1,3}(\.\d{1,3}){3})
            |
            # General address
            ([a-z0-9-]*[a-z0-9]:[\x21-\x5A\x5E-\x7E]+)
        )\])
        |
        # Domain
        ((?!.{256,})(([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))(\.([0-9a-z\u0080-\U0010FFFF]([0-9a-z-\u0080-\U0010FFFF]*[0-9a-z\u0080-\U0010FFFF])?))*)
    )
    \Z
    """,
    re.VERBOSE | re.UNICODE | re.IGNORECASE,
)


def validate(email_str: object) -> bool:
    """Validate a string as a RFC6531 email address.

    The address must match ``RFC6531_REGEX`` and additionally satisfy these
    limits:

    * the local part is at most 64 octets when encoded as UTF-8,
    * unless the domain is a bracketed address literal, every dot-separated
      label is at most 63 characters.

    The label limit is counted in code points, which is a lower bound on the
    encoded label length; it rejects labels that are certainly too long
    without having to IDNA-encode the domain.

    :param email_str: the value to check; anything that is not a ``str`` is
        rejected
    :returns: ``True`` if the value is an acceptable address, else ``False``
    """
    if not isinstance(email_str, str):
        return False
    match = RFC6531_REGEX.match(email_str)
    if match is None:
        return False
    local, domain = match.group("local", "domain")
    # "surrogatepass" keeps a lone surrogate in the input from raising
    # UnicodeEncodeError while the octets are counted.
    if len(local.encode("utf-8", errors="surrogatepass")) > 64:
        return False
    # Address literals are not made of DNS labels.
    if domain.startswith("["):
        return True
    return all(len(label) <= 63 for label in domain.split("."))


if __name__ == "__main__":
    # Micro-benchmark: time validate() on each sample and report the best
    # per-call duration in nanoseconds.
    import timeit

    N = 100_000
    tests = (("basic", "[email protected]"),)

    print("benchmarking")
    for name, val in tests:
        statement = f"validate({val!r})"
        timings = timeit.repeat(statement, globals=globals(), repeat=3, number=N)
        fastest_ns = int(min(timings) / N * 10**9)
        print(f"{name} (valid={validate(val)}): {fastest_ns}ns")
57 changes: 57 additions & 0 deletions tests/unit/formats/test_rfc5321.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pytest

from check_jsonschema.formats.implementations.rfc5321 import validate


# Sample addresses the RFC 5321 validator is expected to accept.
RFC5321_ACCEPTED = (
    r"[email protected]",
    r"[email protected]",
    r"[email protected]",
    r"[email protected]",
    r"[email protected]",
    r"[email protected]",
    r"name/[email protected]",
    r"admin@example",
    r"[email protected]",
    r'" "@example.org',
    r'"john..doe"@example.org',
    r"[email protected]",
    r'"very.(),:;<>[]\".VERY.\"very@\\ \"very\".unusual"@strange.example.com',
    r"user%[email protected]",
    r"[email protected]",
    r"postmaster@[123.123.123.123]",
    r"postmaster@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
    r"_test@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
)


@pytest.mark.parametrize("emailstr", RFC5321_ACCEPTED)
def test_simple_positive_cases(emailstr):
    """Every sample address is accepted by the validator."""
    outcome = validate(emailstr)
    assert outcome


# Sample addresses the RFC 5321 validator is expected to reject: non-ASCII
# addresses, malformed syntax, and over-long local parts or domain labels.
RFC5321_REJECTED = (
    r"I❤️[email protected]",
    r"用户@例子.广告",
    r"ಬೆಂಬಲ@ಡೇಟಾಮೇಲ್.ಭಾರತ",
    r"अजय@डाटा.भारत",
    r"квіточка@пошта.укр",
    r"χρήστης@παράδειγμα.ελ",
    r"Dörte@Sörensen.example.com",
    r"коля@пример.рф",
    r"abc.example.com",
    r"a@b@[email protected]",
    r'a"b(c)d,e:f;g<h>i[j\k][email protected]',
    r'just"not"[email protected]',
    r'this is"not\[email protected]',
    r"this\ still\"not\\[email protected]",
    r"1234567890123456789012345678901234567890123456789012345678901234+x@example.com",
    r"i.like.underscores@but_they_are_not_allowed_in_this_part",
    r"trythis@123456789012345678901234567890123456789012345678901234567890123456.com",
    r"another@12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234.com",
)


@pytest.mark.parametrize("emailstr", RFC5321_REJECTED)
def test_simple_negative_case(emailstr):
    """Every sample address is rejected by the validator."""
    outcome = validate(emailstr)
    assert not outcome
59 changes: 59 additions & 0 deletions tests/unit/formats/test_rfc6531.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest

from check_jsonschema.formats.implementations.rfc6531 import validate


@pytest.mark.parametrize(
    "emailstr",
    (
        r"[email protected]",
        r"[email protected]",
        r"[email protected]",
        r"[email protected]",
        r"[email protected]",
        r"[email protected]",
        r"name/[email protected]",
        r"admin@example",
        r"[email protected]",
        r'" "@example.org',
        r'"john..doe"@example.org',
        r"[email protected]",
        # These two are separate cases. They were previously wrapped in
        # parentheses without a comma, which concatenated them into a single
        # string that was never a meaningful address.
        r'"very.(),:;<>[]\".VERY.\"very@\\ \"very\".unusual"@strange.example.com',
        r"user%[email protected]",
        r"[email protected]",
        r"postmaster@[123.123.123.123]",
        r"postmaster@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
        r"_test@[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]",
        r"I❤️[email protected]",
        r"用户@例子.广告",
        r"ಬೆಂಬಲ@ಡೇಟಾಮೇಲ್.ಭಾರತ",
        r"अजय@डाटा.भारत",
        r"квіточка@пошта.укр",
        r"χρήστης@παράδειγμα.ελ",
        r"Dörte@Sörensen.example.com",
        r"коля@пример.рф",
    ),
)
def test_simple_positive_cases(emailstr):
    """Every sample address, ASCII or internationalized, is accepted."""
    assert validate(emailstr)


# Sample addresses the RFC 6531 validator is expected to reject: malformed
# syntax and over-long local parts or domain labels.
RFC6531_REJECTED = (
    r"abc.example.com",
    r"a@b@[email protected]",
    r'a"b(c)d,e:f;g<h>i[j\k][email protected]',
    r'just"not"[email protected]',
    r'this is"not\[email protected]',
    r"this\ still\"not\\[email protected]",
    r"1234567890123456789012345678901234567890123456789012345678901234+x@example.com",
    r"i.like.underscores@but_they_are_not_allowed_in_this_part",
    r"trythis@123456789012345678901234567890123456789012345678901234567890123456.com",
    r"another@12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234.com",
)


@pytest.mark.parametrize("emailstr", RFC6531_REJECTED)
def test_simple_negative_case(emailstr):
    """Every sample address is rejected by the validator."""
    outcome = validate(emailstr)
    assert not outcome
Loading