Skip to content

Commit 98800ba

Browse files
committed
Add explicit checks for internationalized domain name characters invalid under UTS-46 to improve the error message
1 parent 936aead commit 98800ba

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

email_validator/syntax.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo
460460
raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
461461

462462

463+
def uts46_valid_char(char: str) -> bool:
    """Return whether a single character looks acceptable to UTS-46 mapping.

    This is a heuristic pre-check, not an implementation of UTS-46. The
    rules were derived by exhaustively searching for the characters
    rejected by

        for c in (chr(i) for i in range(0x110000)):
            idna.uts46_remap(c, std3_rules=False, transitional=False)

    and are "pretty close" to that behavior rather than exact.

    The order of the tests matters: the explicit allow-list must run before
    the general-category and decomposition tests, because it names
    characters that those later tests would otherwise reject.

    Args:
        char: A string of exactly one code point. ``ord()`` raises
            ``TypeError`` for any other length.

    Returns:
        ``False`` if the character should be reported as invalid in a
        domain name, ``True`` otherwise.
    """
    c = ord(char)

    if 0x80 <= c <= 0x9f:
        # C1 control characters (U+0080..U+009F). These lie just past the
        # end of ASCII (U+007F) and have general category Cc, which the
        # category test below does not cover, so reject them explicitly.
        return False

    elif ((0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E))
            or c in (0x00AD, 0x2064, 0xFF0E)
            or 0x200B <= c <= 0x200D
            or 0x1BCA0 <= c <= 0x1BCA3):
        # Characters that are permitted but fall into one of the
        # tests below, so they have to be accepted before reaching them.
        return True

    elif unicodedata.category(char) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
        # There are a bunch of Zs characters including regular space
        # that are allowed by UTS46 but are not allowed in domain
        # names anyway.
        #
        # There are some Cn (unassigned) characters that the idna
        # package doesn't reject but we can, I think.
        return False

    elif "002E" in unicodedata.decomposition(char).split(" "):
        # Characters that decompose into a sequence with a dot
        # (U+002E FULL STOP), e.g. U+2488 DIGIT ONE FULL STOP.
        return False

    return True
491+
492+
463493
class DomainNameValidationResult(TypedDict):
464494
ascii_domain: str
465495
domain: str
@@ -484,6 +514,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
484514
# they may not be valid, safe, or sensible Unicode strings.
485515
check_unsafe_chars(domain)
486516

517+
# Reject characters that would be rejected by UTS-46 normalization next but
518+
# with an error message under our control.
519+
bad_chars = {
520+
safe_character_display(c) for c in domain
521+
if not uts46_valid_char(c)
522+
}
523+
if bad_chars:
524+
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
525+
487526
# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
488527
# and converting all label separators (the period/full stop, fullwidth full stop,
489528
# ideographic full stop, and halfwidth ideographic full stop) to regular dots.

tests/test_syntax.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,9 +402,7 @@ def test_domain_literal() -> None:
402402
('[email protected]', 'An email address cannot start with a period.'),
403403
('[email protected]', 'An email address cannot have two periods in a row.'),
404404
('[email protected]', 'An email address cannot have a period immediately before the @-sign.'),
405-
('me@⒈wouldbeinvalid.com',
406-
"The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed "
407-
"at position 1 in '⒈wouldbeinvalid.com')."),
405+
('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters: '⒈'."),
408406
('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."),
409407
('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."),
410408
('@example.com', 'There must be something before the @-sign.'),

0 commit comments

Comments
 (0)