@@ -460,6 +460,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo
460
460
raise EmailSyntaxError ("An email address cannot have a period and a hyphen next to each other." )
461
461
462
462
463
def uts46_valid_char(char: str) -> bool:
    """Heuristically decide whether a single character passes UTS-46 remapping.

    The rules below were derived empirically: every code point was run
    through ``idna.uts46_remap(c, std3_rules=False, transitional=False)``
    and the rejections were summarised into the range and category checks
    that follow. They approximate that behaviour closely but are not an
    exact reproduction of it.

    Args:
        char: A string containing exactly one character (``ord`` is applied
            to it, so any other length raises ``TypeError``).

    Returns:
        ``True`` if the character is considered acceptable, ``False`` if it
        should be rejected.
    """
    codepoint = ord(char)

    # The U+0080..U+009F block (C1 control codes) is always rejected.
    if 0x80 <= codepoint <= 0x9F:
        return False

    # Most of U+2010..U+2060 is acceptable, apart from two excluded
    # sub-ranges: U+2024..U+2026 and U+2028..U+202E.
    in_allowed_punctuation_window = (
        0x2010 <= codepoint <= 0x2060
        and (codepoint < 0x2024 or codepoint > 0x2026)
        and (codepoint < 0x2028 or codepoint > 0x202E)
    )

    # These code points are permitted even though the category or
    # decomposition tests further down would otherwise reject them, so they
    # have to be accepted before those tests run.
    explicitly_permitted = (
        in_allowed_punctuation_window
        or codepoint in (0x00AD, 0x2064, 0xFF0E)
        or 0x200B <= codepoint <= 0x200D
        or 0x1BCA0 <= codepoint <= 0x1BCA3
    )
    if explicitly_permitted:
        return True

    # Format, unassigned, private-use, surrogate and separator characters.
    # A number of Zs characters (including the plain space) are accepted by
    # UTS-46 but are not allowed in domain names anyway, and a few Cn
    # (unassigned) characters are not rejected by the idna package but are
    # rejected here.
    if unicodedata.category(char) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
        return False

    # Finally, reject anything whose decomposition mapping contains a
    # full stop (U+002E).
    return "002E" not in unicodedata.decomposition(char).split(" ")
491
+
492
+
463
493
class DomainNameValidationResult (TypedDict ):
464
494
ascii_domain : str
465
495
domain : str
@@ -484,6 +514,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
484
514
# they may not be valid, safe, or sensible Unicode strings.
485
515
check_unsafe_chars (domain )
486
516
517
+ # Reject characters that would be rejected by UTS-46 normalization next but
518
+ # with an error message under our control.
519
+ bad_chars = {
520
+ safe_character_display (c ) for c in domain
521
+ if not uts46_valid_char (c )
522
+ }
523
+ if bad_chars :
524
+ raise EmailSyntaxError ("The part after the @-sign contains invalid characters: " + ", " .join (sorted (bad_chars )) + "." )
525
+
487
526
# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
488
527
# and converting all label separators (the period/full stop, fullwidth full stop,
489
528
# ideographic full stop, and halfwidth ideographic full stop) to regular dots.
0 commit comments