1
1
import re
2
+
2
3
from .base import RegexBasedDetector
3
4
4
- class EmailAddressDetector (RegexBasedDetector ):
5
- """Email Address Detector.
6
5
7
- This class is designed to efficiently and accurately detect email addresses within given text. It primarily
8
- validates the general format of email addresses, and does not adhere strictly to email format standards such as RFC 5322.
6
+ class EmailAddressDetector (RegexBasedDetector ):
7
+ """
8
+ A detector for identifying email addresses within text. It uses regular expressions to
9
+ focus on general email structures, not strictly adhering to standards like RFC 5322.
10
+ Designed for efficient and broad detection, it also has some limitations.
9
11
10
- Key Features:
11
- - Ignores common, non-security-threatening email addresses to enhance precision.
12
+ Features:
13
+ - Detects a wide range of email formats efficiently.
14
+ - Ignores common, non-critical emails to minimize false positives.
12
15
13
16
Limitations:
14
- - Despite robust detection mechanisms, the class is not infallible and may not cover all edge cases .
15
- - It does not support some examples from RFC 6530 , e.g., email addresses with Greek alphabets .
17
+ - May miss edge cases or unconventional email formats.
18
+ - Not compliant with advanced formats, e.g., RFC 6530 non-Latin emails.
16
19
17
- References:
20
+ Regular Expression:
21
+ Utilizes a regex pattern focusing on typical email components: local part, domain, TLD.
22
+ Excludes predefined whitelist emails to reduce false positives.
23
+
24
+ References:
18
25
- https://en.wikipedia.org/wiki/Email_address
19
26
- https://stackoverflow.com/a/14321045
20
27
"""
21
28
secret_type = 'Email Address'
22
29
23
-
24
30
# Excludes whitelist email addresses from detection to reduce false positives.
31
+
25
32
26
33
base_pattern = r"""
27
34
[\w+-]+ # Local part before the @ symbol
@@ -32,21 +39,23 @@ class EmailAddressDetector(RegexBasedDetector):
32
39
(?:\.[a-zA-Z]{2,4}) # TLD part
33
40
"""
34
41
# Pattern Breakdown:
35
- # 1. [\w+-]+: Matches one or more of a-z, A-Z, _, +, -
42
+ # 1. [\w+-]+: Matches one or more of a-z, A-Z, 0-9, _, +, -
36
43
# Represents the local part of the email address before the @ symbol.
37
44
# 2. (?:\.[\w+-]+)*: Matches zero or more groups, each a . (dot) followed by one or more of a-z, A-Z, 0-9, _, +, -
38
45
# Allows for dot-separated words in the local part of the email address.
39
46
# 3. @: Matches the @ symbol.
40
- # 4. [\w+-]+: Matches one or more of a-z, A-Z, _, +, -
47
+ # 4. [\w+-]+: Matches one or more of a-z, A-Z, 0-9, _, +, -
41
48
# Represents the domain part of the email address after the @ symbol.
42
49
# 5. (?:\.[\w+-]+)*: Matches zero or more groups, each a . (dot) followed by one or more of a-z, A-Z, 0-9, _, +, -
43
50
# Allows for dot-separated words in the domain part of the email address.
44
51
# 6. (?:\.[a-zA-Z]{2,4}): Matches a . (dot) followed by 2 to 4 letters (a-z, A-Z)
45
52
# Represents the TLD (top-level domain) part of the email address.
46
53
47
- deny_pattern = r"(?!" + "|" .join (re .escape (email ) for email in whitelist ) + r"$)" + base_pattern
54
+ deny_pattern = r'(?!' \
55
+ + '|' .join (re .escape (email ) for email in whitelist ) \
56
+ + r'$)' + base_pattern
48
57
# Combines the base pattern with a negative lookahead to exclude whitelist email addresses.
49
58
50
59
denylist = [
51
- re .compile (r"\b" + deny_pattern + r"\b" , flags = re .VERBOSE )
60
+ re .compile (r'\b' + deny_pattern + r'\b' , flags = re .VERBOSE ),
52
61
]
0 commit comments