-
Notifications
You must be signed in to change notification settings - Fork 80
/
Copy pathhomoglyphs.py
81 lines (73 loc) · 2.36 KB
/
homoglyphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# From https://github.com/mindcrypt/uriDeep
# Every line of the dictionary is read as: first character = reference
# character, remaining characters = glyphs that can be mistaken for it.
# The file holds non-ASCII glyphs, so the encoding is pinned instead of being
# left to the platform default, and the handle is closed once parsed.
with open("uriDeep/data/deepDiccConfusables.txt", encoding="utf-8") as source:
    confusables = {line[0]: list(line.rstrip()[1:]) for line in source}

# Remove some of them (too much FP)
confusables["t"].remove("i")
confusables["o"].remove("0")
confusables["0"].remove("o")
confusables["1"].remove("l")
confusables["l"].remove("1")

# Extra accented look-alikes
confusables["u"] += ["ü", "û"]
confusables["o"] += ["ö", "ô"]

# AFNIC specific
confusables["a"] += ["à", "á", "â", "ã", "ä", "å", "æ"]
confusables["c"] += ["ç"]
confusables["e"] += ["è", "é", "ê", "ë"]
confusables["i"] += ["ì", "í", "î", "ï"]
confusables["n"] += ["ñ"]
confusables["o"] += ["ò", "ó", "ô", "õ", "ö"]
confusables["u"] += ["ù", "ú", "û", "ü"]
confusables["y"] += ["ý", "ÿ"]

# Reverse lookup: confusable glyph -> reference character.
# A glyph listed under several reference characters keeps the last one seen.
rev_confusables = {}
for key, values in confusables.items():
    for value in values:
        rev_confusables[value] = key
    # A reference character always maps to itself, even when it has no
    # confusable glyph, so plain characters survive the reverse mapping.
    rev_confusables[key] = key
# From Alexa top 1M
# Only the text after the first comma of each row is kept (rows are assumed
# to look like "rank,domain" -- TODO confirm against the actual CSV).
# The names are stored in a set because they are only used for membership
# tests; a list would make every lookup scan the whole collection.
with open("top-1m.csv") as alexa_1m:
    alexa = set()
    for line in alexa_1m:
        _, sep, name = line.strip().partition(",")
        if not sep:
            # Blank or malformed row (no comma): nothing to record.
            continue
        alexa.add(name)
# domains_uniq.txt obtained from CN / SAN extraction of Certificate Transparency List
#
# Sources (corresponding to the 2020/03/22):
# - Google 'Argon2020', ~ 10%
# - DigiCert Log Server, full
# - DigiCert Log Server 2, full
# - Cloudflare 'Nimbus2020' Log, full
# - Cloudflare 'Nimbus2021' Log, full
# - Cloudflare 'Nimbus2022' Log, full
# - Cloudflare 'Nimbus2023' Log, full
# - Let's Encrypt 'Oak2020' log, full
# - Let's Encrypt 'Oak2021' log, full
# - Let's Encrypt 'Oak2022' log, full
#
# For each punycode ("xn--") entry: decode it, map every character back to
# its reference character through `rev_confusables`, and print the entry when
# the resulting name is present in `alexa`.
with open("domains_uniq.txt") as domains:
    for i, domain in enumerate(domains):
        # Verbose output
        if i % 10000000 == 0:
            print("Current: %d" % i)

        # Dummy filtering of IDN domains
        if "xn--" not in domain:
            continue

        name = domain.rstrip()
        try:
            decoded = name.encode("ascii").decode("idna")
        except UnicodeError:
            print("[WARNING] Skip %s" % name)
            # Actually skip the entry: falling through would reuse the
            # `decoded` value of a previous line (or fail with NameError
            # when the very first IDN entry is the invalid one).
            continue

        pieces = []
        for c in decoded:
            if c in (".", "-"):
                # Label separators and hyphens are kept untouched.
                pieces.append(c)
                continue
            orig = rev_confusables.get(c)
            if orig is None:
                # Character with no known reference: not a homoglyph candidate.
                break
            pieces.append(orig)
        else:
            # Reached only when every character was mapped (no `break`).
            unconfuse = "".join(pieces)
            if unconfuse in alexa:
                print("%s (%s - https://%s)" % (unconfuse, decoded, name))