#!/usr/bin/env python3

import argparse
import os
import fnmatch
import regex
import se

DICTIONARY_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "words")
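# Illustrative invocations, inferred from the argparse setup in main() below; the file and
# directory paths are hypothetical examples, not part of this script:
#
#     modernize-spelling --verbose chapter-1.xhtml
#     modernize-spelling --no-hyphens src/epub/text/
#
# Each TARGET may be a single XHTML file, or a directory that is searched for *.xhtml files.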
def main():
	parser = argparse.ArgumentParser(description="Modernize spelling of some archaic words, and replace words that may be archaically compounded with a dash to a more modern spelling. For example, replace \"ash-tray\" with \"ashtray\".")
	parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
	parser.add_argument("-n", "--no-hyphens", dest="modernize_hyphenation", action="store_false", help="don't modernize hyphenation")
	parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
	args = parser.parse_args()

	try:
		dictionary = set(line.strip().lower() for line in open(DICTIONARY_FILE_PATH))
	except Exception:
		se.print_error("Couldn't open words file at {}".format(DICTIONARY_FILE_PATH))
		exit(1)

	for target in args.targets:
		target = os.path.abspath(target)

		if args.verbose:
			print("Processing {} ...".format(target), end="", flush=True)

		target_filenames = set()
		if os.path.isdir(target):
			for root, _, filenames in os.walk(target):
				for filename in fnmatch.filter(filenames, "*.xhtml"):
					target_filenames.add(os.path.join(root, filename))
		else:
			target_filenames.add(target)

		for filename in target_filenames:
			with open(filename, "r+", encoding="utf-8") as file:
				xhtml = file.read()
				new_xhtml = xhtml

				# What language are we using?
				language = regex.search(r"<html[^>]+?xml:lang=\"([^\"]+)\"", xhtml)
				if language is None or (language.group(1) != "en-US" and language.group(1) != "en-GB"):
					if args.verbose:
						print("\n\t", end="", flush=True)
					se.print_error("No valid xml:lang attribute in <html> root. Only en-US and en-GB are supported. File: {}".format(filename))
					exit(1)

				new_xhtml = modernize_spelling(new_xhtml, language.group(1))

				if args.modernize_hyphenation:
					new_xhtml = modernize_hyphenation(new_xhtml, dictionary)

				if new_xhtml != xhtml:
					file.seek(0)
					file.write(new_xhtml)
					file.truncate()

		if args.verbose:
			print(" OK")

def modernize_hyphenation(xhtml, dictionary):
	# Easy fix for a common case
	xhtml = regex.sub(r"\b([Nn])ow-a-days\b", r"\1owadays", xhtml) # now-a-days -> nowadays

	result = regex.findall(r"\b[^\W\d_]+\-[^\W\d_]+\b", xhtml)

	for word in set(result): # set() removes duplicates
		new_word = word.replace("-", "").lower()

		if new_word in dictionary:
			# To preserve capitalization, we get the left-hand side of the compound, then the right-hand side,
			# then we replace the word that way.
			lhs = regex.sub(r"\-.+$", r"", word)
			rhs = regex.sub(r"^.+?\-", r"", word)

			xhtml = regex.sub(r"" + lhs + "-" + rhs, lhs + rhs, xhtml)

	# Quick fix for a common case
	xhtml = xhtml.replace("z3998:nonfiction", "z3998:non-fiction")
	xhtml = regex.sub(r"\b([Uu])nChristian\b", r"\1nchristian", xhtml)

	return xhtml

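# Illustrative behavior of the de-hyphenation pass in modernize_hyphenation() above. This
# sketch assumes "today" appears in data/words and "wellknown" does not; the actual word
# list decides each case:
#
#     "To-day we met a well-known author."  ->  "Today we met a well-known author."
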
def modernize_spelling(xhtml, language):
	# ADDING NEW WORDS TO THIS LIST:
	# A good way to check if a word is "archaic" is to do a Google N-Gram search: https://books.google.com/ngrams/graph?case_insensitive=on&year_start=1800&year_end=2000&smoothing=3
	# Remember that en-US and en-GB differ significantly, and just because a word might seem strange to you, doesn't mean it's not the common case in the other variant.
	# If Google N-Gram shows that a word has declined significantly in usage in BOTH en-US and en-GB (or the SE editor makes an exception) then it may be a good candidate to add to this list.

	xhtml = regex.sub(r"\b([Dd])evelope\b", r"\1evelop", xhtml) # develope -> develop
	xhtml = regex.sub(r"\b([Oo])ker\b", r"\1cher", xhtml) # oker -> ocher
	xhtml = regex.sub(r"\b([Ww])ellnigh\b", r"\1ell-nigh", xhtml) # wellnigh -> well-nigh
	xhtml = regex.sub(r"\b([Tt]he|[Aa]nd|[Oo]r) what not(?! to)\b", r"\1 whatnot", xhtml) # what not -> whatnot
	xhtml = regex.sub(r"\b([Gg])ood\-bye?\b", r"\1oodbye", xhtml) # good-by -> goodbye
	xhtml = regex.sub(r"\b([Hh])indoo", r"\1indu", xhtml) # hindoo -> hindu
	xhtml = regex.sub(r"\b([Hh])ind(u|oo)stanee", r"\1industani", xhtml) # hindoostanee -> hindustani
	xhtml = regex.sub(r"\b([Ee])xpence", r"\1xpense", xhtml) # expence -> expense
	xhtml = regex.sub(r"\b([Ll])otos", r"\1otus", xhtml) # lotos -> lotus
	xhtml = regex.sub(r"\b([Ss])collop", r"\1callop", xhtml) # scollop -> scallop
	xhtml = regex.sub(r"\b([Ss])ubtil", r"\1ubtle", xhtml) # subtil -> subtle
	xhtml = regex.sub(r"\bQuoiff", r"Coif", xhtml) # quoiff -> coif
	xhtml = regex.sub(r"\bquoiff", r"coif", xhtml) # quoiff -> coif
	xhtml = regex.sub(r"\bIndorse", r"Endorse", xhtml) # indorse -> endorse
	xhtml = regex.sub(r"\bindorse", r"endorse", xhtml) # indorse -> endorse
	xhtml = regex.sub(r"\bPhantasie", r"Fantasy", xhtml) # phantasie -> fantasy
	xhtml = regex.sub(r"\bphantasie", r"fantasy", xhtml) # phantasie -> fantasy
	xhtml = regex.sub(r"\b([Mm])enage\b", r"\1énage", xhtml) # menage -> ménage
	xhtml = regex.sub(r"([Hh])ypothenuse", r"\1ypotenuse", xhtml) # hypothenuse -> hypotenuse
	xhtml = regex.sub(r"[‘’]([Bb])us\b", r"\1us", xhtml) # ’bus -> bus
	xhtml = regex.sub(r"([Nn])aïve", r"\1aive", xhtml) # naïve -> naive
	xhtml = regex.sub(r"([Nn])a[ïi]vet[ée]", r"\1aivete", xhtml) # naïveté -> naivete
	xhtml = regex.sub(r"&c\.", r"etc.", xhtml) # &c. -> etc.
	xhtml = regex.sub(r"([Pp])rot[ée]g[ée]", r"\1rotégé", xhtml) # protege -> protégé
	xhtml = regex.sub(r"([Tt])ete-a-tete", r"\1ête-à-tête", xhtml) # tete-a-tete -> tête-à-tête
	xhtml = regex.sub(r"([Vv])is-a-vis", r"\1is-à-vis", xhtml) # vis-a-vis -> vis-à-vis
	xhtml = regex.sub(r"([Ff])acade", r"\1açade", xhtml) # facade -> façade
	xhtml = regex.sub(r"([Cc])h?ateau(s?\b)", r"\1hâteau\2", xhtml) # chateau -> château
	xhtml = regex.sub(r"([Hh])abitue", r"\1abitué", xhtml) # habitue -> habitué
	xhtml = regex.sub(r"\b([Bb])lase\b", r"\1lasé", xhtml) # blase -> blasé
	xhtml = regex.sub(r"\b([Cc])afe\b", r"\1afé", xhtml) # cafe -> café
	xhtml = regex.sub(r"\b([Cc])afes\b", r"\1afés", xhtml) # cafes -> cafés; We break up cafe so that we don't catch 'cafeteria'
	xhtml = regex.sub(r"([Mm])êlée", r"\1elee", xhtml) # mêlée -> melee
	xhtml = regex.sub(r"\b([Ff])ete([sd]?)\b", r"\1ête\2", xhtml) # fete/fetes/feted -> fête/fêtes/fêted
	xhtml = regex.sub(r"\b([Rr])ôle\b", r"\1ole", xhtml) # rôle -> role
	xhtml = regex.sub(r"\b([Cc])oö", r"\1oo", xhtml) # coö -> coo (as in coöperate)
	xhtml = regex.sub(r"\b([Rr])eë", r"\1ee", xhtml) # reë -> ree (as in reëvaluate)
	xhtml = regex.sub(r"\b([Dd])aïs\b", r"\1ais", xhtml) # daïs -> dais
	xhtml = regex.sub(r"\b([Cc])oup\-de\-grace", r"\1oup-de-grâce", xhtml) # coup-de-grace -> coup-de-grâce
	xhtml = regex.sub(r"\b([Cc])anape", r"\1anapé", xhtml) # canape -> canapé
	xhtml = regex.sub(r"\b([Pp])recis\b", r"\1récis", xhtml) # precis -> précis
	xhtml = regex.sub(r"\b([Gg])ood\-by([^e])", r"\1oodbye\2", xhtml) # good-by -> goodbye
	xhtml = regex.sub(r"\b([Gg])ood\-night", r"\1ood night", xhtml) # good-night -> good night
	xhtml = regex.sub(r"\b([Gg])ood\-morning", r"\1ood morning", xhtml) # good-morning -> good morning
	xhtml = regex.sub(r"\b([Gg])ood\-evening", r"\1ood evening", xhtml) # good-evening -> good evening
	xhtml = regex.sub(r"\b([Gg])ood\-day", r"\1ood day", xhtml) # good-day -> good day
	xhtml = regex.sub(r"\b([Gg])ood\-afternoon", r"\1ood afternoon", xhtml) # good-afternoon -> good afternoon
	xhtml = regex.sub(r"\b([Bb])ete noir", r"\1ête noir", xhtml) # bete noir -> bête noir
	xhtml = regex.sub(r"\bEclat\b", r"Éclat", xhtml) # eclat -> éclat
	xhtml = regex.sub(r"\beclat\b", r"éclat", xhtml) # eclat -> éclat
	xhtml = regex.sub(r"\ba la\b", r"à la", xhtml) # a la -> à la
	xhtml = regex.sub(r"\ba propos\b", r"apropos", xhtml) # a propos -> apropos
	xhtml = regex.sub(r"\bper cent(s?)\b", r"percent\1", xhtml) # per cent -> percent
	xhtml = regex.sub(r"\bpercent\.,\b", r"percent,", xhtml) # per cent. -> percent
	xhtml = regex.sub(r"\b([Ff])iance", r"\1iancé", xhtml) # fiance -> fiancé
	xhtml = regex.sub(r"\b([Oo])utre\b", r"\1utré", xhtml) # outre -> outré
	xhtml = regex.sub(r"\b([Ff])etich", r"\1etish", xhtml) # fetich -> fetish
	xhtml = regex.sub(r"\b([Pp])igstye\b", r"\1igsty", xhtml) # pigstye -> pigsty
	xhtml = regex.sub(r"\b([Pp])igstyes\b", r"\1igsties", xhtml) # pigstyes -> pigsties
	xhtml = regex.sub(r"\b([Cc])lew(s?)\b", r"\1lue\2", xhtml) # clew -> clue
	xhtml = regex.sub(r"\b[ÀA]\s?propos\b", r"Apropos", xhtml) # à propos -> apropos
	xhtml = regex.sub(r"\b[àa]\s?propos\b", r"apropos", xhtml) # à propos -> apropos
	xhtml = regex.sub(r"\b([Nn])ew comer(s?)\b", r"\1ewcomer\2", xhtml) # new comer -> newcomer
	xhtml = regex.sub(r"\b([Pp])ease\b(?![ \-]pudding)", r"\1eas", xhtml) # pease -> peas (but "pease pudding")
	xhtml = regex.sub(r"\b([Ss])uch like\b", r"\1uchlike", xhtml) # such like -> suchlike
	xhtml = regex.sub(r"\b([Ee])mployé", r"\1mployee", xhtml) # employé -> employee
	xhtml = regex.sub(r"\b(?<!ancien )([Rr])égime", r"\1egime", xhtml) # régime -> regime (but "ancien régime")
	xhtml = regex.sub(r"\b([Bb])urthen", r"\1urden", xhtml) # burthen -> burden
	xhtml = regex.sub(r"\b([Dd])isburthen", r"\1isburden", xhtml) # disburthen -> disburden
	xhtml = regex.sub(r"\b[EÉ]lys[eé]e", r"Élysée", xhtml) # Elysee -> Élysée
	xhtml = regex.sub(r"\b([Ll])aw suit", r"\1awsuit", xhtml) # law suit -> lawsuit
	xhtml = regex.sub(r"\bIncase", r"Encase", xhtml) # incase -> encase
	xhtml = regex.sub(r"\bincase", r"encase", xhtml) # incase -> encase
	xhtml = regex.sub(r"\b([Cc])ocoa-?nut", r"\1oconut", xhtml) # cocoanut / cocoa-nut -> coconut
	xhtml = regex.sub(r"\b([Ww])aggon", r"\1agon", xhtml) # waggon -> wagon
	xhtml = regex.sub(r"\b([Ss])wop", r"\1wap", xhtml) # swop -> swap
	xhtml = regex.sub(r"\b([Ll])acquey", r"\1ackey", xhtml) # lacquey -> lackey
	xhtml = regex.sub(r"\b([Bb])ric-à-brac", r"\1ric-a-brac", xhtml) # bric-à-brac -> bric-a-brac
	xhtml = regex.sub(r"\b([Kk])iosque", r"\1iosk", xhtml) # kiosque -> kiosk
	xhtml = regex.sub(r"\b([Dd])epôt", r"\1epot", xhtml) # depôt -> depot
	xhtml = regex.sub(r"\b([Cc])onnexion", r"\1onnection", xhtml) # connexion -> connection
	xhtml = regex.sub(r"\b([Rr])eflexion", r"\1eflection", xhtml) # reflexion -> reflection
	xhtml = regex.sub(r"\b([Dd])ulness", r"\1ullness", xhtml) # dulness -> dullness
	xhtml = regex.sub(r"\b([Ff])iord", r"\1jord", xhtml) # fiord -> fjord
	xhtml = regex.sub(r"\b([Ff])ulness\b", r"\1ullness", xhtml) # fulness -> fullness (but not for ex. thoughtfulness)

	# Normalize some names
	xhtml = regex.sub(r"Moliere", r"Molière", xhtml) # Moliere -> Molière
	xhtml = regex.sub(r"Tolstoi", r"Tolstoy", xhtml) # Tolstoi -> Tolstoy
	xhtml = regex.sub(r"Buonaparte", r"Bonaparte", xhtml) # Buonaparte -> Bonaparte
	xhtml = regex.sub(r"Shake?spear([^ie])", r"Shakespeare\1", xhtml) # Shakespear/Shakspear -> Shakespeare
	xhtml = regex.sub(r"Raffaelle", r"Raphael", xhtml) # Raffaelle -> Raphael
	xhtml = regex.sub(r"Michael Angelo", r"Michelangelo", xhtml) # Michael Angelo -> Michelangelo
	xhtml = regex.sub(r"\bVergil", r"Virgil", xhtml) # Vergil -> Virgil

	if language == "en-US":
		xhtml = regex.sub(r"\b([Cc])osey", r"\1ozy", xhtml) # cosey -> cozy
		xhtml = regex.sub(r"\b([Mm])anœuvre", r"\1aneuver", xhtml) # manœuvre -> maneuver

	if language == "en-GB":
		xhtml = regex.sub(r"\b([Cc])osey", r"\1osy", xhtml) # cosey -> cosy
		xhtml = regex.sub(r"\b([Mm])anœuvre", r"\1anoeuvre", xhtml) # manœuvre -> manoeuvre

	return xhtml

if __name__ == "__main__":
	main()