#!/usr/bin/env python3

import argparse
import os
import fnmatch
import regex
import se

DICTIONARY_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "words")
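# Illustrative invocations, inferred from the argparse setup in main() below; the file and
# directory paths are hypothetical examples, not part of this script:
#
#     modernize-spelling --verbose chapter-1.xhtml
#     modernize-spelling --no-hyphens src/epub/text/
#
# Each TARGET may be a single XHTML file, or a directory that is searched for *.xhtml files.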
def main():
	parser = argparse.ArgumentParser(description="Modernize spelling of some archaic words, and replace words that may be archaically compounded with a dash to a more modern spelling. For example, replace \"ash-tray\" with \"ashtray\".")
	parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
	parser.add_argument("-n", "--no-hyphens", dest="modernize_hyphenation", action="store_false", help="don't modernize hyphenation")
	parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
	args = parser.parse_args()

	try:
		dictionary = set(line.strip().lower() for line in open(DICTIONARY_FILE_PATH))
	except Exception:
		se.print_error("Couldn't open words file at {}".format(DICTIONARY_FILE_PATH))
		exit(1)

	for target in args.targets:
		target = os.path.abspath(target)

		if args.verbose:
			print("Processing {} ...".format(target), end="", flush=True)

		target_filenames = set()
		if os.path.isdir(target):
			for root, _, filenames in os.walk(target):
				for filename in fnmatch.filter(filenames, "*.xhtml"):
					target_filenames.add(os.path.join(root, filename))
		else:
			target_filenames.add(target)

		for filename in target_filenames:
			with open(filename, "r+", encoding="utf-8") as file:
				xhtml = file.read()
				new_xhtml = xhtml

				# What language are we using?
				language = regex.search(r"<html[^>]+?xml:lang=\"([^\"]+)\"", xhtml)
				if language is None or (language.group(1) != "en-US" and language.group(1) != "en-GB"):
					if args.verbose:
						print("\n\t", end="", flush=True)
					se.print_error("No valid xml:lang attribute in <html> root. Only en-US and en-GB are supported. File: {}".format(filename))
					exit(1)

				new_xhtml = modernize_spelling(new_xhtml, language.group(1))

				if args.modernize_hyphenation:
					new_xhtml = modernize_hyphenation(new_xhtml, dictionary)

				if new_xhtml != xhtml:
					file.seek(0)
					file.write(new_xhtml)
					file.truncate()

		if args.verbose:
			print(" OK")

def modernize_hyphenation(xhtml, dictionary):
	# Easy fix for a common case
	xhtml = regex.sub(r"\b([Nn])ow-a-days\b", r"\1owadays", xhtml) # now-a-days -> nowadays

	result = regex.findall(r"\b[^\W\d_]+\-[^\W\d_]+\b", xhtml)

	for word in set(result): # set() removes duplicates
		new_word = word.replace("-", "").lower()

		if new_word in dictionary:
			# To preserve capitalization, we get the left-hand side of the compound, then the right-hand side,
			# then we replace the word that way.
			lhs = regex.sub(r"\-.+$", r"", word)
			rhs = regex.sub(r"^.+?\-", r"", word)

			xhtml = regex.sub(r"" + lhs + "-" + rhs, lhs + rhs, xhtml)

	# Quick fix for a common case
	xhtml = xhtml.replace("z3998:nonfiction", "z3998:non-fiction")
	xhtml = regex.sub(r"\b([Uu])nChristian\b", r"\1nchristian", xhtml)

	return xhtml

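# Illustrative behavior of the de-hyphenation pass in modernize_hyphenation() above. This
# sketch assumes "today" appears in data/words and "wellknown" does not; the actual word
# list decides each case:
#
#     "To-day we met a well-known author."  ->  "Today we met a well-known author."
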
def modernize_spelling(xhtml, language):
	# ADDING NEW WORDS TO THIS LIST:
	# A good way to check if a word is "archaic" is to do a Google N-Gram search: https://books.google.com/ngrams/graph?case_insensitive=on&year_start=1800&year_end=2000&smoothing=3
	# Remember that en-US and en-GB differ significantly, and just because a word might seem strange to you, doesn't mean it's not the common case in the other variant.
	# If Google N-Gram shows that a word has declined significantly in usage in BOTH en-US and en-GB (or the SE editor makes an exception) then it may be a good candidate to add to this list.

	xhtml = regex.sub(r"\b([Dd])evelope\b", r"\1evelop", xhtml) # develope -> develop
	xhtml = regex.sub(r"\b([Oo])ker\b", r"\1cher", xhtml) # oker -> ocher
	xhtml = regex.sub(r"\b([Ww])ellnigh\b", r"\1ell-nigh", xhtml) # wellnigh -> well-nigh
	xhtml = regex.sub(r"\b([Tt]he|[Aa]nd|[Oo]r) what not(?! to)\b", r"\1 whatnot", xhtml) # what not -> whatnot
	xhtml = regex.sub(r"\b([Gg])ood\-bye?\b", r"\1oodbye", xhtml) # good-by -> goodbye
	xhtml = regex.sub(r"\b([Hh])indoo", r"\1indu", xhtml) # hindoo -> hindu
	xhtml = regex.sub(r"\b([Hh])ind(u|oo)stanee", r"\1industani", xhtml) # hindoostanee -> hindustani
	xhtml = regex.sub(r"\b([Ee])xpence", r"\1xpense", xhtml) # expence -> expense
	xhtml = regex.sub(r"\b([Ll])otos", r"\1otus", xhtml) # lotos -> lotus
	xhtml = regex.sub(r"\b([Ss])collop", r"\1callop", xhtml) # scollop -> scallop
	xhtml = regex.sub(r"\b([Ss])ubtil", r"\1ubtle", xhtml) # subtil -> subtle
	xhtml = regex.sub(r"\bQuoiff", r"Coif", xhtml) # quoiff -> coif
	xhtml = regex.sub(r"\bquoiff", r"coif", xhtml) # quoiff -> coif
	xhtml = regex.sub(r"\bIndorse", r"Endorse", xhtml) # indorse -> endorse
	xhtml = regex.sub(r"\bindorse", r"endorse", xhtml) # indorse -> endorse
	xhtml = regex.sub(r"\bPhantasie", r"Fantasy", xhtml) # phantasie -> fantasy
	xhtml = regex.sub(r"\bphantasie", r"fantasy", xhtml) # phantasie -> fantasy
	xhtml = regex.sub(r"\b([Mm])enage\b", r"\1énage", xhtml) # menage -> ménage
	xhtml = regex.sub(r"([Hh])ypothenuse", r"\1ypotenuse", xhtml) # hypothenuse -> hypotenuse
	xhtml = regex.sub(r"[‘’]([Bb])us\b", r"\1us", xhtml) # ’bus -> bus
	xhtml = regex.sub(r"([Nn])aïve", r"\1aive", xhtml) # naïve -> naive
	xhtml = regex.sub(r"([Nn])a[ïi]vet[ée]", r"\1aivete", xhtml) # naïveté -> naivete
	xhtml = regex.sub(r"&c\.", r"etc.", xhtml) # &c. -> etc.
	xhtml = regex.sub(r"([Pp])rot[ée]g[ée]", r"\1rotégé", xhtml) # protege -> protégé
	xhtml = regex.sub(r"([Tt])ete-a-tete", r"\1ête-à-tête", xhtml) # tete-a-tete -> tête-à-tête
	xhtml = regex.sub(r"([Vv])is-a-vis", r"\1is-à-vis", xhtml) # vis-a-vis -> vis-à-vis
	xhtml = regex.sub(r"([Ff])acade", r"\1açade", xhtml) # facade -> façade
	xhtml = regex.sub(r"([Cc])h?ateau(s?\b)", r"\1hâteau\2", xhtml) # chateau -> château
	xhtml = regex.sub(r"([Hh])abitue", r"\1abitué", xhtml) # habitue -> habitué
	xhtml = regex.sub(r"\b([Bb])lase\b", r"\1lasé", xhtml) # blase -> blasé
	xhtml = regex.sub(r"\b([Cc])afe\b", r"\1afé", xhtml) # cafe -> café
	xhtml = regex.sub(r"\b([Cc])afes\b", r"\1afés", xhtml) # cafes -> cafés; We break up cafe so that we don't catch 'cafeteria'
	xhtml = regex.sub(r"([Mm])êlée", r"\1elee", xhtml) # mêlée -> melee
	xhtml = regex.sub(r"\b([Ff])ete([sd]?)\b", r"\1ête\2", xhtml) # fete/fetes/feted -> fête/fêtes/fêted
	xhtml = regex.sub(r"\b([Rr])ôle\b", r"\1ole", xhtml) # rôle -> role
	xhtml = regex.sub(r"\b([Cc])oö", r"\1oo", xhtml) # coö -> coo (as in coöperate)
	xhtml = regex.sub(r"\b([Rr])eë", r"\1ee", xhtml) # reë -> ree (as in reëvaluate)
	xhtml = regex.sub(r"\b([Dd])aïs\b", r"\1ais", xhtml) # daïs -> dais
	xhtml = regex.sub(r"\b([Cc])oup\-de\-grace", r"\1oup-de-grâce", xhtml) # coup-de-grace -> coup-de-grâce
	xhtml = regex.sub(r"\b([Cc])anape", r"\1anapé", xhtml) # canape -> canapé
	xhtml = regex.sub(r"\b([Pp])recis\b", r"\1récis", xhtml) # precis -> précis
	xhtml = regex.sub(r"\b([Gg])ood\-by([^e])", r"\1oodbye\2", xhtml) # good-by -> goodbye
	xhtml = regex.sub(r"\b([Gg])ood\-night", r"\1ood night", xhtml) # good-night -> good night
	xhtml = regex.sub(r"\b([Gg])ood\-morning", r"\1ood morning", xhtml) # good-morning -> good morning
	xhtml = regex.sub(r"\b([Gg])ood\-evening", r"\1ood evening", xhtml) # good-evening -> good evening
	xhtml = regex.sub(r"\b([Gg])ood\-day", r"\1ood day", xhtml) # good-day -> good day
	xhtml = regex.sub(r"\b([Gg])ood\-afternoon", r"\1ood afternoon", xhtml) # good-afternoon -> good afternoon
	xhtml = regex.sub(r"\b([Bb])ete noir", r"\1ête noir", xhtml) # bete noir -> bête noir
	xhtml = regex.sub(r"\bEclat\b", r"Éclat", xhtml) # eclat -> éclat
	xhtml = regex.sub(r"\beclat\b", r"éclat", xhtml) # eclat -> éclat
	xhtml = regex.sub(r"\ba la\b", r"à la", xhtml) # a la -> à la
	xhtml = regex.sub(r"\ba propos\b", r"apropos", xhtml) # a propos -> apropos
	xhtml = regex.sub(r"\bper cent(s?)\b", r"percent\1", xhtml) # per cent -> percent
	xhtml = regex.sub(r"\bpercent\.,\b", r"percent,", xhtml) # per cent. -> percent
	xhtml = regex.sub(r"\b([Ff])iance", r"\1iancé", xhtml) # fiance -> fiancé
	xhtml = regex.sub(r"\b([Oo])utre\b", r"\1utré", xhtml) # outre -> outré
	xhtml = regex.sub(r"\b([Ff])etich", r"\1etish", xhtml) # fetich -> fetish
	xhtml = regex.sub(r"\b([Pp])igstye\b", r"\1igsty", xhtml) # pigstye -> pigsty
	xhtml = regex.sub(r"\b([Pp])igstyes\b", r"\1igsties", xhtml) # pigstyes -> pigsties
	xhtml = regex.sub(r"\b([Cc])lew(s?)\b", r"\1lue\2", xhtml) # clew -> clue
	xhtml = regex.sub(r"\b[ÀA]\s?propos\b", r"Apropos", xhtml) # à propos -> apropos
	xhtml = regex.sub(r"\b[àa]\s?propos\b", r"apropos", xhtml) # à propos -> apropos
	xhtml = regex.sub(r"\b([Nn])ew comer(s?)\b", r"\1ewcomer\2", xhtml) # new comer -> newcomer
	xhtml = regex.sub(r"\b([Pp])ease\b(?![ \-]pudding)", r"\1eas", xhtml) # pease -> peas (but "pease pudding")
	xhtml = regex.sub(r"\b([Ss])uch like\b", r"\1uchlike", xhtml) # such like -> suchlike
	xhtml = regex.sub(r"\b([Ee])mployé", r"\1mployee", xhtml) # employé -> employee
	xhtml = regex.sub(r"\b(?<!ancien )([Rr])égime", r"\1egime", xhtml) # régime -> regime (but "ancien régime")
	xhtml = regex.sub(r"\b([Bb])urthen", r"\1urden", xhtml) # burthen -> burden
	xhtml = regex.sub(r"\b([Dd])isburthen", r"\1isburden", xhtml) # disburthen -> disburden
	xhtml = regex.sub(r"\b[EÉ]lys[eé]e", r"Élysée", xhtml) # Elysee -> Élysée
	xhtml = regex.sub(r"\b([Ll])aw suit", r"\1awsuit", xhtml) # law suit -> lawsuit
	xhtml = regex.sub(r"\bIncase", r"Encase", xhtml) # incase -> encase
	xhtml = regex.sub(r"\bincase", r"encase", xhtml) # incase -> encase
	xhtml = regex.sub(r"\b([Cc])ocoa-?nut", r"\1oconut", xhtml) # cocoanut / cocoa-nut -> coconut
	xhtml = regex.sub(r"\b([Ww])aggon", r"\1agon", xhtml) # waggon -> wagon
	xhtml = regex.sub(r"\b([Ss])wop", r"\1wap", xhtml) # swop -> swap
	xhtml = regex.sub(r"\b([Ll])acquey", r"\1ackey", xhtml) # lacquey -> lackey
	xhtml = regex.sub(r"\b([Bb])ric-à-brac", r"\1ric-a-brac", xhtml) # bric-à-brac -> bric-a-brac
	xhtml = regex.sub(r"\b([Kk])iosque", r"\1iosk", xhtml) # kiosque -> kiosk
	xhtml = regex.sub(r"\b([Dd])epôt", r"\1epot", xhtml) # depôt -> depot
	xhtml = regex.sub(r"\b([Cc])onnexion", r"\1onnection", xhtml) # connexion -> connection
	xhtml = regex.sub(r"\b([Rr])eflexion", r"\1eflection", xhtml) # reflexion -> reflection
	xhtml = regex.sub(r"\b([Dd])ulness", r"\1ullness", xhtml) # dulness -> dullness
	xhtml = regex.sub(r"\b([Ff])iord", r"\1jord", xhtml) # fiord -> fjord
	xhtml = regex.sub(r"\b([Ff])ulness\b", r"\1ullness", xhtml) # fulness -> fullness (but not for ex. thoughtfulness)

	# Normalize some names
	xhtml = regex.sub(r"Moliere", r"Molière", xhtml) # Moliere -> Molière
	xhtml = regex.sub(r"Tolstoi", r"Tolstoy", xhtml) # Tolstoi -> Tolstoy
	xhtml = regex.sub(r"Buonaparte", r"Bonaparte", xhtml) # Buonaparte -> Bonaparte
	xhtml = regex.sub(r"Shake?spear([^ie])", r"Shakespeare\1", xhtml) # Shakespear/Shakspear -> Shakespeare
	xhtml = regex.sub(r"Raffaelle", r"Raphael", xhtml) # Raffaelle -> Raphael
	xhtml = regex.sub(r"Michael Angelo", r"Michelangelo", xhtml) # Michael Angelo -> Michelangelo
	xhtml = regex.sub(r"\bVergil", r"Virgil", xhtml) # Vergil -> Virgil

	if language == "en-US":
		xhtml = regex.sub(r"\b([Cc])osey", r"\1ozy", xhtml) # cosey -> cozy
		xhtml = regex.sub(r"\b([Mm])anœuvre", r"\1aneuver", xhtml) # manœuvre -> maneuver

	if language == "en-GB":
		xhtml = regex.sub(r"\b([Cc])osey", r"\1osy", xhtml) # cosey -> cosy
		xhtml = regex.sub(r"\b([Mm])anœuvre", r"\1anoeuvre", xhtml) # manœuvre -> manoeuvre

	return xhtml

if __name__ == "__main__":
	main()