forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtypogrify
executable file
·216 lines (166 loc) · 12.4 KB
/
typogrify
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python3
import argparse
import os
import fnmatch
import html
import regex
import smartypants
# Named aliases for the invisible/spacing characters used by the typography rules
WORD_JOINER = "\N{WORD JOINER}"  # U+2060
HAIR_SPACE = "\N{HAIR SPACE}"  # U+200A
NBSP = "\N{NO-BREAK SPACE}"  # U+00A0
def _find_xhtml_files(target):
    """Return the set of files to process for one command-line target.

    A directory is walked recursively and every ``*.xhtml`` file below it is
    returned; any other path is returned as-is, as a single-element set.
    """
    if not os.path.isdir(target):
        return {target}

    return {
        os.path.join(root, filename)
        for root, _, filenames in os.walk(target)
        for filename in fnmatch.filter(filenames, "*.xhtml")
    }


def _unescape_entities(xhtml):
    """Convert HTML entities to Unicode characters, keeping lone ampersands escaped.

    html.unescape() also turns ``&amp;`` into a bare ``&``; a free-standing
    ampersand (one with whitespace on both sides) is therefore re-escaped so
    the result stays well-formed.
    """
    xhtml = html.unescape(xhtml)
    # NOTE(review): only whitespace-delimited ampersands are restored here;
    # html.unescape() also decodes &lt;, &gt; and ampersands inside words.
    return regex.sub(r"(\s)&(\s)", r"\1&amp;\2", xhtml)


def _typogrify(xhtml, smart_quotes=True):
    """Apply the scriptable typography rules to an XHTML string and return the result.

    xhtml: the document text to process.
    smart_quotes: when true, entities are converted and Smartypants is run to
        curl the quotes before the remaining rules are applied.
    """
    processed_xhtml = xhtml

    if smart_quotes:
        # Some Gutenberg works have a weird single quote style: `this is a quote'. Clean that up here before running Smartypants.
        processed_xhtml = processed_xhtml.replace("`", "'")

        # First, convert entities. Sometimes Gutenberg has entities instead of straight quotes.
        processed_xhtml = _unescape_entities(processed_xhtml)

        processed_xhtml = smartypants.smartypants(processed_xhtml) # Attr.u *should* output unicode characters instead of HTML entities, but it doesn't work

        # Convert entities again, since Smartypants emits entities
        processed_xhtml = _unescape_entities(processed_xhtml)

    # Replace sequential em dashes with the two or three em dash character
    processed_xhtml = processed_xhtml.replace("———", "⸻")
    processed_xhtml = processed_xhtml.replace("——", "⸺")

    # Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
    processed_xhtml = regex.sub(r"—”([a-z])", r"—“\1", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"—’([a-z])", r"—‘\1", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"-“</p>", r"—”</p>", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"‘”</p>", r"’{}”</p>".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)

    # Remove spaces between en and em dashes
    # Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
    processed_xhtml = regex.sub(r"([^\.\s])\s*([–—])\s*", r"\1\2", processed_xhtml)

    # First, remove stray word joiners
    processed_xhtml = processed_xhtml.replace(WORD_JOINER, "")

    # Some older texts use the ,— construct; remove that archaism
    processed_xhtml = processed_xhtml.replace(",—", "—")

    # Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
    processed_xhtml = regex.sub(r"([^\s{}{}{}])([—⸻])".format(WORD_JOINER, NBSP, HAIR_SPACE), r"\1{}\2".format(WORD_JOINER), processed_xhtml, flags=regex.IGNORECASE)

    # Add en dashes; don't replace a match that is within an html tag, since ids and attrs often contain the pattern DIGIT-DIGIT
    processed_xhtml = regex.sub(r"(?<!<[^>]*)([0-9]+)\-([0-9]+)", r"\1–\2", processed_xhtml)

    # Add a word joiner on both sides of en dashes
    processed_xhtml = regex.sub(r"{}?–{}?".format(WORD_JOINER, WORD_JOINER), r"{}–{}".format(WORD_JOINER, WORD_JOINER), processed_xhtml)

    # Add a word joiner if eliding a word with a two-em-dash
    # Word joiner isn't necessary if punctuation follows
    # Note the \p{{P}}. We must double-curl {} because that's the escape sequence when using .format(). The actual regex should be \p{P} to match punctuation
    processed_xhtml = regex.sub(r"([^\s{}{}{}])⸺".format(WORD_JOINER, NBSP, HAIR_SPACE), r"\1{}⸺".format(WORD_JOINER), processed_xhtml)
    processed_xhtml = regex.sub(r"⸺([^\s\p{{P}}{}])".format(WORD_JOINER), r"⸺{}\1".format(WORD_JOINER), processed_xhtml)

    # Remove word joiners from following opening tags--they're usually never correct
    processed_xhtml = regex.sub(r"<([a-z]+)([^>]*?)>{}".format(WORD_JOINER), r"<\1\2>", processed_xhtml, flags=regex.IGNORECASE)

    # Finally fix some other mistakes
    processed_xhtml = processed_xhtml.replace("—-", "—")

    # Replace Mr., Mrs., and other abbreviations, and include a non-breaking space
    processed_xhtml = regex.sub(r"\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+", r"\1.{}".format(NBSP), processed_xhtml)
    processed_xhtml = regex.sub(r"<abbr>(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.</abbr>?\s+", r"<abbr>\1.</abbr>{}".format(NBSP), processed_xhtml)

    processed_xhtml = regex.sub(r"\bNo\.\s+([0-9]+)", r"No.{}\1".format(NBSP), processed_xhtml)
    processed_xhtml = regex.sub(r"<abbr>No\.</abbr>\s+", r"<abbr>No.</abbr>{}".format(NBSP), processed_xhtml)

    processed_xhtml = regex.sub(r"([0-9]+)\s<abbr", r"\1{}<abbr".format(NBSP), processed_xhtml)

    # A note on spacing:
    #                                       ibooks  kindle (mobi7)
    # thin space U+2009:                    yes     yes
    # word joiner U+2060:                   no      yes
    # zero-width no-break space U+FEFF:     yes     yes
    # narrow no-break space U+202F:         no      yes
    # punctuation space U+2008:             yes     yes

    # Fix common abbreviations
    processed_xhtml = regex.sub(r"(\s)‘a’(\s)", r"\1’a’\2", processed_xhtml, flags=regex.IGNORECASE)

    # Years
    processed_xhtml = regex.sub(r"‘([0-9]{2,}[^a-zA-Z0-9’])", r"’\1", processed_xhtml, flags=regex.IGNORECASE)

    processed_xhtml = regex.sub(r"‘([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]was|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on)\b", r"’\1", processed_xhtml)

    processed_xhtml = regex.sub(r"\b‘e\b", r"’e", processed_xhtml)
    processed_xhtml = regex.sub(r"\b‘([Ee])r\b", r"’\1r", processed_xhtml)
    processed_xhtml = regex.sub(r"\b‘([Ee])re\b", r"’\1re", processed_xhtml)
    processed_xhtml = regex.sub(r"\b‘([Aa])ppen\b", r"’\1ppen", processed_xhtml)
    processed_xhtml = regex.sub(r"\b‘([Aa])ven\b", r"’\1ven", processed_xhtml) # 'aven't

    # nth (as in nth degree)
    processed_xhtml = regex.sub(r"\bn\-?th\b", r"<i>n</i>th", processed_xhtml)

    # Remove double spaces that use nbsp for spacing
    processed_xhtml = regex.sub(r"{}[{} ]+".format(NBSP, NBSP), r" ", processed_xhtml)
    processed_xhtml = regex.sub(r" [{} ]+".format(NBSP), r" ", processed_xhtml)

    # House style: remove spacing from common Latinisms
    processed_xhtml = regex.sub(r"([Ii])\.\s+e\.", r"\1.e.", processed_xhtml)
    processed_xhtml = regex.sub(r"([Ee])\.\s+g\.", r"\1.g.", processed_xhtml)

    # WARNING! This and below can remove the ending period of a sentence, if AD or BC is the last word! We need interactive S&R for this
    processed_xhtml = regex.sub(r"([\d\s])A\.\s+D\.", r"\1AD", processed_xhtml)
    processed_xhtml = regex.sub(r"B\.\s+C\.", r"BC", processed_xhtml)

    # Put spacing next to close quotes
    processed_xhtml = regex.sub(r"“[\s{}]*‘".format(NBSP), r"“{}‘".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"’[\s{}]*”".format(NBSP), r"’{}”".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"“[\s{}]*’".format(NBSP), r"“{}’".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"‘[\s{}]*“".format(NBSP), r"‘{}“".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)

    # We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ’e said.
    processed_xhtml = regex.sub(r"”[\s{}]*’([^a-zA-Z])".format(NBSP), r"”{}’\1".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)

    # Fix ellipses spacing
    processed_xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?\.".format(NBSP, NBSP), r".{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?".format(NBSP, NBSP), r"{}… ".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"<p([^>]*?)>{}…".format(HAIR_SPACE), r"<p\1>…", processed_xhtml, flags=regex.IGNORECASE)

    # Remove spaces between opening tags and ellipses
    processed_xhtml = regex.sub(r"(<[a-z0-9]+[^<]+?>)[\s{}]?…".format(NBSP), r"\1…", processed_xhtml, flags=regex.IGNORECASE)

    # Remove spaces between closing tags and ellipses
    processed_xhtml = regex.sub(r"…[\s{}]?(</[a-z0-9]+>)".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"…[\s{}]+([\)”’])".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"([\(“‘])[\s{}]+…".format(NBSP), r"\1…", processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"…[\s{}]?([\!\?\.\;\,])".format(NBSP), r"…{}\1".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"([\!\?\.\;”’])[\s{}]?…".format(NBSP), r"\1{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"\,[\s{}]?…".format(NBSP), r",{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)

    # Remove spaces between ellipses and endnotes directly after
    processed_xhtml = regex.sub(r"…[\s{}]?(<a[^>]+?id=\"note-[0-9]+\"[^>]*?>)".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)

    # Add non-breaking spaces between amounts with an abbreviated unit. E.g. 8 oz., 10 lbs.
    processed_xhtml = regex.sub(r"([0-9])\s+([a-z]{1,3}\.)", r"\1{}\2".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)

    # Add non-breaking spaces between Arabic numbers and AM/PM
    processed_xhtml = regex.sub(r"([0-9])\s+([ap])\.m\.", r"\1{}\2.m.".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)
    processed_xhtml = regex.sub(r"([0-9])\s+<abbr([^>]*?)>([ap])\.m\.", r"\1{}<abbr\2>\3.m.".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)

    processed_xhtml = processed_xhtml.replace("Ph.D", "PhD")
    processed_xhtml = regex.sub(r"P\.\s*S\.", r"P.S.", processed_xhtml)

    # Fractions
    # A fraction is only replaced when it is not glued to another digit or slash,
    # so that e.g. 11/20 or 1/2/1900 are left alone instead of being mangled.
    fractions = {
        "1/4": "¼",
        "1/2": "½",
        "3/4": "¾",
        "1/3": "⅓",
        "2/3": "⅔",
        "1/5": "⅕",
        "2/5": "⅖",
        "3/5": "⅗",
        "4/5": "⅘",
        "1/6": "⅙",
        "5/6": "⅚",
        "1/8": "⅛",
        "3/8": "⅜",
        "5/8": "⅝",
        "7/8": "⅞",
    }
    processed_xhtml = regex.sub(
        r"(?<![0-9/])({})(?![0-9/])".format("|".join(fractions)),
        lambda match: fractions[match.group(1)],
        processed_xhtml,
    )

    # Remove spaces between whole numbers and fractions
    processed_xhtml = regex.sub(r"([0-9,]+)\s+([¼½¾⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞])", r"\1\2", processed_xhtml)

    # Use the Unicode Minus glyph (U+2212) for negative numbers
    processed_xhtml = regex.sub(r"([\s>])\-([0-9,]+)", r"\1−\2", processed_xhtml)

    return processed_xhtml


def main():
    """Command-line entry point: typogrify every TARGET in place.

    Each TARGET is either a single file or a directory that is searched
    recursively for ``*.xhtml`` files. A file is only rewritten when the
    typography rules actually changed its contents.
    """
    parser = argparse.ArgumentParser(description="Apply some scriptable typography rules from the Standard Ebooks typography manual to XHTML files.")
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("-n", "--no-quotes", dest="quotes", action="store_false", help="don't convert to smart quotes before doing other adjustments")
    parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
    args = parser.parse_args()

    if args.verbose and not args.quotes:
        print("Skipping smart quotes.")

    for target in args.targets:
        target = os.path.abspath(target)

        if args.verbose:
            print("Processing {} ...".format(target), end="", flush=True)

        for filename in _find_xhtml_files(target):
            # "r+" lets us read the file and then overwrite it through the same handle
            with open(filename, "r+", encoding="utf-8") as file:
                xhtml = file.read()
                processed_xhtml = _typogrify(xhtml, args.quotes)

                if processed_xhtml != xhtml:
                    file.seek(0)
                    file.write(processed_xhtml)
                    # Drop any leftover tail in case the new text is shorter than the old
                    file.truncate()

        if args.verbose:
            print(" OK")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()