forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoc2kindle
executable file
·45 lines (34 loc) · 1.24 KB
/
toc2kindle
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
import argparse
import sys
import regex
from bs4 import BeautifulSoup
import se
def _substitute_until_stable(pattern, replacement, text):
    """Apply ``pattern.subn(replacement, text)`` repeatedly until a pass replaces nothing.

    Returns the resulting text.
    """
    matches = 1
    while matches > 0:
        text, matches = pattern.subn(replacement, text)
    return text


def main():
    """Flatten a table of contents file, in place, to at most two levels of lists.

    The file named on the command line is read as UTF-8, every ``<ol>`` matched
    by the selector ``ol > li > ol > li > ol`` is unwrapped, the resulting
    ``<li>`` markup is repaired with two regular expressions, and the file is
    overwritten with the result.

    On failure an error is reported through ``se.print_error`` and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Flatten a Standard Ebooks table of contents file to at most two levels deep for Kindle compatibility.")
    # --verbose is accepted on the command line, but nothing in this function reads it.
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("filename", metavar="TOC-FILENAME", help="a Standard Ebooks table of contents file (typically toc.xhtml)")
    args = parser.parse_args()

    try:
        # "r+" requires the file to exist and be writable before any work is done.
        with open(args.filename, "r+", encoding="utf-8") as toc_file:
            xhtml = toc_file.read()

            # Remove every third-level <ol> wrapper while keeping its children.
            soup = BeautifulSoup(xhtml, "lxml")
            for nested_list in soup.select('ol > li > ol > li > ol'):
                nested_list.unwrap()

            xhtml = str(soup)

            # A list item whose link is directly followed by another <li> has no
            # closing tag in between: insert the missing </li>.
            pattern = regex.compile(r"(<li>\s*<a href=\"[^\"]+?\">.+?</a>\s*)<li>", regex.MULTILINE)
            xhtml = _substitute_until_stable(pattern, r"\1</li><li>", xhtml)

            # Collapse runs of consecutive closing </li> tags into a single one.
            pattern = regex.compile(r"</li>\s*</li>", regex.MULTILINE)
            xhtml = _substitute_until_stable(pattern, "</li>", xhtml)

            # Overwrite from the start and cut off any leftover tail of the old content.
            toc_file.seek(0)
            toc_file.write(xhtml)
            toc_file.truncate()
    except (OSError, UnicodeError):
        # Opening, decoding, or writing the file failed.
        se.print_error("Couldn't read file: {}".format(args.filename))
        sys.exit(1)
    except Exception as ex:
        # Anything else failed while transforming the markup; report it as such
        # instead of blaming file access.
        se.print_error("Couldn't process file: {} ({})".format(args.filename, ex))
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()