forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathendnotes2kindle
executable file
·73 lines (51 loc) · 2.94 KB
/
endnotes2kindle
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
import argparse
import regex
import lxml.etree as etree
import se
# Namespace prefix -> URI map, passed as `namespaces=` to XPath queries so that
# prefixed names such as `epub:type` can be resolved.
XHTML_NAMESPACES = {
    "xhtml": "http://www.w3.org/1999/xhtml",
    "epub": "http://www.idpf.org/2007/ops",
    "z3998": "http://www.daisy.org/z3998/2012/vocab/structure/",
    "se": "https://standardebooks.org/vocab/1.0",
}
def _convert_endnote(note):
    """Return the Kindle-friendly markup for a single endnote `<li>` element.

    The note's trailing reference link is re-labelled with the note's
    `data-se-note-number` value and moved to the front of the first paragraph,
    the wrapping `<li>` is dropped, and the note's `id` is placed on that first
    paragraph.

    Args:
        note: The lxml element of the endnote list item.

    Returns:
        The converted note as an XHTML string fragment.

    Raises:
        SystemExit: With status 1, after printing an error, if the note has no
            `id`, no `data-se-note-number`, or no reference link.
    """
    note_id = note.get("id")
    note_number = note.get("data-se-note-number")

    # Both attributes are concatenated into the output below; fail with a
    # readable message instead of a TypeError if either one is absent.
    if note_id is None:
        se.print_error("Endnote is missing an id attribute")
        raise SystemExit(1)

    if note_number is None:
        se.print_error("Can't find note number for #{}".format(note_id))
        raise SystemExit(1)

    # First, fixup the reference link for this endnote
    anchors = note.xpath("p[last()]/a[last()]")
    if not anchors:
        se.print_error("Can't find ref link for #{}".format(note_id))
        raise SystemExit(1)

    ref_link = etree.tostring(anchors[0], encoding="unicode", pretty_print=True, with_tail=False)
    ref_link = ref_link.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "").strip()

    # Callable replacements are used throughout so that backslashes in document
    # text are inserted literally rather than parsed as template escapes.
    new_ref_link = regex.sub(r">.*?</a>", lambda _: ">" + note_number + "</a>.", ref_link)

    # Now remove the wrapping li node from the note
    note_text = regex.sub(
        r"^<li[^>]*?>(.*)</li>$",
        r"\1",
        etree.tostring(note, encoding="unicode", pretty_print=True, with_tail=False),
        flags=regex.IGNORECASE | regex.DOTALL,
    )

    # Remove the old ref link *before* inserting the new one: if the old link's
    # text already equals the note number, the new link contains the old one as
    # a substring and would otherwise be deleted along with it.
    note_text = note_text.replace(ref_link, "")

    # Insert our new ref link at the start of the leading <p>, which also
    # receives the note's id.
    note_text, inserted = regex.subn(
        r"^\s*<p([^>]*?)>",
        lambda match: "<p" + match.group(1) + " id=\"" + note_id + "\">" + new_ref_link + " ",
        note_text,
    )

    # Sometimes there is no leading <p> tag (for example, if the endnote starts
    # with a blockquote). If that's the case, just insert one in front.
    if inserted == 0:
        note_text = "<p id=\"" + note_id + "\">" + new_ref_link + "</p>" + note_text

    # Trim trailing spaces left over after removing the ref link
    note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip()

    # Sometimes ref links are in their own p tag--remove that too
    return regex.sub(r"<p>\s*</p>", "", note_text)


def _convert_endnotes_xhtml(xhtml):
    """Return `xhtml` with its `<ol>...</ol>` span replaced by converted endnotes.

    Args:
        xhtml: The full text of an endnotes XHTML file.

    Returns:
        The document text, with everything from the first `<ol>` to the last
        `</ol>` replaced by the converted notes, one per line.

    Raises:
        SystemExit: Propagated from `_convert_endnote` when a note is malformed.
    """
    # We have to remove the default namespace declaration from our document, otherwise
    # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
    tree = etree.fromstring(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "").encode("utf-8"))

    notes = tree.xpath("//li[@epub:type=\"rearnote\" or @epub:type=\"footnote\"]", namespaces=XHTML_NAMESPACES)

    processed_endnotes = "".join(_convert_endnote(note) + "\n" for note in notes)

    # All done with endnotes, so drop them back in. The replacement is a
    # callable so that backslashes in the note text are written out verbatim.
    return regex.sub(r"<ol>.*</ol>", lambda _: processed_endnotes, xhtml, flags=regex.IGNORECASE | regex.DOTALL)


def main():
    """Convert each endnotes file named on the command line, in place.

    Every FILE argument is read as UTF-8, its endnotes are rewritten into the
    Kindle-friendly popup form, and the result overwrites the original file.

    Raises:
        SystemExit: With status 1 if a note cannot be converted; the file being
            processed at that point is left unmodified.
    """
    parser = argparse.ArgumentParser(description="Convert epub-friendly endnotes to Kindle-friendly popup endnotes.")
    parser.add_argument("files", metavar="FILE", nargs="+", help="an XHTML endnotes file in Standard Ebooks endnotes file format")
    args = parser.parse_args()

    for filename in args.files:
        with open(filename, "r+", encoding="utf-8") as file:
            # Convert fully before touching the file so a failure cannot leave
            # it half-written.
            xhtml = _convert_endnotes_xhtml(file.read())

            file.seek(0)
            file.write(xhtml)
            # The converted text may be shorter than the original contents.
            file.truncate()
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()