forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild-kobo
executable file
·148 lines (121 loc) · 5.36 KB
/
build-kobo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
#Includes functions based on code from the Calibre Kobo Touch Extended Driver: https://www.mobileread.com/forums/showthread.php?t=211135
import argparse
import regex
import os
import fnmatch
from copy import deepcopy
import lxml.etree as etree
# Counters used to build the "kobo.<paragraph>.<segment>" ids of the generated
# koboSpan elements. They are module-level state, mutated through `global`
# statements by the span helpers and reset to 1 by main() for every XHTML file.
paragraph_counter = 1
segment_counter = 1
# Prefix -> namespace URI map handed to xpath() when main() looks up the <body> element.
XHTML_NAMESPACES = {"xhtml": "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops", "z3998": "http://www.daisy.org/z3998/2012/vocab/structure/", "se": "https://standardebooks.org/vocab/1.0"}
def append_kobo_spans_from_text(node, text):
    """Split text into sentence-like segments and append one koboSpan per segment to node.

    Every generated <span> carries the id "kobo.<paragraph_counter>.<segment_counter>"
    and the class "koboSpan"; the module-level segment_counter is incremented
    after each span is appended.

    Args:
        node: Element that receives the new <span> children.
        text: Text to convert into spans, or None.

    Returns:
        False if text consists solely of whitespace (nothing is appended, so
        the caller is expected to put the text back itself); True otherwise,
        including when text is None.
    """
    global paragraph_counter
    global segment_counter

    if text is None:
        return True

    # If text is only whitespace, don't add spans.
    # fullmatch() is used instead of match(r"^\s+$", ..., MULTILINE): in
    # multi-line mode "$" also matches right before any newline, so text that
    # merely *starts* with a blank line (e.g. " \nSome sentence.") used to be
    # mistaken for whitespace-only and was left without spans.
    if regex.fullmatch(r"\s+", text):
        return False

    # split text in sentences
    groups = regex.split(r'(.*?[\.\!\?\:][\'"\u201d\u2019]?\s*)', text, flags=regex.MULTILINE)
    # remove empty strings resulting from split()
    groups = [g for g in groups if g != ""]

    # To match Kobo KePubs, the trailing whitespace needs to be
    # prepended to the next group. Probably equivalent to make sure
    # the space stays in the span at the end.
    # add each sentence in its own span
    for group in groups:
        span = etree.Element("{%s}span" % ("http://www.w3.org/1999/xhtml", ), attrib={"id": "kobo.{0}.{1}".format(paragraph_counter, segment_counter), "class": "koboSpan"})
        span.text = group
        node.append(span)
        segment_counter += 1

    return True
def add_kobo_spans_to_node(node):
    """Recursively rebuild node so that its text content sits inside koboSpan elements.

    The node's own text and the tail text of each of its children are handed
    to append_kobo_spans_from_text(); children are processed recursively. The
    module-level paragraph_counter / segment_counter are advanced along the way
    so that every generated span id is distinct within the document.

    Args:
        node: The element to process.

    Returns:
        node itself (emptied and refilled in place), or, when node is an <img>
        element, a new koboSpan <span> element that contains it.
    """
    global paragraph_counter
    global segment_counter

    # Comments and processing instructions are not descended into; only their
    # tail is dropped (the caller has already saved it).
    if node is None or isinstance(node, (etree._Comment, etree._ProcessingInstruction)):
        node.tail = None
        return node

    # Special case: an <img> tag (namespace-qualified or not) is wrapped whole
    # in a span of its own instead of being rebuilt.
    tag_match = regex.search(r'^(?:\{[^\}]+\})?(\w+)$', node.tag)
    if tag_match and tag_match.group(1) == "img":
        wrapper = etree.Element("{%s}span" % ("http://www.w3.org/1999/xhtml", ), attrib={"id": "kobo.{0}.{1}".format(paragraph_counter, segment_counter), "class": "koboSpan"})
        wrapper.append(node)
        return wrapper

    # Remember everything the node holds, because clear() below wipes it.
    saved_text = node.text
    saved_children = deepcopy(node.getchildren())
    saved_attrs = {name: node.get(name) for name in node.keys()}

    # Start from an empty node that only has its original attributes back.
    node.clear()
    for name, value in saved_attrs.items():
        node.set(name, value)

    # Leading text becomes spans; if no spans were added the text is restored as is.
    if saved_text is not None and not append_kobo_spans_from_text(node, saved_text):
        node.text = saved_text

    for child in saved_children:
        # Detach the tail so it can be turned into spans that follow the child.
        tail = child.tail
        child.tail = None
        node.append(add_kobo_spans_to_node(child))

        if tail is not None:
            # Tail text starts a new "paragraph" for numbering purposes.
            paragraph_counter += 1
            segment_counter = 1
            if not append_kobo_spans_from_text(node, tail):
                # No spans were added: undo the increment and put the tail
                # back on the child that was just appended.
                paragraph_counter -= 1
                node[-1].tail = tail

        paragraph_counter += 1
        segment_counter = 1

    return node
def main():
    """Command-line entry point: rewrite Standard Ebooks source directories in place for Kobo.

    For every directory given on the command line, the tree is walked and:

    * each content.opf gets a <meta property="se:transform">kobo</meta> element
      inserted in front of every "<dc:publisher" occurrence;
    * each *.xhtml file other than toc.xhtml has the text of its <body>
      wrapped in koboSpan elements, and is written back with an XML
      declaration and the XHTML default namespace restored. In endnotes.xhtml
      the "↩" referrer character is first replaced by "«".
    """
    global paragraph_counter
    global segment_counter

    parser = argparse.ArgumentParser(description="Convert files in a Standard Ebooks source directory to be Kobo-compatible.")
    parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
    args = parser.parse_args()

    # Clean-up applied, in this order, to the markup serialised by lxml: strip
    # the "html:" prefix and its declaration from the spans that were added,
    # then put back the XML declaration and the default namespace on <html>.
    serialisation_fixes = (
        (r"<html:span", "<span"),
        (r"html:span>", "span>"),
        (r"<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span"),
        (r"<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\""),
    )

    for source_dir in args.directories:
        for root, _, filenames in os.walk(os.path.abspath(source_dir)):
            for opf_name in fnmatch.filter(filenames, "content.opf"):
                opf_path = os.path.join(root, opf_name)
                with open(opf_path, "r+", encoding="utf-8") as opf_file:
                    metadata = regex.sub(r"<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher", opf_file.read())

                    # Overwrite the file in place.
                    opf_file.seek(0)
                    opf_file.write(metadata)
                    opf_file.truncate()

            for xhtml_name in fnmatch.filter(filenames, "*.xhtml"):
                # Span numbering restarts for every file.
                paragraph_counter = 1
                segment_counter = 1

                #Don't add spans to the ToC
                if xhtml_name == "toc.xhtml":
                    continue

                xhtml_path = os.path.join(root, xhtml_name)
                with open(xhtml_path, "r+", encoding="utf-8") as xhtml_file:
                    markup = xhtml_file.read()

                    #Kobos don't have fonts that support the ↩ character in endnotes, so replace it with «
                    if xhtml_name == "endnotes.xhtml":
                        markup = regex.sub(r"epub:type=\"se:referrer\">↩</a>", "epub:type=\"se:referrer\">«</a>", markup)

                    #We have to remove the default namespace declaration from our document, otherwise
                    #xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
                    without_default_ns = markup.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
                    tree = etree.fromstring(without_default_ns.encode())

                    body = tree.xpath("./body", namespaces=XHTML_NAMESPACES)[0]
                    add_kobo_spans_to_node(body)

                    markup = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False)
                    for pattern, replacement in serialisation_fixes:
                        markup = regex.sub(pattern, replacement, markup)

                    # Overwrite the file in place.
                    xhtml_file.seek(0)
                    xhtml_file.write(markup)
                    xhtml_file.truncate()
# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()