# build-kobo

#!/usr/bin/env python3

#Includes functions based on code from the Calibre Kobo Touch Extended Driver: https://www.mobileread.com/forums/showthread.php?t=211135

import argparse
import regex
import os
import fnmatch
from copy import deepcopy
import lxml.etree as etree

# Counters used to build the ids of the generated spans, which have the form
# "kobo.<paragraph_counter>.<segment_counter>". They are module-level state shared
# (via `global`) by the functions below and are reset to 1 before each XHTML file is processed.
paragraph_counter = 1
segment_counter = 1
# Prefix-to-namespace map passed as `namespaces=` to the xpath() query that locates <body>.
XHTML_NAMESPACES = {"xhtml": "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops", "z3998": "http://www.daisy.org/z3998/2012/vocab/structure/", "se": "https://standardebooks.org/vocab/1.0"}

def append_kobo_spans_from_text(node, text):
	"""
	Split `text` into sentence-like segments and append each one to `node`, wrapped in its own span.

	Every span is created in the XHTML namespace with the class `koboSpan` and an id of the
	form `kobo.<paragraph_counter>.<segment_counter>`. The module-level `segment_counter` is
	incremented once for each span appended; `paragraph_counter` is only read here.

	node: the lxml element that receives the new spans as children
	text: the text to split, or None

	Returns False if `text` consists solely of whitespace; in that case nothing is appended and
	the caller is responsible for putting the text back. Returns True in every other case,
	including when `text` is None.
	"""
	global paragraph_counter
	global segment_counter

	if text is None:
		return True

	# If the text is only whitespace, don't add spans. The *entire* string has to be
	# whitespace: a MULTILINE `^\s+$` search also succeeds on text that merely starts with
	# whitespace followed by a newline (e.g. "\n\nHello."), which would leave real text unwrapped.
	if regex.fullmatch(r"\s+", text):
		return False

	# Split the text into sentences. A segment ends at . ! ? or :, optionally followed by a
	# closing quote, and keeps any whitespace that follows it. Because the pattern is a
	# capturing group, split() returns the matched sentences themselves, plus whatever text
	# lies between or after them.
	groups = regex.split(r'(.*?[\.\!\?\:][\'"\u201d\u2019]?\s*)', text, flags=regex.MULTILINE)

	# Add each sentence in its own span, skipping the empty strings produced by split().
	# Trailing whitespace stays at the end of the span it follows.
	for group in groups:
		if group == "":
			continue

		span = etree.Element("{%s}span" % ("http://www.w3.org/1999/xhtml", ), attrib={"id": "kobo.{0}.{1}".format(paragraph_counter, segment_counter), "class": "koboSpan"})
		span.text = group
		node.append(span)
		segment_counter += 1

	return True

def add_kobo_spans_to_node(node):
	"""
	Recursively rebuild `node` so that its text, and the tail text of each of its children, is wrapped in Kobo spans.

	The element is emptied and rebuilt in place: its attributes are restored, its text is
	converted to spans by append_kobo_spans_from_text(), and deep copies of its children are
	processed recursively and re-appended, each followed by spans for its tail text. The
	module-level `paragraph_counter` and `segment_counter` are advanced along the way so that
	span ids do not repeat within a run.

	node: an lxml element, comment or processing instruction, or None

	Returns `node` itself for ordinary elements; a new span wrapping `node` when it is an
	<img>; the comment/processing instruction with its tail removed; or None when `node` is None.
	"""
	global paragraph_counter
	global segment_counter

	# Nothing to process. Without this guard a None node would reach `node.tail = None`
	# below and raise AttributeError.
	if node is None:
		return None

	# Comments and processing instructions are passed through untouched apart from dropping
	# the tail. When called recursively, the tail has already been saved by the caller.
	if isinstance(node, (etree._Comment, etree._ProcessingInstruction)):
		node.tail = None
		return node

	# Special case: <img> tags (with or without a namespace) have no text to split, so the
	# whole element is wrapped in a single span instead.
	special_tag_match = regex.search(r'^(?:\{[^\}]+\})?(\w+)$', node.tag)
	if special_tag_match and special_tag_match.group(1) in ["img"]:
		span = etree.Element("{%s}span" % ("http://www.w3.org/1999/xhtml", ), attrib={"id": "kobo.{0}.{1}".format(paragraph_counter, segment_counter), "class": "koboSpan"})
		span.append(node)
		return span

	# Save the node content before wiping it. list(node) replaces the deprecated getchildren().
	nodetext = node.text
	nodechildren = deepcopy(list(node))
	nodeattrs = dict(node.attrib)

	# Reset the current node to start from scratch, then put its attributes back
	node.clear()
	for key, value in nodeattrs.items():
		node.set(key, value)

	# The node text is converted to spans; whitespace-only text is restored as-is
	if nodetext is not None and not append_kobo_spans_from_text(node, nodetext):
		node.text = nodetext

	# Re-add the node children
	for child in nodechildren:
		# Detach the tail so it can be turned into spans that follow the child
		childtail = child.tail
		child.tail = None
		node.append(add_kobo_spans_to_node(child))

		if childtail is not None:
			# The tail starts a new paragraph number of its own
			paragraph_counter += 1
			segment_counter = 1
			if not append_kobo_spans_from_text(node, childtail):
				# Didn't add spans: give the paragraph number back and restore the tail
				# on the child that was just appended
				paragraph_counter -= 1
				node[-1].tail = childtail

		paragraph_counter += 1
		segment_counter = 1

	return node

def _mark_opf_as_kobo(path):
	"""Rewrite the OPF file at `path` in place, inserting a `se:transform` = kobo <meta> element before `<dc:publisher`."""
	with open(path, "r+", encoding="utf-8") as file:
		xhtml = file.read()

		xhtml = xhtml.replace("<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher")

		# Overwrite in place; truncate() drops any leftover bytes if the new content is shorter
		file.seek(0)
		file.write(xhtml)
		file.truncate()

def _add_kobo_spans_to_file(path):
	"""Rewrite the XHTML file at `path` in place, wrapping the contents of its <body> in Kobo spans."""
	global paragraph_counter
	global segment_counter

	# Span ids start over at kobo.1.1 in every file
	paragraph_counter = 1
	segment_counter = 1

	with open(path, "r+", encoding="utf-8") as file:
		xhtml = file.read()

		#Kobos don't have fonts that support the ↩ character in endnotes, so replace it with «
		if os.path.basename(path) == "endnotes.xhtml":
			xhtml = xhtml.replace("epub:type=\"se:referrer\">↩</a>", "epub:type=\"se:referrer\">«</a>")

		#We have to remove the default namespace declaration from our document, otherwise
		#xpath won't find anything at all.  See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
		#The text is handed to lxml as bytes rather than as str.
		tree = etree.fromstring(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "").encode())

		add_kobo_spans_to_node(tree.xpath("./body", namespaces=XHTML_NAMESPACES)[0])

		xhtml = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False)

		#The spans were created in the XHTML namespace, so they are serialized with an html: prefix
		#and an xmlns:html declaration. Strip both, since the default namespace is restored on the root below.
		#These are fixed strings, so plain replace() is used rather than regular expressions.
		xhtml = xhtml.replace("<html:span", "<span")
		xhtml = xhtml.replace("html:span>", "span>")
		xhtml = xhtml.replace("<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span")

		#Restore the XML declaration and the default namespace on the root element only
		xhtml = xhtml.replace("<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"", 1)

		file.seek(0)
		file.write(xhtml)
		file.truncate()

def main():
	"""
	Command-line entry point: make every Standard Ebooks source directory given as an argument Kobo-compatible.

	Each directory is walked recursively. Files named content.opf get a `se:transform`
	metadata marker; every *.xhtml file except toc.xhtml gets Kobo spans added to its body.
	All files are modified in place.
	"""
	parser = argparse.ArgumentParser(description="Convert files in a Standard Ebooks source directory to be Kobo-compatible.")
	parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
	args = parser.parse_args()

	for directory in args.directories:
		directory = os.path.abspath(directory)

		for root, _, filenames in os.walk(directory):
			for filename in fnmatch.filter(filenames, "content.opf"):
				_mark_opf_as_kobo(os.path.join(root, filename))

			for filename in fnmatch.filter(filenames, "*.xhtml"):
				#Don't add spans to the ToC
				if filename == "toc.xhtml":
					continue

				_add_kobo_spans_to_file(os.path.join(root, filename))

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
	main()