Skip to content

Commit ac47cdc

Browse files
rtibbles and claude committed
Fix body.text None crash in HTML5ConversionHandler.
html5lib sets body.text to None when the body element starts with a child element rather than text. Guard against this the same way as KPUBConversionHandler.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b1c2efe commit ac47cdc

File tree

1 file changed

+26
-22
lines changed

1 file changed

+26
-22
lines changed

ricecooker/utils/pipeline/convert.py

Lines changed: 26 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -236,6 +236,31 @@ def read_file_from_archive(self, zf, filepath):
236236
f"File {zf.filename} is not a valid {self.FILE_TYPE} file, {filepath} is missing."
237237
)
238238

239+
def _validate_index_html_body(self, zf, path):
240+
"""Validate that index.html exists and has a non-empty body."""
241+
index_html = self.read_file_from_archive(zf, "index.html")
242+
try:
243+
dom = html5lib.parse(index_html, namespaceHTMLElements=False)
244+
body = dom.find("body")
245+
if body is None:
246+
raise InvalidFileException(
247+
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is missing a body element."
248+
)
249+
# Check that the body has at least one child element
250+
# for some reason it seems like comments don't get a string tag attribute
251+
body_children = [
252+
c for c in body.iter() if isinstance(c.tag, str) and c.tag != "body"
253+
]
254+
if not (body.text and body.text.strip()) and not body_children:
255+
raise InvalidFileException(
256+
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is empty."
257+
)
258+
return dom
259+
except ParseError:
260+
raise InvalidFileException(
261+
f"File {path} is not a valid {self.FILE_TYPE} file, index.html is not well-formed."
262+
)
263+
239264
def _read_and_compress_archive_file(
240265
self, filepath, reader, audio_settings=None, video_settings=None, ext=None
241266
):
@@ -291,28 +316,7 @@ class HTML5ConversionHandler(ArchiveProcessingBaseHandler):
291316

292317
def validate_archive(self, path: str):
293318
with self.open_and_verify_archive(path) as zf:
294-
# Check index.html exists and is valid HTML
295-
index_html = self.read_file_from_archive(zf, "index.html")
296-
try:
297-
dom = html5lib.parse(index_html, namespaceHTMLElements=False)
298-
body = dom.find("body")
299-
if body is None:
300-
raise InvalidFileException(
301-
f"File {path} is not a valid HTML5 file, index.html is missing a body element."
302-
)
303-
# Check that the body has at least one child element
304-
# for some reason it seems like comments don't get a string tag attribute
305-
body_children = [
306-
c for c in body.iter() if isinstance(c.tag, str) and c.tag != "body"
307-
]
308-
if not body.text.strip() and not body_children:
309-
raise InvalidFileException(
310-
f"File {path} is not a valid HTML5 file, index.html is empty."
311-
)
312-
except ParseError:
313-
raise InvalidFileException(
314-
f"File {path} is not a valid HTML5 file, index.html is not well-formed."
315-
)
319+
self._validate_index_html_body(zf, path)
316320

317321

318322
class H5PConversionHandler(ArchiveProcessingBaseHandler):

0 commit comments

Comments
 (0)