@@ -236,6 +236,31 @@ def read_file_from_archive(self, zf, filepath):
236236 f"File { zf .filename } is not a valid { self .FILE_TYPE } file, { filepath } is missing."
237237 )
238238
def _validate_index_html_body(self, zf, path):
    """Validate that the archive's index.html parses and has a non-empty body.

    The file is fetched with ``self.read_file_from_archive(zf, "index.html")``
    (presumably that helper raises when the entry is missing -- confirm
    against its definition).

    Args:
        zf: The opened archive object, passed through to
            ``read_file_from_archive``.
        path: Path of the archive; used only to build error messages.

    Returns:
        The DOM produced by ``html5lib.parse`` for index.html.

    Raises:
        InvalidFileException: If index.html is not well-formed, has no
            ``body`` element, or its body contains neither an element nor
            any non-whitespace text.
    """
    index_html = self.read_file_from_archive(zf, "index.html")

    # Keep the try body down to the one call that can raise ParseError, and
    # chain the cause so the underlying parser error is not lost.
    try:
        dom = html5lib.parse(index_html, namespaceHTMLElements=False)
    except ParseError as err:
        raise InvalidFileException(
            f"File {path} is not a valid {self.FILE_TYPE} file, index.html is not well-formed."
        ) from err

    body = dom.find("body")
    if body is None:
        raise InvalidFileException(
            f"File {path} is not a valid {self.FILE_TYPE} file, index.html is missing a body element."
        )

    # Comment nodes do not carry a string ``tag``, so the isinstance check
    # filters them out; ``body.iter()`` also yields the body itself, which
    # must not count as its own content.
    has_element = any(
        isinstance(node.tag, str) and node.tag != "body" for node in body.iter()
    )
    if not has_element:
        # With no real elements, the body's direct children can only be
        # comment-like nodes. Text that follows such a node is stored in the
        # node's ``tail`` rather than in ``body.text``, so both places must
        # be inspected before declaring the body empty.
        text_chunks = [body.text]
        text_chunks.extend(child.tail for child in body)
        if not any(chunk and chunk.strip() for chunk in text_chunks):
            raise InvalidFileException(
                f"File {path} is not a valid {self.FILE_TYPE} file, index.html is empty."
            )

    return dom
263+
239264 def _read_and_compress_archive_file (
240265 self , filepath , reader , audio_settings = None , video_settings = None , ext = None
241266 ):
@@ -291,28 +316,7 @@ class HTML5ConversionHandler(ArchiveProcessingBaseHandler):
291316
def validate_archive(self, path: str):
    """Open the archive at ``path`` and check its index.html.

    The archive is opened through ``self.open_and_verify_archive`` and the
    index.html checks are delegated to ``self._validate_index_html_body``.
    """
    with self.open_and_verify_archive(path) as archive:
        self._validate_index_html_body(archive, path)
316320
317321
318322class H5PConversionHandler (ArchiveProcessingBaseHandler ):
0 commit comments