From 80a7dddefa8a1a1ac5ac8c73be0af4b6e1b4700b Mon Sep 17 00:00:00 2001
From: Harold Martin <harold.martin@gmail.com>
Date: Sat, 15 Jun 2024 16:28:02 -0700
Subject: [PATCH] HTML converter

---
 podcast_transcript_tools/convert.py      |  6 +--
 podcast_transcript_tools/html_to_json.py | 61 ++++++++++++++----------
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/podcast_transcript_tools/convert.py b/podcast_transcript_tools/convert.py
index 821b755..4327234 100644
--- a/podcast_transcript_tools/convert.py
+++ b/podcast_transcript_tools/convert.py
@@ -8,13 +8,13 @@ def list_files(directory: str) -> list[str]:
     for dirpath, dirnames, filenames in os.walk(directory):
         for filename in filenames:
             file_paths.append(
-                os.path.join(dirpath, filename)
+                os.path.join(dirpath, filename),
             )  # Append the file name to the full path
     return file_paths
 
 
 def read_first_line(file_path: str) -> str:
-    with open(file_path, "r") as file:
+    with open(file_path) as file:  # open() defaults to text read mode ("r")
         return file.readline()
 
 
@@ -49,7 +49,7 @@ def extract_file_types_from_name(
 def main(transcript_directory):
     file_paths = list_files(transcript_directory)
     vtt_files, srt_files, html_files, unknown_files = extract_file_types_from_name(
-        file_paths
+        file_paths,
     )
     # Enumerate first_lines and indentify any files matching patterns
     first_lines = read_files_in_parallel(unknown_files)
diff --git a/podcast_transcript_tools/html_to_json.py b/podcast_transcript_tools/html_to_json.py
index e31d79e..21bcf77 100644
--- a/podcast_transcript_tools/html_to_json.py
+++ b/podcast_transcript_tools/html_to_json.py
@@ -1,33 +1,44 @@
-import json
+from functools import reduce
+from json import dumps
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+from loguru import logger
 
 
 def _ts_to_secs(time_string: str) -> float:
-    hours, minutes, seconds, milliseconds = map(
-        int,
-        time_string.replace(",", ":").split(":"),
-    )
-    return (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000)
+    """Convert a colon-separated timestamp such as "1:02:03" into seconds."""
+    fields = map(int, time_string.split(":"))
+    return reduce(lambda total, field: total * 60 + field, fields)
 
 
-# See spec at:
-# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md
+# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md#html
 def _html_to_list(soup: BeautifulSoup) -> list[dict]:
-    blocks = []
-    # TODO: detect cite/time for conformance to spec.
-    for child in soup.children:
-        if child.name == "p":
-            blocks.append(
-                {
-                    "startTime": 0.0,
-                    "endTime": 0.0,
-                    "body": child.text,
-                }
-            )
+    """Group <cite>, <time> and <p> elements into transcript segment dicts.
+
+    <cite> sets "speaker" (colons removed), <time> sets "startTime" (seconds)
+    and <p> sets "body" on the current segment. A <cite> or <time> whose key
+    is already present on the current segment starts a new segment instead.
+    Any other child is reported with a warning and skipped.
+    """
+    blocks = [{}]
+    # soup.body is None when the markup has no <body>; fall back to the
+    # top-level children instead of failing on None.children.
+    for child in (soup.body or soup).children:
+        if child.name == "cite":
+            key, value = "speaker", child.text.replace(":", "").strip()
+        elif child.name == "time":
+            key, value = "startTime", _ts_to_secs(child.text.strip())
+        elif child.name == "p":
+            blocks[-1]["body"] = child.text.strip()
+            continue
+        else:
+            logger.warning(f"Unknown tag: {child.name}")
+            continue
+        if key in blocks[-1]:
+            blocks.append({})
+        blocks[-1][key] = value
     return blocks
-    return None
 
 
 def html_to_podcast_dict(html_string: str) -> dict:
@@ -40,11 +51,13 @@ def html_to_podcast_dict(html_string: str) -> dict:
 
 
 def html_file_to_json_file(html_file: str, json_file: str) -> None:
+    html_string = Path(html_file).read_text()  # read once; checked and converted below
+    if "<cite>" not in html_string and "<time>" not in html_string:
+        logger.error(f"No <cite> or <time> tags found in {html_file}")
+        return  # skip: json_file is not written when both literal tags are absent
     Path(json_file).write_text(
-        data=json.dumps(
-            html_to_podcast_dict(
-                html_string=Path(html_file).read_text(),
-            ),
+        data=dumps(
+            html_to_podcast_dict(html_string),
             indent=4,
         ),
     )