From 80a7dddefa8a1a1ac5ac8c73be0af4b6e1b4700b Mon Sep 17 00:00:00 2001 From: Harold Martin Date: Sat, 15 Jun 2024 16:28:02 -0700 Subject: [PATCH] HTML converter --- podcast_transcript_tools/convert.py | 6 +-- podcast_transcript_tools/html_to_json.py | 61 ++++++++++++++---------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/podcast_transcript_tools/convert.py b/podcast_transcript_tools/convert.py index 821b755..4327234 100644 --- a/podcast_transcript_tools/convert.py +++ b/podcast_transcript_tools/convert.py @@ -8,13 +8,13 @@ def list_files(directory: str) -> list[str]: for dirpath, dirnames, filenames in os.walk(directory): for filename in filenames: file_paths.append( - os.path.join(dirpath, filename) + os.path.join(dirpath, filename), ) # Append the file name to the full path return file_paths def read_first_line(file_path: str) -> str: - with open(file_path, "r") as file: + with open(file_path) as file: return file.readline() @@ -49,7 +49,7 @@ def extract_file_types_from_name( def main(transcript_directory): file_paths = list_files(transcript_directory) vtt_files, srt_files, html_files, unknown_files = extract_file_types_from_name( - file_paths + file_paths, ) # Enumerate first_lines and indentify any files matching patterns first_lines = read_files_in_parallel(unknown_files) diff --git a/podcast_transcript_tools/html_to_json.py b/podcast_transcript_tools/html_to_json.py index e31d79e..21bcf77 100644 --- a/podcast_transcript_tools/html_to_json.py +++ b/podcast_transcript_tools/html_to_json.py @@ -1,33 +1,44 @@ -import json +from functools import reduce +from json import dumps from pathlib import Path from bs4 import BeautifulSoup +from loguru import logger def _ts_to_secs(time_string: str) -> float: - hours, minutes, seconds, milliseconds = map( - int, - time_string.replace(",", ":").split(":"), - ) - return (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000) + parts = enumerate(map(int, reversed(time_string.split(":")))) + secs = next(parts)[1] + return reduce(lambda acc, part: acc + ((60 ** part[0]) * part[1]), parts, secs) -# See spec at: -# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md +# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md#html def _html_to_list(soup: BeautifulSoup) -> list[dict]: - blocks = [] - # TODO: detect cite/time for conformance to spec. - for child in soup.children: - if child.name == "p": - blocks.append( - { - "startTime": 0.0, - "endTime": 0.0, - "body": child.text, - } - ) + blocks = [{}] + for child in soup.body.children or soup.children: + if child.name == "cite": + if "speaker" not in blocks[-1]: + blocks[-1]["speaker"] = child.text.replace(":", "").strip() + else: + blocks.append( + { + "speaker": child.text.replace(":", "").strip(), + }, + ) + elif child.name == "time": + if "startTime" not in blocks[-1]: + blocks[-1]["startTime"] = _ts_to_secs(child.text.strip()) + else: + blocks.append( + { + "startTime": _ts_to_secs(child.text.strip()), + }, + ) + elif child.name == "p": + blocks[-1]["body"] = child.text.strip() + else: + logger.warning(f"Unknown tag: {child.name}") return blocks - return None def html_to_podcast_dict(html_string: str) -> dict: @@ -40,11 +51,13 @@ def html_to_podcast_dict(html_string: str) -> dict: def html_file_to_json_file(html_file: str, json_file: str) -> None: + html_string = Path(html_file).read_text() + if "" not in html_string and "