Skip to content

Commit

Permalink
HTML converter
Browse files Browse the repository at this point in the history
  • Loading branch information
hbmartin committed Jun 15, 2024
1 parent 54719ba commit 80a7ddd
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 27 deletions.
6 changes: 3 additions & 3 deletions podcast_transcript_tools/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ def list_files(directory: str) -> list[str]:
for dirpath, dirnames, filenames in os.walk(directory):
for filename in filenames:
file_paths.append(
os.path.join(dirpath, filename)
os.path.join(dirpath, filename),
) # Append the file name to the full path
return file_paths


def read_first_line(file_path: str) -> str:
    """Return the first line of the text file at *file_path*.

    The trailing newline, if any, is kept. An empty file yields ``""``.
    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding.
    """
    with open(file_path, encoding="utf-8") as file:
        return file.readline()


Expand Down Expand Up @@ -49,7 +49,7 @@ def extract_file_types_from_name(
def main(transcript_directory):
file_paths = list_files(transcript_directory)
vtt_files, srt_files, html_files, unknown_files = extract_file_types_from_name(
file_paths
file_paths,
)
# Enumerate first_lines and identify any files matching patterns
first_lines = read_files_in_parallel(unknown_files)
Expand Down
61 changes: 37 additions & 24 deletions podcast_transcript_tools/html_to_json.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,44 @@
import json
from functools import reduce
from json import dumps
from pathlib import Path

from bs4 import BeautifulSoup
from loguru import logger


def _ts_to_secs(time_string: str) -> float:
    """Convert a colon-separated timestamp to a number of seconds.

    Accepts ``SS``, ``MM:SS`` and ``HH:MM:SS`` (each extra leading field is
    weighted by a further factor of 60). The seconds field may carry a
    fractional part written with either ``.`` or ``,`` (e.g. ``00:00:01,500``).

    Raises:
        ValueError: if a field is empty or not numeric.
    """
    *leading, seconds = time_string.strip().split(":")
    # Only the last field may be fractional; normalise the SRT-style comma.
    total = float(seconds.replace(",", "."))
    # Walk minutes, hours, ... outward from the seconds field.
    for power, field in enumerate(reversed(leading), start=1):
        total += int(field) * 60**power
    return total


# See spec at:
# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md
# https://github.com/Podcastindex-org/podcast-namespace/blob/main/transcripts/transcripts.md#html
def _html_to_list(soup: BeautifulSoup) -> list[dict]:
    """Collect transcript segments from the top-level tags of *soup*.

    The direct children of ``<body>`` (or of the document root when there is
    no ``<body>``) are scanned in order:

    * ``<cite>`` sets ``"speaker"`` (colons removed, whitespace stripped);
    * ``<time>`` sets ``"startTime"`` via ``_ts_to_secs``;
    * ``<p>`` sets ``"body"`` (whitespace stripped).

    A ``<cite>``/``<time>`` starts a new segment when the current one already
    has that field or already has a body. Consecutive ``<p>`` tags are joined
    with a newline instead of overwriting each other. Any other tag is logged
    as a warning and ignored. Segments with no fields are never emitted, so an
    empty document yields ``[]``.
    """
    # `soup.body` is None when the markup has no <body>; `.children` is an
    # iterator (always truthy), so the fallback has to test the tag itself.
    root = soup.body if soup.body is not None else soup

    blocks: list[dict] = []
    current: dict = {}
    for child in root.children:
        tag = child.name
        if tag is None:
            # Text nodes between tags (usually whitespace) carry no tag name.
            continue
        if tag == "cite":
            if "speaker" in current or "body" in current:
                blocks.append(current)
                current = {}
            current["speaker"] = child.text.replace(":", "").strip()
        elif tag == "time":
            if "startTime" in current or "body" in current:
                blocks.append(current)
                current = {}
            current["startTime"] = _ts_to_secs(child.text.strip())
        elif tag == "p":
            text = child.text.strip()
            if "body" in current:
                # Keep every paragraph of a multi-paragraph segment.
                current["body"] = f"{current['body']}\n{text}"
            else:
                current["body"] = text
        else:
            logger.warning(f"Unknown tag: {tag}")
    if current:
        blocks.append(current)
    return blocks


def html_to_podcast_dict(html_string: str) -> dict:
Expand All @@ -40,11 +51,13 @@ def html_to_podcast_dict(html_string: str) -> dict:


def html_file_to_json_file(html_file: str, json_file: str) -> None:
    """Convert the HTML transcript at *html_file* into JSON at *json_file*.

    If the markup contains neither a ``<cite`` nor a ``<time`` tag, an error
    is logged and nothing is written. Otherwise the result of
    ``html_to_podcast_dict`` is serialised with a 4-space indent. Both files
    are handled as UTF-8.
    """
    html_string = Path(html_file).read_text(encoding="utf-8")
    # Match on the tag opening only, case-insensitively, so tags written with
    # attributes (<time datetime="...">) or in upper case are still detected.
    lowered = html_string.lower()
    if "<cite" not in lowered and "<time" not in lowered:
        logger.error(f"No <cite> or <time> tags found in {html_file}")
        return
    Path(json_file).write_text(
        dumps(
            html_to_podcast_dict(html_string),
            indent=4,
        ),
        encoding="utf-8",
    )

0 comments on commit 80a7ddd

Please sign in to comment.