-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
142 lines (107 loc) · 4.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Make Microsoft Teams interview transcripts human-readable.
"""
from argparse import ArgumentParser
from collections.abc import Iterable
from pathlib import Path
import re
import pandas as pd
class BadInterviewerName(Exception):
    """Raised when the given interviewer name is not a speaker in the transcript."""
# Command-line interface definition, parsed by `cli()`.
parser = ArgumentParser(
    prog="format_transcript",
    description="Turns a `.vtt` audio transcript from Microsoft Teams/Stream into a human-readable plain text file.",
)
parser.add_argument(
    "files",
    type=Path,
    nargs="+",  # at least one file must be provided
    help="one or more `.vtt` files downloaded from Microsoft Teams/Stream",
)
parser.add_argument(
    "-o",
    "--output",
    type=Path,
    help="directory in which to save the formatted `.txt` files",
    default=".",  # argparse passes string defaults through `type`, giving Path(".")
)
parser.add_argument(
    "-i",
    "--interviewer",
    type=str,
    # `main` cannot work without an interviewer name, so fail early with a
    # usage message instead of letting `None` reach it.
    required=True,
    help="name of interviewer as it appears in the transcript",
)
def _format_transcript(transcript: str, interviewer: str) -> str:
    """Convert the text of a Teams `.vtt` transcript into annotated plain text.

    Each cue is expected to consist of an identifier line, a
    `hh:mm:ss.fff --> hh:mm:ss.fff` interval line and a `<v Speaker>...</v>`
    payload. Consecutive cues by the same speaker are merged into one block,
    stamped with the `mm:ss` start time of the first cue in that block.

    Args:
        transcript: Full contents of the `.vtt` file.
        interviewer: Name of the interviewer exactly as it appears in the
            `<v ...>` tags.

    Returns:
        One line per block, formatted as `prefix speaker | speech | mm:ss`
        and separated by blank lines. The interviewer is prefixed with `>`
        and the other speaker with `<`.

    Raises:
        ValueError: If no well-formed cue is found, or if the transcript does
            not contain exactly two speakers.
        BadInterviewerName: If `interviewer` is not one of the two speakers.
    """
    # Strip the first chunk, which should just contain `WEBVTT`, then split
    # the transcript into chunks of speech. Surrounding whitespace is removed
    # first so trailing blank lines do not produce an empty final chunk.
    chunks = transcript.replace("\r\n", "\n").strip().split("\n\n")[1:]

    # Keep only chunks with all three parts (id, interval, payload); anything
    # shorter cannot be parsed as a cue.
    cues = []
    for chunk in chunks:
        fields = chunk.strip("\n").split("\n", maxsplit=2)
        if len(fields) == 3:
            cues.append(fields)
    if not cues:
        raise ValueError("Transcript does not contain any speech cues")

    df = pd.DataFrame(cues, columns=["hash", "interval", "raw"])

    # Replace hh:mm:ss.ff timestamp with mm:ss (start of the interval only)
    df["timestamp"] = df["interval"].apply(
        lambda interval: ":".join(re.split(r"[:.]", interval.split(" ")[0])[1:3])
    )

    # Strip html tags and separate speaker/speech. `partition` always yields
    # three columns, so a payload without `>` gives empty speech, not an error.
    voiced = df["raw"].apply(lambda raw: re.sub(r"<v |</v>", "", raw))
    parts = voiced.str.partition(">")
    df["speaker"] = parts[0]

    # Replace newlines with spaces and drop rows containing no speech
    df["speech"] = parts[2].str.replace("\n", " ", regex=False)
    has_speech = df["speech"].str.strip().ne("")
    # Explicit copy so the column added below does not write into a slice.
    df = df.loc[has_speech, ["timestamp", "speaker", "speech"]].copy()

    # Check that there are 2 speakers, one of which is the interviewer
    speakers = set(df["speaker"])
    if len(speakers) != 2:
        raise ValueError(
            f"Expected exactly 2 speakers in the transcript, found {len(speakers)}: "
            f"{sorted(speakers)}"
        )
    if interviewer not in speakers:
        raise BadInterviewerName(
            f"Interviewer '{interviewer}' is not present in this transcript"
        )

    # Merge adjacent blocks with the same speaker using a Boolean flag
    # to indicate that the speaker has changed, then convert flag to
    # integer increment using cumsum trick
    df["block"] = (df["speaker"] != df["speaker"].shift()).cumsum()
    merged = df.groupby("block").agg(
        {
            "timestamp": "first",
            "speaker": "first",
            "speech": lambda pieces: " ".join(pieces),
        }
    )

    # Format in human-readable way, appropriate for annotation, replacing
    # names with 'Interviewer' and 'Student' and adding a '>' or '<' prefix
    # TODO: replace hard-coded f-string with template file
    lines = []
    for time, speaker, speech in merged.itertuples(index=False, name=None):
        if speaker == interviewer:
            role, prefix = "Interviewer", ">"
        else:
            role, prefix = "Student", "<"
        lines.append(f"{prefix} {role} | {speech} | {time}")
    return "\n\n".join(lines)
def main(files: list[Path], output_dir: Path, interviewer: str) -> None:
    """Format a given list of `.vtt` transcript files and save the results.

    Each input `<name>.vtt` is written to `<output_dir>/<name>_formatted.txt`.
    `output_dir` is created if it does not exist yet.

    Args:
        files: One or more paths to `.vtt` transcript files.
        output_dir: Directory in which to save the formatted `.txt` files.
        interviewer: Name of the interviewer as it appears in the transcripts.

    Raises:
        TypeError: If an argument has the wrong type.
        ValueError: If `files` or `interviewer` is empty.
        FileExistsError: If an output file already exists; existing files
            are never overwritten.
    """
    # Validate with real exceptions: `assert` statements are removed when
    # Python runs with `-O`, which would silently disable these checks.
    if not isinstance(files, Iterable):
        raise TypeError("`files` must be an iterable of `Path` objects")
    files = list(files)
    if not files:
        raise ValueError("at least one transcript file must be provided")
    if not all(isinstance(file, Path) for file in files):
        raise TypeError("every entry in `files` must be a `Path`")
    if not isinstance(output_dir, Path):
        raise TypeError("`output_dir` must be a `Path`")
    if not isinstance(interviewer, str):
        raise TypeError("`interviewer` must be a string")
    if not interviewer:
        raise ValueError("`interviewer` must not be empty")

    output_dir.mkdir(parents=True, exist_ok=True)

    for infile in files:
        # Read file as single string (assume it's sufficiently small).
        # WebVTT is UTF-8, so do not depend on the platform default encoding.
        raw_transcript = infile.read_text(encoding="utf-8")

        formatted_transcript = _format_transcript(raw_transcript, interviewer)

        # Build the name directly: `with_suffix` would treat a dot inside the
        # stem as the start of the suffix and truncate the file name.
        outfile = output_dir / f"{infile.stem}_formatted.txt"

        # Mode "x" creates the file exclusively and raises FileExistsError
        # rather than overwriting an earlier result.
        with outfile.open("x", encoding="utf-8") as file:
            file.write(formatted_transcript)

        print(f"{infile} -> {outfile}")
def cli():
    """Parse the command-line arguments and forward them to `main`."""
    namespace = parser.parse_args()
    main(
        files=namespace.files,
        output_dir=namespace.output,
        interviewer=namespace.interviewer,
    )
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    cli()