Skip to content

Commit c045a89

Browse files
|Ankit, Umair, Anirudh| handle long silence in transcript
1 parent 1ee91a1 commit c045a89

File tree

2 files changed

+18
-10
lines changed

2 files changed

+18
-10
lines changed

.circleci/config.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ parameters:
77
default: speech_recognition_model_api
88
package_version:
99
type: string
10-
default: "3.2.34"
10+
default: "3.2.35"
1111
dependency_image_name:
1212
type: string
1313
default: speech-recognition-open-api-dependency

src/lib/inference_lib.py

+17-9
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,23 @@
33
import os
44
import subprocess
55
from pathlib import Path
6-
76
import numpy as np
87
import soundfile as sf
98
import torch
109
import torch.nn.functional as F
10+
from tqdm import tqdm
1111
from fairseq import utils
1212
from fairseq.data import Dictionary
1313
from fairseq.models import BaseFairseqModel
1414
from fairseq.models.wav2vec.wav2vec2_asr import Wav2VecEncoder, Wav2Vec2CtcConfig
15-
from pydub import AudioSegment
15+
from pydub import AudioSegment, effects
1616
import GPUtil
1717

1818
import src.media_convertor
1919
from src import utilities, log_setup
2020
from src.lib.audio_normalization import AudioNormalization
2121
from src.monitoring import monitor
22+
from src.srt.timestamp_generator import extract_time_stamps
2223
from src.utilities import get_env_var
2324

2425
try:
@@ -375,14 +376,21 @@ def get_results(wav_path, dict_path, generator, use_cuda=False, w2v_path=None, m
375376
net_input = dict()
376377
dir_name = src.media_convertor.media_conversion(wav_path, duration_limit=15)
377378
audio_file = dir_name / 'clipped_audio.wav'
378-
normalized_audio = AudioNormalization(audio_file).loudness_normalization_effects()
379-
LOGGER.debug('Audio normalization done')
379+
380+
start_time, end_time = extract_time_stamps(audio_file)
381+
original_file_path = wav_path.replace('clipped_audio_enhanced', 'clipped_audio')
382+
original_chunk = AudioSegment.from_wav(original_file_path)
380383
silence = AudioSegment.silent(duration=500)
381-
LOGGER.debug('Appending silence')
382-
sound = silence + normalized_audio + silence
383-
sound.export('test_sil.wav', format='wav')
384-
LOGGER.debug(f"The sound object is : {sound}")
385-
wav = np.array(sound.get_array_of_samples()).astype('float64')
384+
chunked_audio = AudioSegment.silent(duration=500)
385+
for i in tqdm(range(len(start_time))):
386+
chunked_audio = chunked_audio + original_chunk[start_time[i] * 1000: end_time[i] * 1000] + silence
387+
388+
normalized_audio = effects.normalize(chunked_audio)
389+
LOGGER.debug('Audio normalization done')
390+
391+
normalized_audio.export('test_sil.wav', format='wav')
392+
LOGGER.debug(f"The sound object is : {normalized_audio}")
393+
wav = np.array(normalized_audio.get_array_of_samples()).astype('float64')
386394
LOGGER.debug(f"The shape of the audio is {wav.shape}")
387395
# wav = np.array(normalized_audio.get_array_of_samples()).astype('float64')
388396

0 commit comments

Comments (0)