|
3 | 3 | import os
|
4 | 4 | import subprocess
|
5 | 5 | from pathlib import Path
|
6 |
| - |
7 | 6 | import numpy as np
|
8 | 7 | import soundfile as sf
|
9 | 8 | import torch
|
10 | 9 | import torch.nn.functional as F
|
| 10 | +from tqdm import tqdm |
11 | 11 | from fairseq import utils
|
12 | 12 | from fairseq.data import Dictionary
|
13 | 13 | from fairseq.models import BaseFairseqModel
|
14 | 14 | from fairseq.models.wav2vec.wav2vec2_asr import Wav2VecEncoder, Wav2Vec2CtcConfig
|
15 |
| -from pydub import AudioSegment |
| 15 | +from pydub import AudioSegment, effects |
16 | 16 | import GPUtil
|
17 | 17 |
|
18 | 18 | import src.media_convertor
|
19 | 19 | from src import utilities, log_setup
|
20 | 20 | from src.lib.audio_normalization import AudioNormalization
|
21 | 21 | from src.monitoring import monitor
|
| 22 | +from src.srt.timestamp_generator import extract_time_stamps |
22 | 23 | from src.utilities import get_env_var
|
23 | 24 |
|
24 | 25 | try:
|
@@ -375,14 +376,21 @@ def get_results(wav_path, dict_path, generator, use_cuda=False, w2v_path=None, m
|
375 | 376 | net_input = dict()
|
376 | 377 | dir_name = src.media_convertor.media_conversion(wav_path, duration_limit=15)
|
377 | 378 | audio_file = dir_name / 'clipped_audio.wav'
|
378 |
| - normalized_audio = AudioNormalization(audio_file).loudness_normalization_effects() |
379 |
| - LOGGER.debug('Audio normalization done') |
| 379 | + |
| 380 | + start_time, end_time = extract_time_stamps(audio_file) |
| 381 | + original_file_path = wav_path.replace('clipped_audio_enhanced', 'clipped_audio') |
| 382 | + original_chunk = AudioSegment.from_wav(original_file_path) |
380 | 383 | silence = AudioSegment.silent(duration=500)
|
381 |
| - LOGGER.debug('Appending silence') |
382 |
| - sound = silence + normalized_audio + silence |
383 |
| - sound.export('test_sil.wav', format='wav') |
384 |
| - LOGGER.debug(f"The sound object is : {sound}") |
385 |
| - wav = np.array(sound.get_array_of_samples()).astype('float64') |
| 384 | + chunked_audio = AudioSegment.silent(duration=500) |
| 385 | + for i in tqdm(range(len(start_time))): |
| 386 | + chunked_audio = chunked_audio + original_chunk[start_time[i] * 1000: end_time[i] * 1000] + silence |
| 387 | + |
| 388 | + normalized_audio = effects.normalize(chunked_audio) |
| 389 | + LOGGER.debug('Audio normalization done') |
| 390 | + |
| 391 | + normalized_audio.export('test_sil.wav', format='wav') |
| 392 | + LOGGER.debug(f"The sound object is : {normalized_audio}") |
| 393 | + wav = np.array(normalized_audio.get_array_of_samples()).astype('float64') |
386 | 394 | LOGGER.debug(f"The shape of the audio is {wav.shape}")
|
387 | 395 | # wav = np.array(normalized_audio.get_array_of_samples()).astype('float64')
|
388 | 396 |
|
|
0 commit comments