|
| 1 | +import av |
| 2 | +# import torchaudio |
| 3 | +import numpy as np |
| 4 | +from fractions import Fraction |
| 5 | + |
| 6 | + |
| 7 | +# def load_audio_torchaudio(fn): |
| 8 | +# data, sr = torchaudio.load(fn) |
| 9 | +# return data, sr |
| 10 | + |
| 11 | + |
def open_audio_av(path):
    """Open a media container with multithreaded decoding disabled.

    Forcing single-threaded decode on every stream keeps frame delivery
    deterministic and avoids spawning decoder threads per stream.

    Args:
        path: Path or URL accepted by ``av.open``.

    Returns:
        The opened ``av`` input container.
    """
    container = av.open(path)
    all_streams = list(container.streams.video) + list(container.streams.audio)
    for av_stream in all_streams:
        ctx = av_stream.codec_context
        ctx.thread_type = av.codec.context.ThreadType.NONE
        ctx.thread_count = 1
    return container
| 21 | + |
| 22 | + |
def load_audio_av(path=None, container=None, rate=None, start_time=None, duration=None, layout="mono"):
    """Decode an audio clip and return it as a normalized numpy array.

    Args:
        path: Media path/URL; used only when ``container`` is None.
        container: Already-open ``av`` container (takes precedence over ``path``).
        rate: Target sample rate; defaults to the stream's native rate.
        start_time: Clip start in seconds; defaults to the stream start.
        duration: Clip length in seconds; defaults to (and is clamped to)
            the remainder of the stream.
        layout: Channel layout passed to the resampler (default "mono").

    Returns:
        (audio, rate) where ``audio`` is a float array of shape
        (channels, samples) with values in [-1, 1].

    Raises:
        ValueError: if no audio frames overlap the requested window.
    """
    if container is None:
        container = av.open(path)
    audio_stream = container.streams.audio[0]

    # Stream metadata; times are exact Fractions in seconds.
    _ss = audio_stream.start_time * audio_stream.time_base if audio_stream.start_time is not None else 0.
    _dur = audio_stream.duration * audio_stream.time_base
    _ff = _ss + _dur
    _rate = audio_stream.rate

    # Fill defaults from the stream and clamp the clip to the available range.
    if rate is None:
        rate = _rate
    if start_time is None:
        start_time = _ss
    if duration is None:
        duration = _ff - start_time
    duration = min(duration, _ff - start_time)
    end_time = start_time + duration

    # s16p = signed 16-bit planar, so to_ndarray() yields (channels, samples) int16.
    resampler = av.audio.resampler.AudioResampler(format="s16p", layout=layout, rate=rate)

    # Decode every frame overlapping [start_time, end_time].
    chunks = []
    container.seek(int(start_time * av.time_base))  # seek() expects av.time_base (microsecond) units
    for frame in container.decode(audio=0):
        chunk_start_time = frame.pts * frame.time_base
        chunk_end_time = chunk_start_time + Fraction(frame.samples, frame.rate)
        if chunk_end_time < start_time:  # Frame ends before the clip; keep decoding.
            continue
        if chunk_start_time > end_time:  # Frame starts after the clip; done.
            break

        try:
            frame.pts = None  # NOTE(review): presumably required by AudioResampler on this PyAV version — confirm
            # FIX: the original guarded `if resampler is not None`, but the
            # resampler is unconditionally constructed above, so the else
            # branch (raw frame.to_ndarray()) was unreachable dead code.
            chunks.append((chunk_start_time, resampler.resample(frame)[0].to_ndarray()))
        except AttributeError:
            # NOTE(review): looks like a guard for a PyAV version quirk
            # (resample() returning None at flush) — confirm before removing.
            break

    if not chunks:
        # FIX: previously fell through to np.concatenate([], 1) / chunks[0][0],
        # raising an opaque ValueError/IndexError instead of a clear message.
        raise ValueError(
            f"No audio decoded in window [{float(start_time):.3f}s, {float(end_time):.3f}s]")

    # Trim to sample accuracy: the first decoded frame may start before start_time.
    audio = np.concatenate([af[1] for af in chunks], 1)
    ss = int((start_time - chunks[0][0]) * rate)
    t = int(duration * rate)
    if ss < 0:
        # Requested start precedes the first frame: left-pad with silence.
        audio = np.pad(audio, ((0, 0), (-ss, 0)), 'constant', constant_values=0)
        ss = 0
    audio = audio[:, ss: ss+t]

    # Normalize int16 PCM to floats in [-1, 1].
    audio = audio / np.iinfo(audio.dtype).max

    return audio, rate
| 78 | + |
| 79 | + |
def audio_info_av(inpt, audio=0, format=None):
    """Return metadata for one audio stream of a media file or container.

    Args:
        inpt: An already-open ``av`` container, or a path/URL string to open.
        audio: Index of the audio stream to inspect. FIX: the original
            default was ``None``, which was then used as a stream index
            (``streams.audio[None]``) and raised TypeError — the function
            was unusable with its own default. Default is now 0.
        format: Optional container format hint forwarded to ``av.open``
            (only used when ``inpt`` is a path).

    Returns:
        dict with keys 'channels', 'fps', 'start_time', 'duration',
        'chunks', 'chunk_size'; or ``None`` if the file cannot be opened.
    """
    container = inpt
    if isinstance(inpt, str):
        try:
            container = av.open(inpt, format=format)
        except av.AVError:
            # FIX: previously returned (None, None), whose shape is
            # incompatible with the dict returned on success; callers can
            # now simply test `is None`.
            # NOTE(review): av.AVError is deprecated/removed in newer PyAV
            # (FFmpegError) — confirm against the pinned version.
            return None

    audio_stream = container.streams.audio[audio]
    time_base = audio_stream.time_base
    return {
        'channels': audio_stream.channels,
        'fps': audio_stream.rate,
        'start_time': audio_stream.start_time * time_base,
        'duration': audio_stream.duration * time_base,
        'chunks': audio_stream.frames,
        'chunk_size': audio_stream.frame_size,
    }
0 commit comments