Skip to content

Commit 12cdaa8

Browse files
authored
Move sample rate and sample format conversion utils into FFMPEGCommon.cpp (#629)
1 parent fb3448b commit 12cdaa8

File tree

4 files changed

+91
-115
lines changed

4 files changed

+91
-115
lines changed

src/torchcodec/_core/FFMPEGCommon.cpp

+71-2
Original file line numberDiff line numberDiff line change
@@ -116,16 +116,17 @@ void setChannelLayout(
116116
#endif
117117
}
118118

119-
SwrContext* allocateSwrContext(
119+
SwrContext* createSwrContext(
120120
UniqueAVCodecContext& avCodecContext,
121121
AVSampleFormat sourceSampleFormat,
122122
AVSampleFormat desiredSampleFormat,
123123
int sourceSampleRate,
124124
int desiredSampleRate) {
125125
SwrContext* swrContext = nullptr;
126+
int status = AVSUCCESS;
126127
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
127128
AVChannelLayout layout = avCodecContext->ch_layout;
128-
auto status = swr_alloc_set_opts2(
129+
status = swr_alloc_set_opts2(
129130
&swrContext,
130131
&layout,
131132
desiredSampleFormat,
@@ -155,9 +156,77 @@ SwrContext* allocateSwrContext(
155156
#endif
156157

157158
TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
159+
status = swr_init(swrContext);
160+
TORCH_CHECK(
161+
status == AVSUCCESS,
162+
"Couldn't initialize SwrContext: ",
163+
getFFMPEGErrorStringFromErrorCode(status),
164+
". If the error says 'Invalid argument', it's likely that you are using "
165+
"a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
166+
"valid scenarios. Try to upgrade FFmpeg?");
158167
return swrContext;
159168
}
160169

170+
// Converts `srcAVFrame` to `desiredSampleFormat` / `desiredSampleRate` using
// the given `swrContext`, and returns the converted samples in a newly
// allocated frame.
//
// - swrContext: resampler to use. Must be non-null. Its configuration is
//   assumed to match the formats and rates passed here; this function does not
//   create or (re)configure it.
// - srcAVFrame: input audio frame. Must be non-null; it is only read.
// - desiredSampleFormat, desiredSampleRate: written to the returned frame's
//   `format` and `sample_rate` fields.
// - sourceSampleRate: sample rate of `srcAVFrame`. The output size is only
//   rescaled when it differs from `desiredSampleRate`.
//
// The returned frame's `nb_samples` is the number of samples that
// `swr_convert()` actually produced, which can be 0. All failures are reported
// through TORCH_CHECK.
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
    const UniqueSwrContext& swrContext,
    const UniqueAVFrame& srcAVFrame,
    AVSampleFormat desiredSampleFormat,
    int sourceSampleRate,
    int desiredSampleRate) {
  // The caller owns the SwrContext, so a missing context is a caller error.
  // Fail with a clear message rather than dereferencing null inside
  // libswresample.
  TORCH_CHECK(
      swrContext.get() != nullptr,
      "Cannot convert audio frame: swrContext is null.");
  TORCH_CHECK(
      srcAVFrame.get() != nullptr,
      "Cannot convert audio frame: srcAVFrame is null.");

  UniqueAVFrame convertedAVFrame(av_frame_alloc());
  TORCH_CHECK(
      convertedAVFrame,
      "Could not allocate frame for sample format conversion.");

  setChannelLayout(convertedAVFrame, srcAVFrame);
  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
  convertedAVFrame->sample_rate = desiredSampleRate;
  if (sourceSampleRate != desiredSampleRate) {
    // av_rescale_rnd() divides by sourceSampleRate; reject invalid rates here
    // instead of letting them surface as an opaque buffer-allocation failure.
    TORCH_CHECK(
        sourceSampleRate > 0 && desiredSampleRate > 0,
        "Sample rates must be positive to resample, got source=",
        sourceSampleRate,
        " and desired=",
        desiredSampleRate);

    // Note that this is an upper bound on the number of output samples.
    // `swr_convert()` will likely not fill convertedAVFrame with that many
    // samples if sample rate conversion is needed. It will buffer the last few
    // ones because those require future samples. That's also why we reset
    // nb_samples after the call to `swr_convert()`.
    // We could also use `swr_get_out_samples()` to determine the number of
    // output samples, but empirically `av_rescale_rnd()` seems to provide a
    // tighter bound.
    const int64_t maxNumOutputSamples = av_rescale_rnd(
        swr_get_delay(swrContext.get(), sourceSampleRate) +
            srcAVFrame->nb_samples,
        desiredSampleRate,
        sourceSampleRate,
        AV_ROUND_UP);
    // nb_samples is an int: make sure the 64-bit bound survives the narrowing.
    TORCH_CHECK(
        maxNumOutputSamples >= 0 &&
            maxNumOutputSamples == static_cast<int>(maxNumOutputSamples),
        "Invalid number of output samples for sample rate conversion: ",
        maxNumOutputSamples);
    convertedAVFrame->nb_samples = static_cast<int>(maxNumOutputSamples);
  } else {
    convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
  }

  const int status = av_frame_get_buffer(convertedAVFrame.get(), 0);
  TORCH_CHECK(
      status == AVSUCCESS,
      "Could not allocate frame buffers for sample format conversion: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // swr_convert() takes its input planes as `const uint8_t**`, while
  // AVFrame::data is `uint8_t*[]`; the const_cast only adds constness.
  const int numConvertedSamples = swr_convert(
      swrContext.get(),
      convertedAVFrame->data,
      convertedAVFrame->nb_samples,
      const_cast<const uint8_t**>(srcAVFrame->data),
      srcAVFrame->nb_samples);
  // numConvertedSamples can be 0 if we're downsampling by a great factor and
  // the first frame doesn't contain a lot of samples. It should be handled
  // properly by the caller.
  TORCH_CHECK(
      numConvertedSamples >= 0,
      "Error in swr_convert: ",
      getFFMPEGErrorStringFromErrorCode(numConvertedSamples));

  // See comment above about nb_samples
  convertedAVFrame->nb_samples = numConvertedSamples;

  return convertedAVFrame;
}
229+
161230
void setFFmpegLogLevel() {
162231
auto logLevel = AV_LOG_QUIET;
163232
const char* logLevelEnvPtr = std::getenv("TORCHCODEC_FFMPEG_LOG_LEVEL");

src/torchcodec/_core/FFMPEGCommon.h

+8-1
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,20 @@ void setChannelLayout(
158158
void setChannelLayout(
159159
UniqueAVFrame& dstAVFrame,
160160
const UniqueAVFrame& srcAVFrame);
161-
SwrContext* allocateSwrContext(
161+
SwrContext* createSwrContext(
162162
UniqueAVCodecContext& avCodecContext,
163163
AVSampleFormat sourceSampleFormat,
164164
AVSampleFormat desiredSampleFormat,
165165
int sourceSampleRate,
166166
int desiredSampleRate);
167167

168+
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
169+
const UniqueSwrContext& swrContext,
170+
const UniqueAVFrame& srcAVFrame,
171+
AVSampleFormat desiredSampleFormat,
172+
int sourceSampleRate,
173+
int desiredSampleRate);
174+
168175
// Returns true if sws_scale can handle unaligned data.
169176
bool canSwsScaleHandleUnalignedData();
170177

src/torchcodec/_core/SingleStreamDecoder.cpp

+12-98
Original file line numberDiff line numberDiff line change
@@ -1345,20 +1345,29 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13451345
static_cast<AVSampleFormat>(srcAVFrame->format);
13461346
AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
13471347

1348+
StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
13481349
int sourceSampleRate = srcAVFrame->sample_rate;
13491350
int desiredSampleRate =
1350-
streamInfos_[activeStreamIndex_].audioStreamOptions.sampleRate.value_or(
1351-
sourceSampleRate);
1351+
streamInfo.audioStreamOptions.sampleRate.value_or(sourceSampleRate);
13521352

13531353
bool mustConvert =
13541354
(sourceSampleFormat != desiredSampleFormat ||
13551355
sourceSampleRate != desiredSampleRate);
13561356

13571357
UniqueAVFrame convertedAVFrame;
13581358
if (mustConvert) {
1359+
if (!streamInfo.swrContext) {
1360+
streamInfo.swrContext.reset(createSwrContext(
1361+
streamInfo.codecContext,
1362+
sourceSampleFormat,
1363+
desiredSampleFormat,
1364+
sourceSampleRate,
1365+
desiredSampleRate));
1366+
}
1367+
13591368
convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
1369+
streamInfo.swrContext,
13601370
srcAVFrame,
1361-
sourceSampleFormat,
13621371
desiredSampleFormat,
13631372
sourceSampleRate,
13641373
desiredSampleRate);
@@ -1393,77 +1402,6 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13931402
}
13941403
}
13951404

1396-
UniqueAVFrame SingleStreamDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
1397-
const UniqueAVFrame& srcAVFrame,
1398-
AVSampleFormat sourceSampleFormat,
1399-
AVSampleFormat desiredSampleFormat,
1400-
int sourceSampleRate,
1401-
int desiredSampleRate) {
1402-
auto& streamInfo = streamInfos_[activeStreamIndex_];
1403-
1404-
if (!streamInfo.swrContext) {
1405-
createSwrContext(
1406-
streamInfo,
1407-
sourceSampleFormat,
1408-
desiredSampleFormat,
1409-
sourceSampleRate,
1410-
desiredSampleRate);
1411-
}
1412-
1413-
UniqueAVFrame convertedAVFrame(av_frame_alloc());
1414-
TORCH_CHECK(
1415-
convertedAVFrame,
1416-
"Could not allocate frame for sample format conversion.");
1417-
1418-
setChannelLayout(convertedAVFrame, srcAVFrame);
1419-
convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
1420-
convertedAVFrame->sample_rate = desiredSampleRate;
1421-
if (sourceSampleRate != desiredSampleRate) {
1422-
// Note that this is an upper bound on the number of output samples.
1423-
// `swr_convert()` will likely not fill convertedAVFrame with that many
1424-
// samples if sample rate conversion is needed. It will buffer the last few
1425-
// ones because those require future samples. That's also why we reset
1426-
// nb_samples after the call to `swr_convert()`.
1427-
// We could also use `swr_get_out_samples()` to determine the number of
1428-
// output samples, but empirically `av_rescale_rnd()` seems to provide a
1429-
// tighter bound.
1430-
convertedAVFrame->nb_samples = av_rescale_rnd(
1431-
swr_get_delay(streamInfo.swrContext.get(), sourceSampleRate) +
1432-
srcAVFrame->nb_samples,
1433-
desiredSampleRate,
1434-
sourceSampleRate,
1435-
AV_ROUND_UP);
1436-
} else {
1437-
convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
1438-
}
1439-
1440-
auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
1441-
TORCH_CHECK(
1442-
status == AVSUCCESS,
1443-
"Could not allocate frame buffers for sample format conversion: ",
1444-
getFFMPEGErrorStringFromErrorCode(status));
1445-
1446-
auto numConvertedSamples = swr_convert(
1447-
streamInfo.swrContext.get(),
1448-
convertedAVFrame->data,
1449-
convertedAVFrame->nb_samples,
1450-
static_cast<const uint8_t**>(
1451-
const_cast<const uint8_t**>(srcAVFrame->data)),
1452-
srcAVFrame->nb_samples);
1453-
// numConvertedSamples can be 0 if we're downsampling by a great factor and
1454-
// the first frame doesn't contain a lot of samples. It should be handled
1455-
// properly by the caller.
1456-
TORCH_CHECK(
1457-
numConvertedSamples >= 0,
1458-
"Error in swr_convert: ",
1459-
getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
1460-
1461-
// See comment above about nb_samples
1462-
convertedAVFrame->nb_samples = numConvertedSamples;
1463-
1464-
return convertedAVFrame;
1465-
}
1466-
14671405
std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
14681406
// When sample rate conversion is involved, swresample buffers some of the
14691407
// samples in-between calls to swr_convert (see the libswresample docs).
@@ -1735,30 +1673,6 @@ void SingleStreamDecoder::createSwsContext(
17351673
streamInfo.swsContext.reset(swsContext);
17361674
}
17371675

1738-
void SingleStreamDecoder::createSwrContext(
1739-
StreamInfo& streamInfo,
1740-
AVSampleFormat sourceSampleFormat,
1741-
AVSampleFormat desiredSampleFormat,
1742-
int sourceSampleRate,
1743-
int desiredSampleRate) {
1744-
auto swrContext = allocateSwrContext(
1745-
streamInfo.codecContext,
1746-
sourceSampleFormat,
1747-
desiredSampleFormat,
1748-
sourceSampleRate,
1749-
desiredSampleRate);
1750-
1751-
auto status = swr_init(swrContext);
1752-
TORCH_CHECK(
1753-
status == AVSUCCESS,
1754-
"Couldn't initialize SwrContext: ",
1755-
getFFMPEGErrorStringFromErrorCode(status),
1756-
". If the error says 'Invalid argument', it's likely that you are using "
1757-
"a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
1758-
"valid scenarios. Try to upgrade FFmpeg?");
1759-
streamInfo.swrContext.reset(swrContext);
1760-
}
1761-
17621676
// --------------------------------------------------------------------------
17631677
// PTS <-> INDEX CONVERSIONS
17641678
// --------------------------------------------------------------------------

src/torchcodec/_core/SingleStreamDecoder.h

-14
Original file line numberDiff line numberDiff line change
@@ -287,13 +287,6 @@ class SingleStreamDecoder {
287287
const UniqueAVFrame& avFrame,
288288
torch::Tensor& outputTensor);
289289

290-
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
291-
const UniqueAVFrame& srcAVFrame,
292-
AVSampleFormat sourceSampleFormat,
293-
AVSampleFormat desiredSampleFormat,
294-
int sourceSampleRate,
295-
int desiredSampleRate);
296-
297290
std::optional<torch::Tensor> maybeFlushSwrBuffers();
298291

299292
// --------------------------------------------------------------------------
@@ -310,13 +303,6 @@ class SingleStreamDecoder {
310303
const DecodedFrameContext& frameContext,
311304
const enum AVColorSpace colorspace);
312305

313-
void createSwrContext(
314-
StreamInfo& streamInfo,
315-
AVSampleFormat sourceSampleFormat,
316-
AVSampleFormat desiredSampleFormat,
317-
int sourceSampleRate,
318-
int desiredSampleRate);
319-
320306
// --------------------------------------------------------------------------
321307
// PTS <-> INDEX CONVERSIONS
322308
// --------------------------------------------------------------------------

0 commit comments

Comments
 (0)