@@ -33,6 +33,44 @@ void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
33
33
supportedRates.str ());
34
34
}
35
35
36
+ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
37
+ AV_SAMPLE_FMT_FLTP,
38
+ AV_SAMPLE_FMT_FLT,
39
+ AV_SAMPLE_FMT_DBLP,
40
+ AV_SAMPLE_FMT_DBL,
41
+ AV_SAMPLE_FMT_S64P,
42
+ AV_SAMPLE_FMT_S64,
43
+ AV_SAMPLE_FMT_S32P,
44
+ AV_SAMPLE_FMT_S32,
45
+ AV_SAMPLE_FMT_S16P,
46
+ AV_SAMPLE_FMT_S16,
47
+ AV_SAMPLE_FMT_U8P,
48
+ AV_SAMPLE_FMT_U8};
49
+
50
+ AVSampleFormat findBestOutputSampleFormat (const AVCodec& avCodec) {
51
+ // Find a sample format that the encoder supports. We prefer using FLT[P],
52
+ // since this is the format of the input waveform. If FLTP isn't supported
53
+ // then we'll need to convert the AVFrame's format. Our heuristic is to encode
54
+ // into the format with the highest resolution.
55
+ if (avCodec.sample_fmts == nullptr ) {
56
+ // Can't really validate anything in this case, best we can do is hope that
57
+ // FLTP is supported by the encoder. If not, FFmpeg will raise.
58
+ return AV_SAMPLE_FMT_FLTP;
59
+ }
60
+
61
+ for (AVSampleFormat preferredFormat : preferredFormatsOrder) {
62
+ for (int i = 0 ; avCodec.sample_fmts [i] != -1 ; ++i) {
63
+ if (avCodec.sample_fmts [i] == preferredFormat) {
64
+ return preferredFormat;
65
+ }
66
+ }
67
+ }
68
+ // We should always find a match in preferredFormatsOrder, so we should always
69
+ // return earlier. But in the event that a future FFmpeg version defines an
70
+ // additional sample format that isn't in preferredFormatsOrder, we fallback:
71
+ return avCodec.sample_fmts [0 ];
72
+ }
73
+
36
74
} // namespace
37
75
38
76
// Nothing to release by hand here, so the compiler-generated destructor body
// is used.
AudioEncoder::~AudioEncoder() = default;
@@ -47,6 +85,8 @@ AudioEncoder::AudioEncoder(
47
85
wf_.dtype () == torch::kFloat32 ,
48
86
" waveform must have float32 dtype, got " ,
49
87
wf_.dtype ());
88
+ // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
89
+ // planar (fltp).
50
90
TORCH_CHECK (
51
91
wf_.dim () == 2 , " waveform must have 2 dimensions, got " , wf_.dim ());
52
92
@@ -92,14 +132,10 @@ AudioEncoder::AudioEncoder(
92
132
validateSampleRate (*avCodec, sampleRate);
93
133
avCodecContext_->sample_rate = sampleRate;
94
134
95
- // Note: This is the format of the **input** waveform. This doesn't determine
96
- // the output.
97
- // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
98
- // planar.
99
- // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will
100
- // raise. We need to handle this, probably converting the format with
101
- // libswresample.
102
- avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;
135
+ // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
136
+ // may need to convert the wf into a supported output sample format, which is
137
+ // what the `.sample_fmt` defines.
138
+ avCodecContext_->sample_fmt = findBestOutputSampleFormat (*avCodec);
103
139
104
140
int numChannels = static_cast <int >(wf_.sizes ()[0 ]);
105
141
TORCH_CHECK (
@@ -120,12 +156,6 @@ AudioEncoder::AudioEncoder(
120
156
" avcodec_open2 failed: " ,
121
157
getFFMPEGErrorStringFromErrorCode (status));
122
158
123
- TORCH_CHECK (
124
- avCodecContext_->frame_size > 0 ,
125
- " frame_size is " ,
126
- avCodecContext_->frame_size ,
127
- " . Cannot encode. This should probably never happen?" );
128
-
129
159
// We're allocating the stream here. Streams are meant to be freed by
130
160
// avformat_free_context(avFormatContext), which we call in the
131
161
// avFormatContext_'s destructor.
@@ -143,8 +173,11 @@ AudioEncoder::AudioEncoder(
143
173
void AudioEncoder::encode () {
144
174
UniqueAVFrame avFrame (av_frame_alloc ());
145
175
TORCH_CHECK (avFrame != nullptr , " Couldn't allocate AVFrame." );
146
- avFrame->nb_samples = avCodecContext_->frame_size ;
147
- avFrame->format = avCodecContext_->sample_fmt ;
176
+ // Default to 256 like in torchaudio
177
+ int numSamplesAllocatedPerFrame =
178
+ avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256 ;
179
+ avFrame->nb_samples = numSamplesAllocatedPerFrame;
180
+ avFrame->format = AV_SAMPLE_FMT_FLTP;
148
181
avFrame->sample_rate = avCodecContext_->sample_rate ;
149
182
avFrame->pts = 0 ;
150
183
setChannelLayout (avFrame, avCodecContext_);
@@ -160,7 +193,6 @@ void AudioEncoder::encode() {
160
193
uint8_t * pwf = static_cast <uint8_t *>(wf_.data_ptr ());
161
194
int numSamples = static_cast <int >(wf_.sizes ()[1 ]); // per channel
162
195
int numEncodedSamples = 0 ; // per channel
163
- int numSamplesPerFrame = avCodecContext_->frame_size ; // per channel
164
196
int numBytesPerSample = static_cast <int >(wf_.element_size ());
165
197
int numBytesPerChannel = numSamples * numBytesPerSample;
166
198
@@ -178,7 +210,7 @@ void AudioEncoder::encode() {
178
210
getFFMPEGErrorStringFromErrorCode (status));
179
211
180
212
int numSamplesToEncode =
181
- std::min (numSamplesPerFrame , numSamples - numEncodedSamples);
213
+ std::min (numSamplesAllocatedPerFrame , numSamples - numEncodedSamples);
182
214
int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
183
215
184
216
for (int ch = 0 ; ch < wf_.sizes ()[0 ]; ch++) {
@@ -211,7 +243,37 @@ void AudioEncoder::encode() {
211
243
212
244
void AudioEncoder::encodeInnerLoop (
213
245
AutoAVPacket& autoAVPacket,
214
- const UniqueAVFrame& avFrame) {
246
+ const UniqueAVFrame& srcAVFrame) {
247
+ bool mustConvert =
248
+ (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
249
+ srcAVFrame != nullptr );
250
+ UniqueAVFrame convertedAVFrame;
251
+ if (mustConvert) {
252
+ if (!swrContext_) {
253
+ swrContext_.reset (createSwrContext (
254
+ avCodecContext_,
255
+ AV_SAMPLE_FMT_FLTP,
256
+ avCodecContext_->sample_fmt ,
257
+ srcAVFrame->sample_rate , // No sample rate conversion
258
+ srcAVFrame->sample_rate ));
259
+ }
260
+ convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate (
261
+ swrContext_,
262
+ srcAVFrame,
263
+ avCodecContext_->sample_fmt ,
264
+ srcAVFrame->sample_rate , // No sample rate conversion
265
+ srcAVFrame->sample_rate );
266
+ TORCH_CHECK (
267
+ convertedAVFrame->nb_samples == srcAVFrame->nb_samples ,
268
+ " convertedAVFrame->nb_samples=" ,
269
+ convertedAVFrame->nb_samples ,
270
+ " differs from " ,
271
+ " srcAVFrame->nb_samples=" ,
272
+ srcAVFrame->nb_samples ,
273
+ " This is unexpected, please report on the TorchCodec bug tracker." );
274
+ }
275
+ const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
276
+
215
277
auto status = avcodec_send_frame (avCodecContext_.get (), avFrame.get ());
216
278
TORCH_CHECK (
217
279
status == AVSUCCESS,
@@ -248,6 +310,9 @@ void AudioEncoder::encodeInnerLoop(
248
310
}
249
311
250
312
void AudioEncoder::flushBuffers () {
313
+ // We flush the main FFmpeg buffers, but not swresample buffers. Flushing
314
+ // swresample is only necessary when converting sample rates, which we don't
315
+ // do for encoding.
251
316
AutoAVPacket autoAVPacket;
252
317
encodeInnerLoop (autoAVPacket, UniqueAVFrame (nullptr ));
253
318
}
0 commit comments