Commit c2bea4b
Validation of allocated output tensor shapes (#339)

1 parent 284a2f0 commit c2bea4b
3 files changed, +108 -69 lines changed

src/torchcodec/decoders/_core/CudaDevice.cpp (+3)
```diff
@@ -224,6 +224,9 @@ void convertAVFrameToDecodedOutputOnCuda(
 
   auto start = std::chrono::high_resolution_clock::now();
 
+  // TODO height and width info of output tensor comes from the metadata, which
+  // may not be accurate. How do we make sure we won't corrupt memory if the
+  // allocated tensor is too short/large?
   NppStatus status = nppiNV12ToRGB_8u_P2C3R(
       input,
       src->linesize[0],
```
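The TODO above flags a real risk: on the CUDA path the output tensor's height and width come from stream metadata, which can disagree with the frame FFmpeg actually decoded. A minimal sketch of one possible mitigation, not part of this commit: validate the allocated tensor against the decoded AVFrame's dimensions before handing its memory to NPP. Here `dst` is a hypothetical name for the allocated HWC output tensor; `src` is the decoded frame, as in the surrounding code.

```cpp
// Hypothetical sketch (not in this commit): check the allocated tensor's
// shape against the decoded frame's actual dimensions before the NPP call,
// so a metadata mismatch fails loudly instead of corrupting memory.
auto shape = dst.sizes(); // dst: the allocated HWC uint8 output tensor
TORCH_CHECK(
    shape.size() == 3 && shape[0] == src->height && shape[1] == src->width &&
        shape[2] == 3,
    "Allocated tensor of shape ",
    shape,
    " does not match decoded frame dimensions ",
    src->height,
    "x",
    src->width,
    "x3");
```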

src/torchcodec/decoders/_core/VideoDecoder.cpp (+72 -45)
```diff
@@ -880,7 +880,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
 // speed-up when swscale is used. With swscale, we can tell ffmpeg to place the
 // decoded frame directly into `preAllocatedOutputTensor.data_ptr()`. We
 // haven't yet found a way to do that with filtergraph.
-// TODO: Figure out whether that's possilbe!
+// TODO: Figure out whether that's possible!
 // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
 void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
@@ -890,41 +890,68 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
   int streamIndex = rawOutput.streamIndex;
   AVFrame* frame = rawOutput.frame.get();
   auto& streamInfo = streams_[streamIndex];
-  torch::Tensor tensor;
+
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame);
+  int expectedOutputHeight = frameDims.height;
+  int expectedOutputWidth = frameDims.width;
+
+  if (preAllocatedOutputTensor.has_value()) {
+    auto shape = preAllocatedOutputTensor.value().sizes();
+    TORCH_CHECK(
+        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
+            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
+        "Expected pre-allocated tensor of shape ",
+        expectedOutputHeight,
+        "x",
+        expectedOutputWidth,
+        "x3, got ",
+        shape);
+  }
+
+  torch::Tensor outputTensor;
   if (output.streamType == AVMEDIA_TYPE_VIDEO) {
     if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-      auto frameDims =
-          getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame);
-      int height = frameDims.height;
-      int width = frameDims.width;
-      if (preAllocatedOutputTensor.has_value()) {
-        tensor = preAllocatedOutputTensor.value();
-        auto shape = tensor.sizes();
-        TORCH_CHECK(
-            (shape.size() == 3) && (shape[0] == height) &&
-                (shape[1] == width) && (shape[2] == 3),
-            "Expected tensor of shape ",
-            height,
-            "x",
-            width,
-            "x3, got ",
-            shape);
-      } else {
-        tensor = allocateEmptyHWCTensor(height, width, torch::kCPU);
-      }
-      rawOutput.data = tensor.data_ptr<uint8_t>();
-      convertFrameToBufferUsingSwsScale(rawOutput);
-
-      output.frame = tensor;
+      outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
+          expectedOutputHeight, expectedOutputWidth, torch::kCPU));
+
+      int resultHeight =
+          convertFrameToBufferUsingSwsScale(streamIndex, frame, outputTensor);
+      // If this check failed, it would mean that the frame wasn't reshaped to
+      // the expected height.
+      // TODO: Can we do the same check for width?
+      TORCH_CHECK(
+          resultHeight == expectedOutputHeight,
+          "resultHeight != expectedOutputHeight: ",
+          resultHeight,
+          " != ",
+          expectedOutputHeight);
+
+      output.frame = outputTensor;
     } else if (
         streamInfo.colorConversionLibrary ==
         ColorConversionLibrary::FILTERGRAPH) {
-      tensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame);
+      outputTensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame);
+
+      // Similarly to above, if this check fails it means the frame wasn't
+      // reshaped to its expected dimensions by filtergraph.
+      auto shape = outputTensor.sizes();
+      TORCH_CHECK(
+          (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
+              (shape[1] == expectedOutputWidth) && (shape[2] == 3),
+          "Expected output tensor of shape ",
+          expectedOutputHeight,
+          "x",
+          expectedOutputWidth,
+          "x3, got ",
+          shape);
       if (preAllocatedOutputTensor.has_value()) {
-        preAllocatedOutputTensor.value().copy_(tensor);
+        // We have already validated that preAllocatedOutputTensor and
+        // outputTensor have the same shape.
+        preAllocatedOutputTensor.value().copy_(outputTensor);
         output.frame = preAllocatedOutputTensor.value();
       } else {
-        output.frame = tensor;
+        output.frame = outputTensor;
       }
     } else {
       throw std::runtime_error(
```
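`allocateEmptyHWCTensor` is not shown in this diff; judging from its name and call site, it is presumably a thin wrapper along the lines of the sketch below. One side note on the new code: `std::optional::value_or` evaluates its argument eagerly, so the empty tensor is allocated even when a pre-allocated tensor is supplied.

```cpp
// Sketch of what allocateEmptyHWCTensor presumably does; the real helper is
// defined elsewhere in the codebase and is not part of this diff.
torch::Tensor allocateEmptyHWCTensor(int height, int width, torch::Device device) {
  // Decoded frames are stored as uint8 in HWC order, hence the {H, W, 3} shape.
  return torch::empty(
      {height, width, 3},
      torch::TensorOptions().dtype(torch::kUInt8).device(device));
}
```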
```diff
@@ -1303,24 +1330,23 @@ double VideoDecoder::getPtsSecondsForFrame(
   return ptsToSeconds(stream.allFrames[frameIndex].pts, stream.timeBase);
 }
 
-void VideoDecoder::convertFrameToBufferUsingSwsScale(
-    RawDecodedOutput& rawOutput) {
-  AVFrame* frame = rawOutput.frame.get();
-  int streamIndex = rawOutput.streamIndex;
+int VideoDecoder::convertFrameToBufferUsingSwsScale(
+    int streamIndex,
+    const AVFrame* frame,
+    torch::Tensor& outputTensor) {
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(frame->format);
   StreamInfo& activeStream = streams_[streamIndex];
-  auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(activeStream.options, *frame);
-  int outputHeight = frameDims.height;
-  int outputWidth = frameDims.width;
+
+  int expectedOutputHeight = outputTensor.sizes()[0];
+  int expectedOutputWidth = outputTensor.sizes()[1];
   if (activeStream.swsContext.get() == nullptr) {
     SwsContext* swsContext = sws_getContext(
         frame->width,
         frame->height,
         frameFormat,
-        outputWidth,
-        outputHeight,
+        expectedOutputWidth,
+        expectedOutputHeight,
         AV_PIX_FMT_RGB24,
         SWS_BILINEAR,
         nullptr,
@@ -1352,8 +1378,8 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale(
   }
   SwsContext* swsContext = activeStream.swsContext.get();
   uint8_t* pointers[4] = {
-      static_cast<uint8_t*>(rawOutput.data), nullptr, nullptr, nullptr};
-  int linesizes[4] = {outputWidth * 3, 0, 0, 0};
+      outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr};
+  int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0};
   int resultHeight = sws_scale(
       swsContext,
       frame->data,
@@ -1362,9 +1388,7 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale(
       frame->height,
       pointers,
      linesizes);
-  TORCH_CHECK(
-      outputHeight == resultHeight,
-      "outputHeight(" + std::to_string(resultHeight) + ") != resultHeight");
+  return resultHeight;
 }
 
 torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph(
```
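Returning `resultHeight` works because `sws_scale` returns the height of the output slice it wrote; the `TORCH_CHECK` that used to live here moves to the call site, which knows the expected height. A self-contained illustration of that contract, with hypothetical names (this helper is not from the commit):

```cpp
// Hypothetical helper illustrating the sws_scale contract relied on above:
// the return value is the number of output rows written, so a destination
// buffer allocated with the wrong height is detectable after the call.
int scaleIntoRGB24Buffer(
    SwsContext* ctx,
    const AVFrame* src,
    uint8_t* dst,
    int dstWidth,
    int dstHeight) {
  uint8_t* dstPlanes[4] = {dst, nullptr, nullptr, nullptr};
  int dstLinesizes[4] = {dstWidth * 3, 0, 0, 0}; // packed RGB24, 3 bytes/pixel
  int rowsWritten = sws_scale(
      ctx, src->data, src->linesize, 0, src->height, dstPlanes, dstLinesizes);
  TORCH_CHECK(
      rowsWritten == dstHeight,
      "sws_scale wrote ",
      rowsWritten,
      " rows, expected ",
      dstHeight);
  return rowsWritten;
}
```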
```diff
@@ -1379,8 +1403,7 @@ torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph(
   ffmpegStatus =
       av_buffersink_get_frame(filterState.sinkContext, filteredFrame.get());
   TORCH_CHECK_EQ(filteredFrame->format, AV_PIX_FMT_RGB24);
-  auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(
-      streams_[streamIndex].options, *filteredFrame.get());
+  auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredFrame.get());
   int height = frameDims.height;
   int width = frameDims.width;
   std::vector<int64_t> shape = {height, width, 3};
@@ -1406,6 +1429,10 @@ VideoDecoder::~VideoDecoder() {
   }
 }
 
+FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
+  return FrameDims(resizedAVFrame.height, resizedAVFrame.width);
+}
+
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     const VideoDecoder::StreamMetadata& metadata) {
```

src/torchcodec/decoders/_core/VideoDecoder.h (+33 -24)
```diff
@@ -383,7 +383,10 @@ class VideoDecoder {
   torch::Tensor convertFrameToTensorUsingFilterGraph(
       int streamIndex,
       const AVFrame* frame);
-  void convertFrameToBufferUsingSwsScale(RawDecodedOutput& rawOutput);
+  int convertFrameToBufferUsingSwsScale(
+      int streamIndex,
+      const AVFrame* frame,
+      torch::Tensor& outputTensor);
   DecodedOutput convertAVFrameToDecodedOutput(
       RawDecodedOutput& rawOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
@@ -426,30 +429,32 @@ class VideoDecoder {
 // MaybePermuteHWC2CHW().
 //
 // Also, importantly, the way we figure out the height and width of the
-// output frame varies and depends on the decoding entry-point:
-// - In all cases, if the user requested specific height and width from the
-// options, we honor that. Otherwise we fall into one of the categories below.
-// - In Batch decoding APIs (e.g. getFramesAtIndices), we get height and width
-// from the stream metadata, which itself got its value from the CodecContext,
-// when the stream was added.
-// - In single frames APIs:
-//   - On CPU we get height and width from the AVFrame.
-//   - On GPU, we get height and width from the metadata (same as batch APIs)
-//
-// These 2 strategies are encapsulated within
-// getHeightAndWidthFromOptionsOrMetadata() and
-// getHeightAndWidthFromOptionsOrAVFrame(). The reason they exist is to make it
-// very obvious which logic is used in which place, and they allow for `git
-// grep`ing.
+// output frame tensor varies, and depends on the decoding entry-point. In
+// *decreasing order of accuracy*, we use the following sources for determining
+// height and width:
+// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
+// AVFrame, *post*-resizing. This is only used for single-frame decoding APIs,
+// on CPU, with filtergraph.
+// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
+// the user-specified options if they exist, or the height and width of the
+// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
+// our code or within FFmpeg code, this should be exactly the same as
+// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
+// decoding APIs, on CPU, with swscale.
+// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
+// the user-specified options if they exist, or the height and width from the
+// stream metadata, which itself got its value from the CodecContext when the
+// stream was added. This is used by batch decoding APIs, or by GPU APIs (both
+// batch and single-frame).
 //
-// The source of truth for height and width really is the AVFrame: it's the
-// decoded ouptut from FFmpeg. The info from the metadata (i.e. from the
-// CodecContext) may not be as accurate. However, the AVFrame is only available
-// late in the call stack, when the frame is decoded, while the CodecContext is
-// available early when a stream is added. This is why we use the CodecContext
-// for pre-allocating batched output tensors (we could pre-allocate those only
-// once we decode the first frame to get the info frame the AVFrame, but that's
-// a more complex logic).
+// The source of truth for height and width really is the (resized) AVFrame:
+// it's the decoded output from FFmpeg. The info from the metadata (i.e. from
+// the CodecContext) may not be as accurate. However, the AVFrame is only
+// available late in the call stack, when the frame is decoded, while the
+// CodecContext is available early, when a stream is added. This is why we use
+// the CodecContext for pre-allocating batched output tensors (we could
+// pre-allocate those only once we decode the first frame to get the info from
+// the AVFrame, but that's more complex logic).
 //
 // Because the sources for height and width may disagree, we may end up with
 // conflicts: e.g. if we pre-allocate a batch output tensor based on the
```
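The two options-based helpers described above are defined in VideoDecoder.cpp, outside this diff. Based on the comment's description, they presumably reduce to something like the sketch below; note that the `std::optional` types of the options and metadata fields are an assumption here, not visible in this commit.

```cpp
// Hypothetical sketches consistent with the comment above; the real
// definitions live in VideoDecoder.cpp and are not shown in this diff.
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
    const VideoDecoder::VideoStreamDecoderOptions& options,
    const AVFrame& avFrame) {
  // User-requested dimensions win; otherwise use the (pre-resize) AVFrame.
  return FrameDims(
      options.height.value_or(avFrame.height),
      options.width.value_or(avFrame.width));
}

FrameDims getHeightAndWidthFromOptionsOrMetadata(
    const VideoDecoder::VideoStreamDecoderOptions& options,
    const VideoDecoder::StreamMetadata& metadata) {
  // User-requested dimensions win; otherwise use the CodecContext-derived
  // values recorded in the stream metadata when the stream was added
  // (assuming the metadata fields are std::optional<int>).
  return FrameDims(
      options.height.value_or(*metadata.height),
      options.width.value_or(*metadata.width));
}
```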
```diff
@@ -463,6 +468,10 @@ struct FrameDims {
   FrameDims(int h, int w) : height(h), width(w) {}
 };
 
+// There's nothing preventing you from calling this on a non-resized frame, but
+// please don't.
+FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
+
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
     const VideoDecoder::VideoStreamDecoderOptions& options,
     const VideoDecoder::StreamMetadata& metadata);
```
