meta-pytorch · IteratorandIterator · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 22, 2026
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -243,7 +243,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
     flush();
     unmapPreviousFrame();
     NVDECCache::getCache(device_).returnDecoder(
-        &videoFormat_, std::move(decoder_));
+        &videoFormat_, std::move(decoder_), decoderId_);
   }
 
   if (videoParser_) {
@@ -270,7 +270,6 @@ void BetaCudaDeviceInterface::initialize(
     // We'll always use the CPU fallback from now on, so we can return early.
     return;
   }
-
   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
   frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -387,6 +386,29 @@ void BetaCudaDeviceInterface::initializeBSF(
       getFFMPEGErrorStringFromErrorCode(retVal));
 }
 
+
+// Points the already-created decoder_ at the stream described by videoFormat.
+// Returns videoFormat_.min_num_decode_surfaces for streamPropertyChange().
+int BetaCudaDeviceInterface::reconfigureNVDECDecoder(CUVIDEOFORMAT* videoFormat) {
+  CUVIDRECONFIGUREDECODERINFO info = {};
+  info.ulWidth  = videoFormat->coded_width;
+  info.ulHeight = videoFormat->coded_height;
+  info.ulTargetWidth  = videoFormat->display_area.right - videoFormat->display_area.left;
+  info.ulTargetHeight = videoFormat->display_area.bottom - videoFormat->display_area.top;
+  info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
+  info.display_area.left   = videoFormat->display_area.left;
+  info.display_area.top    = videoFormat->display_area.top;
+  info.display_area.right  = videoFormat->display_area.right;
+  info.display_area.bottom = videoFormat->display_area.bottom;
+
+  // The result must be checked: on failure the decoder was not switched to
+  // this format, and carrying on would decode with stale settings.
+  CUresult result = cuvidReconfigureDecoder(*decoder_, &info);
+  TORCH_CHECK(result == CUDA_SUCCESS, "Failed to reconfigure NVDEC decoder: ", result);
+  return static_cast<int>(videoFormat_.min_num_decode_surfaces);
+}
+
+
 // This callback is called by the parser within cuvidParseVideoData when there
 // is a change in the stream's properties (like resolution change), as specified
 // by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
@@ -404,14 +426,42 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
     videoFormat_.min_num_decode_surfaces = 20;
   }
 
+  CUvideodecoderCache decoderCache;
+
   if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);
-
-    if (!decoder_) {
-      // TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
-      // be enabled in DALI by default.
+    decoderCache = NVDECCache::getCache(device_).getDecoder(videoFormat);
+
+    auto cache_type = std::get<0>(decoderCache);
+    if (cache_type == NVDECCacheType::Reconfig) {
+      // Need to reconfigure existing decoder
+      auto decoderId = std::get<2>(decoderCache);
+      decoderId_ = decoderId;
+
+      decoder_ = std::move(std::get<1>(decoderCache));
+      TORCH_CHECK(decoder_, "Failed to get decoder from cache");
+      return reconfigureNVDECDecoder(videoFormat);
+    }
+    else if (cache_type == NVDECCacheType::Reuse) {
+      // Can reuse existing decoder as is
+      auto decoderId = std::get<2>(decoderCache);
+      decoderId_ = decoderId;
+
+      decoder_ = std::move(std::get<1>(decoderCache));
+    }
+    else if (cache_type == NVDECCacheType::Create) {
+      // Need to create a new decoder
+      TORCH_CHECK(!decoder_, "Decoder should be null here");
       decoder_ = createDecoder(videoFormat);
+      decoderId_ = NVDECCache::getCache(device_).allocDecoderId();
+      NVDECCache::getCache(device_).registerDecoderId(
+          decoderId_,
+          videoFormat->coded_width,
+          videoFormat->coded_height,
+          // The number of surfaces used in reconfig cannot exceed the number of surfaces 
+          // specified when creating the decoder. In most cases, this should not be a problem.
+          // We'll see the real result based on whether the tests pass.
+          videoFormat->min_num_decode_surfaces
+      );
     }
 
     TORCH_CHECK(decoder_, "Failed to get or create decoder");

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -53,6 +53,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   int receiveFrame(UniqueAVFrame& avFrame) override;
   void flush() override;
 
+  int reconfigureNVDECDecoder(CUVIDEOFORMAT* videoFormat);
+
   // NVDEC callback functions (must be public for C callbacks)
   int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
   int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
@@ -102,6 +104,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   bool nvcuvidAvailable_ = false;
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;
+
+  uint32_t decoderId_ = 0;
 };
 
 } // namespace facebook::torchcodec

diff --git a/src/torchcodec/_core/NVCUVIDRuntimeLoader.cpp b/src/torchcodec/_core/NVCUVIDRuntimeLoader.cpp
@@ -103,6 +103,7 @@ typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder, int, unsigned int*,
 typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder, unsigned int);
 typedef CUresult CUDAAPI tcuvidMapVideoFrame64(CUvideodecoder, int, unsigned long long*, unsigned int*, CUVIDPROCPARAMS*);
 typedef CUresult CUDAAPI tcuvidUnmapVideoFrame64(CUvideodecoder, unsigned long long);
+typedef CUresult CUDAAPI tcuvidReconfigureDecoder(CUvideodecoder, CUVIDRECONFIGUREDECODERINFO*);
 /* clang-format on */
 
 // Global function pointers - will be dynamically loaded
@@ -117,6 +118,7 @@ static tcuvidMapVideoFrame* dl_cuvidMapVideoFrame = nullptr;
 static tcuvidUnmapVideoFrame* dl_cuvidUnmapVideoFrame = nullptr;
 static tcuvidMapVideoFrame64* dl_cuvidMapVideoFrame64 = nullptr;
 static tcuvidUnmapVideoFrame64* dl_cuvidUnmapVideoFrame64 = nullptr;
+static tcuvidReconfigureDecoder* dl_cuvidReconfigureDecoder = nullptr;
 
 static tHandle g_nvcuvid_handle = nullptr;
 static std::mutex g_nvcuvid_mutex;
@@ -128,7 +130,7 @@ bool isLoaded() {
       dl_cuvidCreateDecoder && dl_cuvidDestroyDecoder &&
       dl_cuvidDecodePicture && dl_cuvidMapVideoFrame &&
       dl_cuvidUnmapVideoFrame && dl_cuvidMapVideoFrame64 &&
-      dl_cuvidUnmapVideoFrame64);
+      dl_cuvidUnmapVideoFrame64 && dl_cuvidReconfigureDecoder);
 }
 
 template <typename T>
@@ -202,6 +204,8 @@ bool loadNVCUVIDLibrary() {
       bindFunction<tcuvidMapVideoFrame64>("cuvidMapVideoFrame64");
   dl_cuvidUnmapVideoFrame64 =
       bindFunction<tcuvidUnmapVideoFrame64>("cuvidUnmapVideoFrame64");
+  dl_cuvidReconfigureDecoder =
+      bindFunction<tcuvidReconfigureDecoder>("cuvidReconfigureDecoder");
 
   return isLoaded();
 }
@@ -315,6 +319,14 @@ cuvidUnmapVideoFrame64(CUvideodecoder decoder, unsigned long long framePtr) {
   return facebook::torchcodec::dl_cuvidUnmapVideoFrame64(decoder, framePtr);
 }
 
+// Forwards to the dynamically loaded cuvidReconfigureDecoder symbol.
+CUresult CUDAAPI cuvidReconfigureDecoder(
+    CUvideodecoder decoder, CUVIDRECONFIGUREDECODERINFO* pDecReconfigParams) {
+  auto* impl = facebook::torchcodec::dl_cuvidReconfigureDecoder;
+  TORCH_CHECK(impl, "cuvidReconfigureDecoder called but NVCUVID not loaded!");
+  return impl(decoder, pDecReconfigParams);
+}
+
 } // extern "C"
 
 #endif // FBCODE_CAFFE2
diff --git a/src/torchcodec/_core/NVDECCache.cpp b/src/torchcodec/_core/NVDECCache.cpp
@@ -25,36 +25,140 @@ NVDECCache& NVDECCache::getCache(const torch::Device& device) {
   return cacheInstances[getDeviceIndex(device)];
 }
 
-UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
+// Returns the next decoder ID, advancing the counter while holding cacheLock_.
+uint32_t NVDECCache::allocDecoderId() {
+  std::lock_guard<std::mutex> guard(cacheLock_);
+  const uint32_t id = nextId++;
+  return id;
+}
+
+// Records the limits of a newly created decoder under `decoderId`; these are
+// the bounds getDecoder() later compares against when deciding whether a
+// cached decoder can serve another stream.
+//
+// Returns false, leaving the existing entry untouched, if `decoderId` was
+// already registered; true if a new entry was stored.
+bool NVDECCache::registerDecoderId(
+    uint32_t decoderId,
+    uint32_t ulMaxWidth,
+    uint32_t ulMaxHeight,
+    uint8_t ulMaxNumDecodeSurfaces) {
+  std::lock_guard<std::mutex> lock(cacheLock_);
+  if (id_context_map_.count(decoderId) != 0) {
+    return false;
+  }
+  DecoderMaxWHContext limits{
+      decoderId, ulMaxWidth, ulMaxHeight, ulMaxNumDecodeSurfaces};
+  id_context_map_[decoderId] = limits;
+  return true;
+}
+
+
+// Looks for a cached decoder usable for videoFormat. Returns (how to use it,
+// the decoder or null, its ID); see the passes below for the selection order.
+CUvideodecoderCache NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
+  NVDECCacheType cache_type = NVDECCacheType::Create;
   CacheKey key(videoFormat);
   std::lock_guard<std::mutex> lock(cacheLock_);
 
   auto it = cache_.find(key);
-  if (it != cache_.end()) {
-    auto decoder = std::move(it->second);
-    cache_.erase(it);
-    return decoder;
+  if (it != cache_.end() && it->second.size() > 0) {
+    auto it2 = context_cache_.find(key);
+    TORCH_CHECK(
+      it2 != context_cache_.end(),
+      "Decoder context cache inconsistency detected."
+    );
+    TORCH_CHECK(
+      it->second.size() == it2->second.size(),
+      "Size of cache_[key] and context_cache_[key] do not match."
+    );
+
+    // cache_[key] and context_cache_[key] are parallel lists: entry i of one
+    // describes entry i of the other. `take` removes the entry at `ctxIt`
+    // from both and packages it for the caller.
+    auto take = [&](auto ctxIt, NVDECCacheType type) {
+      // Copy the ID out first: erase() destroys the element, so reading it
+      // afterwards through a reference would be a use-after-free.
+      const auto decoderId = ctxIt->second.decoderID;
+      auto decoderIt = it->second.begin();
+      std::advance(decoderIt, std::distance(it2->second.begin(), ctxIt));
+      auto decoder = std::move(*decoderIt);
+      it->second.erase(decoderIt);
+      it2->second.erase(ctxIt);
+      return std::make_tuple(type, std::move(decoder), decoderId);
+    };
+
+    // Pass 1: prefer a decoder whose current configuration already matches
+    // the requested surface count and coded dimensions; it is reusable as is.
+    for (auto bg = it2->second.begin(); bg != it2->second.end(); ++bg) {
+      const auto& current = bg->first;
+      if (current.numDecodeSurfaces == videoFormat->min_num_decode_surfaces &&
+          current.coded_width == videoFormat->coded_width &&
+          current.coded_height == videoFormat->coded_height) {
+        cache_type = NVDECCacheType::Reuse;
+        return take(bg, cache_type);
+      }
+    }
+
+    // Pass 2: otherwise accept any decoder whose registered maximums cover
+    // the request; the caller must reconfigure it before use.
+    for (auto bg = it2->second.begin(); bg != it2->second.end(); ++bg) {
+      const auto& limits = bg->second;
+      if (limits.ulMaxWidth >= videoFormat->coded_width &&
+          limits.ulMaxHeight >= videoFormat->coded_height &&
+          limits.ulMaxNumDecodeSurfaces >=
+              videoFormat->min_num_decode_surfaces) {
+        cache_type = NVDECCacheType::Reconfig;
+        return take(bg, cache_type);
+      }
+    }
   }
 
-  return nullptr;
+  // Nothing suitable is cached: cache_type is still Create and no decoder is
+  // handed out, so the caller has to build one.
+  return std::make_tuple(cache_type, nullptr, 0);
 }
 
 bool NVDECCache::returnDecoder(
     CUVIDEOFORMAT* videoFormat,
-    UniqueCUvideodecoder decoder) {
+    UniqueCUvideodecoder decoder,
+    uint32_t decoderId) {
   if (!decoder) {
     return false;
   }
 
   CacheKey key(videoFormat);
   std::lock_guard<std::mutex> lock(cacheLock_);
 
-  if (cache_.size() >= MAX_CACHE_SIZE) {
+  uint32_t current_cache_size = 0;
+  for (const auto& pair : cache_) {
+    current_cache_size += pair.second.size();
+  }
+
+  if (current_cache_size >= MAX_CACHE_SIZE) {
     return false;
   }
 
-  cache_[key] = std::move(decoder);
+  auto it = id_context_map_.find(decoderId);
+  TORCH_CHECK(
+    it != id_context_map_.end(),
+    "Decoder ID not registered in id_context_map_"
+  );
+
+  cache_[key].push_back(std::move(decoder));
+  context_cache_[key].push_back(
+    std::make_pair(
+      VideoDecodeContext{
+        videoFormat->min_num_decode_surfaces,
+        videoFormat->coded_width,
+        videoFormat->coded_height
+      },
+      it->second
+    )
+  );
+
   return true;
 }
 
+
 } // namespace facebook::torchcodec