Skip to content
66 changes: 58 additions & 8 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
flush();
unmapPreviousFrame();
NVDECCache::getCache(device_).returnDecoder(
&videoFormat_, std::move(decoder_));
&videoFormat_, std::move(decoder_), decoderId_);
}

if (videoParser_) {
Expand All @@ -270,7 +270,6 @@ void BetaCudaDeviceInterface::initialize(
// We'll always use the CPU fallback from now on, so we can return early.
return;
}

TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;
frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
Expand Down Expand Up @@ -387,6 +386,29 @@ void BetaCudaDeviceInterface::initializeBSF(
getFFMPEGErrorStringFromErrorCode(retVal));
}


// Reconfigures the existing NVDEC decoder held in decoder_ for the stream
// properties described by videoFormat, instead of creating a new decoder.
//
// Parameters:
//   videoFormat: stream properties to reconfigure for; must not be null.
//
// Returns videoFormat_.min_num_decode_surfaces (the member copy, not the
// argument) as an int. streamPropertyChange returns this value directly to
// its own caller.
//
// Fails a TORCH_CHECK if videoFormat is null, if decoder_ is null, or if
// cuvidReconfigureDecoder does not report CUDA_SUCCESS.
int BetaCudaDeviceInterface::reconfigureNVDECDecoder(CUVIDEOFORMAT* videoFormat) {
  TORCH_CHECK(videoFormat != nullptr, "Invalid video format");
  TORCH_CHECK(decoder_, "Cannot reconfigure a null decoder");

  CUVIDRECONFIGUREDECODERINFO info = {};

  // Coded (bitstream) dimensions of the new stream.
  info.ulWidth = videoFormat->coded_width;
  info.ulHeight = videoFormat->coded_height;

  // Output dimensions are the size of the display rectangle.
  info.ulTargetWidth =
      videoFormat->display_area.right - videoFormat->display_area.left;
  info.ulTargetHeight =
      videoFormat->display_area.bottom - videoFormat->display_area.top;

  info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;

  info.display_area.left = videoFormat->display_area.left;
  info.display_area.top = videoFormat->display_area.top;
  info.display_area.right = videoFormat->display_area.right;
  info.display_area.bottom = videoFormat->display_area.bottom;

  // Do not ignore the driver's verdict: continuing after a failed
  // reconfiguration would decode with a decoder set up for another stream.
  const CUresult result = cuvidReconfigureDecoder(*decoder_, &info);
  TORCH_CHECK(
      result == CUDA_SUCCESS, "Failed to reconfigure NVDEC decoder: ", result);

  return static_cast<int>(videoFormat_.min_num_decode_surfaces);
}


// This callback is called by the parser within cuvidParseVideoData when there
// is a change in the stream's properties (like resolution change), as specified
// by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
Expand All @@ -404,14 +426,42 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
videoFormat_.min_num_decode_surfaces = 20;
}

CUvideodecoderCache decoderCache;

if (!decoder_) {
decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

if (!decoder_) {
// TODONVDEC P2: consider re-configuring an existing decoder instead of
// re-creating one. See docs, see DALI. Re-configuration doesn't seem to
// be enabled in DALI by default.
decoderCache = NVDECCache::getCache(device_).getDecoder(videoFormat);

auto cache_type = std::get<0>(decoderCache);
if (cache_type == NVDECCacheType::Reconfig) {
// Need to reconfigure existing decoder
auto decoderId = std::get<2>(decoderCache);
decoderId_ = decoderId;

decoder_ = std::move(std::get<1>(decoderCache));
TORCH_CHECK(decoder_, "Failed to get decoder from cache");
return reconfigureNVDECDecoder(videoFormat);
}
else if (cache_type == NVDECCacheType::Reuse) {
// Can reuse existing decoder as is
auto decoderId = std::get<2>(decoderCache);
decoderId_ = decoderId;

decoder_ = std::move(std::get<1>(decoderCache));
}
else if (cache_type == NVDECCacheType::Create) {
// Need to create a new decoder
TORCH_CHECK(!decoder_, "Decoder should be null here");
decoder_ = createDecoder(videoFormat);
decoderId_ = NVDECCache::getCache(device_).allocDecoderId();
NVDECCache::getCache(device_).registerDecoderId(
decoderId_,
videoFormat->coded_width,
videoFormat->coded_height,
// The number of surfaces used in reconfig cannot exceed the number of surfaces
// specified when creating the decoder. In most cases, this should not be a problem.
// We'll see the real result based on whether the tests pass.
videoFormat->min_num_decode_surfaces
);
}

TORCH_CHECK(decoder_, "Failed to get or create decoder");
Expand Down
4 changes: 4 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
int receiveFrame(UniqueAVFrame& avFrame) override;
void flush() override;

int reconfigureNVDECDecoder(CUVIDEOFORMAT* videoFormat);

// NVDEC callback functions (must be public for C callbacks)
int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
Expand Down Expand Up @@ -102,6 +104,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
bool nvcuvidAvailable_ = false;
UniqueSwsContext swsContext_;
SwsFrameContext prevSwsFrameContext_;

uint32_t decoderId_ = 0;
};

} // namespace facebook::torchcodec
Expand Down
14 changes: 13 additions & 1 deletion src/torchcodec/_core/NVCUVIDRuntimeLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder, int, unsigned int*,
typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder, unsigned int);
typedef CUresult CUDAAPI tcuvidMapVideoFrame64(CUvideodecoder, int, unsigned long long*, unsigned int*, CUVIDPROCPARAMS*);
typedef CUresult CUDAAPI tcuvidUnmapVideoFrame64(CUvideodecoder, unsigned long long);
typedef CUresult CUDAAPI tcuvidReconfigureDecoder(CUvideodecoder, CUVIDRECONFIGUREDECODERINFO*);
/* clang-format on */

// Global function pointers - will be dynamically loaded
Expand All @@ -117,6 +118,7 @@ static tcuvidMapVideoFrame* dl_cuvidMapVideoFrame = nullptr;
static tcuvidUnmapVideoFrame* dl_cuvidUnmapVideoFrame = nullptr;
static tcuvidMapVideoFrame64* dl_cuvidMapVideoFrame64 = nullptr;
static tcuvidUnmapVideoFrame64* dl_cuvidUnmapVideoFrame64 = nullptr;
static tcuvidReconfigureDecoder* dl_cuvidReconfigureDecoder = nullptr;

static tHandle g_nvcuvid_handle = nullptr;
static std::mutex g_nvcuvid_mutex;
Expand All @@ -128,7 +130,7 @@ bool isLoaded() {
dl_cuvidCreateDecoder && dl_cuvidDestroyDecoder &&
dl_cuvidDecodePicture && dl_cuvidMapVideoFrame &&
dl_cuvidUnmapVideoFrame && dl_cuvidMapVideoFrame64 &&
dl_cuvidUnmapVideoFrame64);
dl_cuvidUnmapVideoFrame64 && dl_cuvidReconfigureDecoder);
}

template <typename T>
Expand Down Expand Up @@ -202,6 +204,8 @@ bool loadNVCUVIDLibrary() {
bindFunction<tcuvidMapVideoFrame64>("cuvidMapVideoFrame64");
dl_cuvidUnmapVideoFrame64 =
bindFunction<tcuvidUnmapVideoFrame64>("cuvidUnmapVideoFrame64");
dl_cuvidReconfigureDecoder =
bindFunction<tcuvidReconfigureDecoder>("cuvidReconfigureDecoder");

return isLoaded();
}
Expand Down Expand Up @@ -315,6 +319,14 @@ cuvidUnmapVideoFrame64(CUvideodecoder decoder, unsigned long long framePtr) {
return facebook::torchcodec::dl_cuvidUnmapVideoFrame64(decoder, framePtr);
}

// Forwards to the dynamically loaded cuvidReconfigureDecoder entry point.
// Fails a TORCH_CHECK if that entry point has not been bound.
CUresult CUDAAPI cuvidReconfigureDecoder(
    CUvideodecoder decoder,
    CUVIDRECONFIGUREDECODERINFO* pDecReconfigParams) {
  auto* const reconfigureFn = facebook::torchcodec::dl_cuvidReconfigureDecoder;
  TORCH_CHECK(
      reconfigureFn,
      "cuvidReconfigureDecoder called but NVCUVID not loaded!");
  return reconfigureFn(decoder, pDecReconfigParams);
}

} // extern "C"

#endif // FBCODE_CAFFE2
122 changes: 113 additions & 9 deletions src/torchcodec/_core/NVDECCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,36 +25,140 @@ NVDECCache& NVDECCache::getCache(const torch::Device& device) {
return cacheInstances[getDeviceIndex(device)];
}

UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
// Hands out the current value of the nextId counter and advances it, all
// while holding cacheLock_. The ID is the key under which registerDecoderId
// stores a decoder's limits and under which returnDecoder looks them up.
uint32_t NVDECCache::allocDecoderId() {
  std::lock_guard<std::mutex> guard(cacheLock_);
  const uint32_t assignedId = nextId;
  ++nextId;
  return assignedId;
}

// Records the limits of a decoder (maximum width, maximum height and maximum
// number of decode surfaces) in id_context_map_ under its ID, while holding
// cacheLock_.
//
// Returns true if the ID was newly registered, or false if an entry for this
// ID already existed, in which case the existing entry is left untouched.
bool NVDECCache::registerDecoderId(
    uint32_t decoderId,
    uint32_t ulMaxWidth,
    uint32_t ulMaxHeight,
    uint8_t ulMaxNumDecodeSurfaces) {
  std::lock_guard<std::mutex> guard(cacheLock_);

  // emplace does nothing when the key is already present and reports that
  // through the bool in its result.
  const auto insertion = id_context_map_.emplace(
      decoderId,
      DecoderMaxWHContext{
          decoderId, ulMaxWidth, ulMaxHeight, ulMaxNumDecodeSurfaces});
  return insertion.second;
}


CUvideodecoderCache NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
NVDECCacheType cache_type = NVDECCacheType::Create;
CacheKey key(videoFormat);
std::lock_guard<std::mutex> lock(cacheLock_);

auto it = cache_.find(key);
if (it != cache_.end()) {
auto decoder = std::move(it->second);
cache_.erase(it);
return decoder;
if (it != cache_.end() && !it->second.empty()) {
  auto it2 = context_cache_.find(key);
  TORCH_CHECK(
      it2 != context_cache_.end(),
      "Decoder context cache inconsistency detected.");
  TORCH_CHECK(
      it->second.size() == it2->second.size(),
      "Size of cache_[key] and context_cache_[key] do not match.");

  // cache_[key] and context_cache_[key] are parallel sequences: the decoder
  // at position i is described by the context at position i.
  auto& decoders = it->second;
  auto& contexts = it2->second;

  // Removes the entry that contextIt points at from both sequences and
  // packages it for the caller.
  auto takeEntry = [&decoders, &contexts](
                       auto contextIt, NVDECCacheType type) {
    // Copy the ID out *before* erasing: erase() destroys the element, so
    // reading it through a reference afterwards would be a use-after-free.
    const auto decoderId = contextIt->second.decoderID;

    auto decoderIt = decoders.begin();
    std::advance(decoderIt, std::distance(contexts.begin(), contextIt));
    auto decoder = std::move(*decoderIt);

    decoders.erase(decoderIt);
    contexts.erase(contextIt);
    return std::make_tuple(type, std::move(decoder), decoderId);
  };

  // Pass 1: a cached decoder whose surface count and coded dimensions are
  // exactly the requested ones can be reused as is.
  for (auto ctxIt = contexts.begin(); ctxIt != contexts.end(); ++ctxIt) {
    const auto& cachedFormat = ctxIt->first;
    if (cachedFormat.numDecodeSurfaces ==
            videoFormat->min_num_decode_surfaces &&
        cachedFormat.coded_width == videoFormat->coded_width &&
        cachedFormat.coded_height == videoFormat->coded_height) {
      return takeEntry(ctxIt, NVDECCacheType::Reuse);
    }
  }

  // Pass 2: otherwise, a cached decoder whose registered maximums cover the
  // request can be handed out for reconfiguration.
  for (auto ctxIt = contexts.begin(); ctxIt != contexts.end(); ++ctxIt) {
    const auto& limits = ctxIt->second;
    if (limits.ulMaxWidth >= videoFormat->coded_width &&
        limits.ulMaxHeight >= videoFormat->coded_height &&
        limits.ulMaxNumDecodeSurfaces >=
            videoFormat->min_num_decode_surfaces) {
      return takeEntry(ctxIt, NVDECCacheType::Reconfig);
    }
  }
}

return nullptr;
return std::make_tuple(cache_type, nullptr, 0);
}

bool NVDECCache::returnDecoder(
CUVIDEOFORMAT* videoFormat,
UniqueCUvideodecoder decoder) {
UniqueCUvideodecoder decoder,
uint32_t decoderId) {
if (!decoder) {
return false;
}

CacheKey key(videoFormat);
std::lock_guard<std::mutex> lock(cacheLock_);

if (cache_.size() >= MAX_CACHE_SIZE) {
// The cache limit applies to the total number of cached decoders, so sum the
// per-key list sizes rather than counting keys.
uint32_t current_cache_size = 0;
for (const auto& pair : cache_) {
current_cache_size += pair.second.size();
}

// Cache is full: refuse the decoder and report that it was not stored.
if (current_cache_size >= MAX_CACHE_SIZE) {
return false;
}

cache_[key] = std::move(decoder);
// Look up the limits that were registered for this decoder ID (see
// registerDecoderId); returning a decoder whose ID was never registered fails
// the check below.
auto it = id_context_map_.find(decoderId);
TORCH_CHECK(
it != id_context_map_.end(),
"Decoder ID not registered in id_context_map_"
);

// Append to both per-key lists together so that the decoder and its context
// end up at the same position; getDecoder relies on that pairing.
cache_[key].push_back(std::move(decoder));
context_cache_[key].push_back(
std::make_pair(
// Format the decoder was last used with (surface count, coded size).
VideoDecodeContext{
videoFormat->min_num_decode_surfaces,
videoFormat->coded_width,
videoFormat->coded_height
},
// Limits registered for this decoder ID.
it->second
)
);

return true;
}


} // namespace facebook::torchcodec
Loading
Loading