|
| 1 | +# Copyright (c) Meta Platforms, Inc. and affiliates. |
| 2 | +# All rights reserved. |
| 3 | +# |
| 4 | +# This source code is licensed under the BSD-style license found in the |
| 5 | +# LICENSE file in the root directory of this source tree. |
| 6 | +""" |
| 7 | +Accelerated video decoding on GPUs with CUDA and NVDEC |
| 8 | +================================================================ |
| 9 | +
|
| 10 | +TorchCodec can use supported Nvidia hardware (see support matrix |
| 11 | +`here <https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new>`_) to speed-up |
| 12 | +video decoding. This is called "CUDA Decoding" and it uses Nvidia's |
| 13 | +`NVDEC hardware decoder <https://developer.nvidia.com/video-codec-sdk>`_ |
| 14 | +and CUDA kernels to respectively decompress and convert to RGB. |
| 15 | +CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for |
| 16 | +subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves |
| 17 | +the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before |
| 18 | +running the transform steps. Encoded packets are often much smaller than decoded frames so |
| 19 | +CUDA decoding also uses less PCI-e bandwidth. |
| 20 | +
|
| 21 | +CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: |
| 22 | +
|
| 23 | +#. You are decoding a large resolution video |
| 24 | +#. You are decoding a large batch of videos that's saturating the CPU |
| 25 | +#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors |
| 26 | + after decoding |
| 27 | +#. Your CPU is saturated and you want to free it up for other work |
| 28 | +
|
| 29 | +
|
| 30 | +Here are situations where CUDA Decoding may not make sense: |
| 31 | +
|
| 32 | +#. You want bit-exact results compared to CPU Decoding |
| 33 | +#. You have small resolution videos and the PCI-e transfer latency is large |
| 34 | +#. Your GPU is already busy and CPU is not |
| 35 | +
|
| 36 | +It's best to experiment with CUDA Decoding to see if it improves your use-case. With |
| 37 | +TorchCodec you can simply pass in a device parameter to the |
| 38 | +:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. |
| 39 | +
|
| 40 | +
|
In order to use CUDA Decoding, you will need the following installed in your environment:
| 42 | +
|
| 43 | +#. An Nvidia GPU that supports decoding the video format you want to decode. See |
| 44 | + the support matrix `here <https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new>`_ |
| 45 | +#. `CUDA-enabled pytorch <https://pytorch.org/get-started/locally/>`_ |
| 46 | +#. FFmpeg binaries that support |
| 47 | + `NVDEC-enabled <https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html>`_ |
| 48 | + codecs |
| 49 | +#. libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit) |
| 50 | +
|
| 51 | +
|
| 52 | +FFmpeg versions 5, 6 and 7 from conda-forge are built with |
| 53 | +`NVDEC support <https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html>`_ |
| 54 | +and you can install them with conda. For example, to install FFmpeg version 7: |
| 55 | +
|
| 56 | +
|
| 57 | +.. code-block:: bash |
| 58 | +
|
| 59 | + conda install ffmpeg=7 -c conda-forge |
| 60 | + conda install libnpp cuda-nvrtc -c nvidia |
| 61 | +
|
| 62 | +
|
| 63 | +""" |
| 64 | + |
# %%
# Checking if PyTorch has CUDA enabled
# -------------------------------------
#
# .. note::
#
#    This tutorial requires FFmpeg libraries compiled with CUDA support.
#
#
import torch

# Print the installed PyTorch version, whether a CUDA runtime is visible,
# and the properties of the first GPU (raises if no CUDA device is present).
print(f"{torch.__version__=}")
print(f"{torch.cuda.is_available()=}")
print(f"{torch.cuda.get_device_properties(0)=}")
| 80 | + |
# %%
# Downloading the video
# -------------------------------------
#
# We will use the following video which has the following properties:
#
# - Codec: H.264
# - Resolution: 960x540
# - FPS: 29.97
# - Pixel format: YUV420P
#
# .. raw:: html
#
#    <video style="max-width: 100%" controls>
#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
#    </video>
import urllib.request

video_file = "video.mp4"
_VIDEO_URL = (
    "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/"
    "NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4"
)
# Fetch the tutorial asset once into the current working directory.
urllib.request.urlretrieve(_VIDEO_URL, video_file)
| 104 | + |
| 105 | + |
# %%
# CUDA Decoding using VideoDecoder
# -------------------------------------
#
# To use the CUDA decoder, pass a CUDA device to the decoder.
#
from torchcodec.decoders import VideoDecoder

# Decoding and color conversion both happen on the GPU when device="cuda".
decoder = VideoDecoder(video_file, device="cuda")
frame = decoder[0]

# %%
#
# The video frames are decoded and returned as tensor of NCHW format.

print(frame.shape, frame.dtype)

# %%
#
# The video frames are left on the GPU memory.

print(frame.data.device)
| 129 | + |
# %%
# Visualizing Frames
# -------------------------------------
#
# Let's look at the frames decoded by CUDA decoder and compare them
# against equivalent results from the CPU decoders.
timestamps = [12, 19, 45, 131, 180]
# Decode the same presentation timestamps once on each device so the
# results can be compared frame-by-frame below.
cpu_frames = VideoDecoder(video_file, device="cpu").get_frames_played_at(timestamps).data
cuda_frames = VideoDecoder(video_file, device="cuda").get_frames_played_at(timestamps).data
| 141 | + |
| 142 | + |
def plot_cpu_and_cuda_frames(cpu_frames: torch.Tensor, cuda_frames: torch.Tensor) -> None:
    """Plot CPU-decoded and CUDA-decoded frames side by side, one row per frame.

    Args:
        cpu_frames: batch of frames decoded on the CPU (N, C, H, W).
        cuda_frames: batch of frames decoded on the GPU (N, C, H, W),
            assumed to have the same batch length as ``cpu_frames``.

    Prints an installation hint and returns early if matplotlib or
    torchvision is not installed.
    """
    try:
        import matplotlib.pyplot as plt
        from torchvision.transforms.v2.functional import to_pil_image
    except ImportError:
        print("Cannot plot, please run `pip install torchvision matplotlib`")
        return
    # Fix: derive the row count from the input batch instead of the
    # module-level `timestamps` global, so the function works for any
    # pair of frame batches, not just the one built in this tutorial.
    n_rows = len(cpu_frames)
    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0])
    for i in range(n_rows):
        # to_pil_image needs CPU tensors, so move GPU frames over first.
        axes[i][0].imshow(to_pil_image(cpu_frames[i].to("cpu")))
        axes[i][1].imshow(to_pil_image(cuda_frames[i].to("cpu")))

    axes[0][0].set_title("CPU decoder", fontsize=24)
    axes[0][1].set_title("CUDA decoder", fontsize=24)
    plt.setp(axes, xticks=[], yticks=[])
    plt.tight_layout()
| 160 | + |
| 161 | + |
plot_cpu_and_cuda_frames(cpu_frames, cuda_frames)

# %%
#
# They look visually similar to the human eye but there may be subtle
# differences because CUDA math is not bit-exact with respect to CPU math.
#
# Move the CPU frames onto the GPU once, then compute all three metrics
# from a single absolute-difference tensor.
cpu_frames_on_gpu = cpu_frames.to("cuda")
abs_diff = (cpu_frames_on_gpu.float() - cuda_frames.float()).abs()
frames_equal = torch.equal(cpu_frames_on_gpu, cuda_frames)
mean_abs_diff = abs_diff.mean()
max_abs_diff = abs_diff.max()
print(f"{frames_equal=}")
print(f"{mean_abs_diff=}")
print(f"{max_abs_diff=}")
0 commit comments