diff --git a/.github/workflows/build_ffmpeg.yaml b/.github/workflows/build_ffmpeg.yaml
index 847c9161..833c1b41 100644
--- a/.github/workflows/build_ffmpeg.yaml
+++ b/.github/workflows/build_ffmpeg.yaml
@@ -14,6 +14,7 @@ on:
   pull_request:
     paths:
       - packaging/build_ffmpeg.sh
+      - .github/workflows/build_ffmpeg.yaml # self reference
   schedule:
     - cron: '0 0 * * 0'  # on sunday
 
diff --git a/.github/workflows/cpp_tests.yaml b/.github/workflows/cpp_tests.yaml
index b2b19a78..bc5ee0cb 100644
--- a/.github/workflows/cpp_tests.yaml
+++ b/.github/workflows/cpp_tests.yaml
@@ -4,6 +4,10 @@ on:
   push:
     branches: [ main ]
   pull_request:
+    paths:
+      - src/torchcodec/**  # '**' also matches files in nested directories
+      - test/**
+      - .github/workflows/cpp_tests.yaml # self reference
 
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 545ddf9c..e0d78114 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -4,46 +4,116 @@ on:
   push:
     branches: [ main ]
   pull_request:
+    paths:
+      - src/torchcodec/**  # '**' also matches files in nested directories
+      - docs/**
+      - .github/workflows/docs.yaml # self reference
+
+permissions:
+  id-token: write
+  contents: write
 
 defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
 jobs:
+  generate-matrix:
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    with:
+      package-type: wheel
+      os: linux
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      with-cpu: disable
+      with-xpu: disable
+      with-rocm: disable
+      with-cuda: enable
+      build-python-only: "disable"
   build:
-    runs-on: ubuntu-latest
+    needs: generate-matrix
+    strategy:
+      fail-fast: false
+    name: Build and Upload wheel
+    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
+    with:
+      repository: pytorch/torchcodec
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      post-script: packaging/post_build_script.sh
+      smoke-test-script: packaging/fake_smoke_test.py
+      package-name: torchcodec
+      trigger-event: ${{ github.event_name }}
+      build-platform: "python-build-package"
+      build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation"
+
+  build-docs:
+    runs-on: linux.4xlarge.nvidia.gpu
     strategy:
       fail-fast: false
+      matrix:
+        # 3.9 corresponds to the minimum python version for which we build
+        # the wheel unless the label ciflow/binaries/all is present in the
+        # PR.
+        python-version: ['3.9']
+        cuda-version: ['12.4']
+        ffmpeg-version-for-tests: ['7']
+    container:
+      image: "pytorch/manylinux-builder:cuda${{ matrix.cuda-version }}"
+      options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility"
+    needs: build
     steps:
-      - name: Check out repo
-        uses: actions/checkout@v3
-      - name: Setup conda env
-        uses: conda-incubator/setup-miniconda@v2
+      - name: Setup env vars
+        run: |
+          cuda_version_without_periods=$(echo "${{ matrix.cuda-version }}" | sed 's/\.//g')
+          echo cuda_version_without_periods=${cuda_version_without_periods} >> $GITHUB_ENV
+      - uses: actions/download-artifact@v3
         with:
-          auto-update-conda: true
-          miniconda-version: "latest"
-          activate-environment: test
-          python-version: '3.12'
+          name: pytorch_torchcodec__3.9_cu${{ env.cuda_version_without_periods }}_x86_64
+          path: pytorch/torchcodec/dist/
+      - name: Setup miniconda using test-infra
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: ${{ matrix.python-version }}
+          #
+          # For some reason nvidia::libnpp=12.4 doesn't install but nvidia/label/cuda-12.4.0::libnpp does.
+          # So we use the latter convention for libnpp.
+          # We install conda packages at the start because otherwise conda may have conflicts with dependencies.
+          default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}"
+      - name: Check env
+        run: |
+          ${CONDA_RUN} env
+          ${CONDA_RUN} conda info
+          ${CONDA_RUN} nvidia-smi
+          ${CONDA_RUN} conda list
+      - name: Assert ffmpeg exists
+        run: |
+          ${CONDA_RUN} ffmpeg -buildconf
       - name: Update pip
-        run: python -m pip install --upgrade pip
-      - name: Install dependencies and FFmpeg
+        run: ${CONDA_RUN} python -m pip install --upgrade pip
+      - name: Install PyTorch
         run: |
-          # TODO: torchvision and torchaudio shouldn't be needed. They were only added
-          #  to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203
-          python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
-          conda install "ffmpeg=7.0.1" pkg-config -c conda-forge
-          ffmpeg -version
-      - name: Build and install torchcodec
+          ${CONDA_RUN} python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu${{ env.cuda_version_without_periods }}
+          ${CONDA_RUN} python -c 'import torch; print(f"{torch.__version__}"); print(f"{torch.__file__}"); print(f"{torch.cuda.is_available()=}")'
+      - name: Install torchcodec from the wheel
         run: |
-          python -m pip install -e ".[dev]" --no-build-isolation -vvv
+          wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
+          echo Installing $wheel_path
+          ${CONDA_RUN} python -m pip install $wheel_path -vvv
+
+      - name: Check out repo
+        uses: actions/checkout@v3
+
       - name: Install doc dependencies
         run: |
           cd docs
-          python -m pip install -r requirements.txt
+          ${CONDA_RUN} python -m pip install -r requirements.txt
       - name: Build docs
         run: |
           cd docs
-          make html
+          ${CONDA_RUN} make html
       - uses: actions/upload-artifact@v3
         with:
           name: Built-Docs
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
index 915c5236..be4544fd 100644
--- a/.github/workflows/linux_cuda_wheel.yaml
+++ b/.github/workflows/linux_cuda_wheel.yaml
@@ -2,6 +2,12 @@ name: Build and test Linux CUDA wheels
 
 on:
   pull_request:
+    paths:
+      - src/torchcodec/**  # '**' also matches files in nested directories
+      - benchmarks/**
+      - packaging/**
+      - test/**
+      - .github/workflows/linux_cuda_wheel.yaml # self reference
   push:
     branches:
       - nightly
diff --git a/.github/workflows/linux_wheel.yaml b/.github/workflows/linux_wheel.yaml
index 38f25733..56031f78 100644
--- a/.github/workflows/linux_wheel.yaml
+++ b/.github/workflows/linux_wheel.yaml
@@ -2,6 +2,11 @@ name: Build and test Linux wheel
 
 on:
   pull_request:
+    paths:
+      - src/torchcodec/**  # '**' also matches files in nested directories
+      - packaging/**
+      - test/**
+      - .github/workflows/linux_wheel.yaml # self reference
   push:
     branches:
       - nightly
diff --git a/.github/workflows/macos_wheel.yaml b/.github/workflows/macos_wheel.yaml
index ef637194..6286aaab 100644
--- a/.github/workflows/macos_wheel.yaml
+++ b/.github/workflows/macos_wheel.yaml
@@ -2,6 +2,11 @@ name: Build and test MacOS wheel
 
 on:
   pull_request:
+    paths:
+      - src/torchcodec/**  # '**' also matches files in nested directories
+      - packaging/**
+      - test/**
+      - .github/workflows/macos_wheel.yaml # self reference
   push:
     branches:
       - nightly
diff --git a/.github/workflows/reference_resources.yaml b/.github/workflows/reference_resources.yaml
index e0414d51..c3a10fef 100644
--- a/.github/workflows/reference_resources.yaml
+++ b/.github/workflows/reference_resources.yaml
@@ -5,6 +5,7 @@ on:
   pull_request:
     paths:
       - test/generate_reference_resources.sh
+      - .github/workflows/reference_resources.yaml # self reference
   schedule:
     - cron: '0 0 * * 0'  # on sunday
 
diff --git a/README.md b/README.md
index ca0fa4f8..40603254 100644
--- a/README.md
+++ b/README.md
@@ -94,29 +94,38 @@ ffmpeg -f lavfi -i \
 
 ## Installing TorchCodec
 
-Note: if you're on MacOS, you'll need to [build from source](./CONTRIBUTING.md).
-The instructions below assume you're on Linux.
+1. Install the latest stable version of PyTorch following the
+   [official instructions](https://pytorch.org/get-started/locally/). For other
+   versions, refer to the table below for compatibility between versions of
+   `torch` and `torchcodec`.
 
-  1. Install the latest stable version of PyTorch following the
-     [official instructions](https://pytorch.org/get-started/locally/). TorchCodec
-     requires [PyTorch 2.4](https://pytorch.org/docs/2.4/).
+2. Install FFmpeg, if it's not already installed. Linux distributions usually
+   come with FFmpeg pre-installed. TorchCodec supports all major FFmpeg versions
+   in [4, 7].
 
-  2. Install FFmpeg, if it's not already installed. Your Linux distribution probably
-     comes with FFmpeg pre-installed. TorchCodec supports all major FFmpeg versions
-     in [4, 7].
+   If FFmpeg is not already installed, or you need a more recent version, an
+   easy way to install it is to use `conda`:
 
-     If FFmpeg is not already installed, or you need a later version, install it with:
+   ```bash
+   conda install ffmpeg
+   # or
+   conda install ffmpeg -c conda-forge
+   ```
 
-     ```bash
-     conda install ffmpeg
-     # or
-     conda install ffmpeg -c conda-forge
-     ```
-  3. Install TorchCodec:
+3. Install TorchCodec:
 
-     ```bash
-     pip install torchcodec
-     ```
+   ```bash
+   pip install torchcodec
+   ```
+
+The following table indicates the compatibility between versions of
+`torchcodec`, `torch` and Python.
+
+| `torchcodec`       | `torch`            | Python              |
+| ------------------ | ------------------ | ------------------- |
+| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12`   |
+| not yet supported  | `2.5`              | `>=3.9`, `<=3.12`   |
+| `0.0.3`            | `2.4`              | `>=3.8`, `<=3.12`   |
 
 ## Benchmark Results
 
@@ -134,10 +143,6 @@ encoded with libx264 and yuv420p pixel format.
 
 We are actively working on the following features:
 
-- [Ship wheels for MacOS](https://github.com/pytorch/torchcodec/issues/111), so
-  that MacOS users can `pip install torchcodec`. For now this is only supported
-  on Linux, but MacOS users can [build from source](./CONTRIBUTING.md).
-- [GPU decoding](https://github.com/pytorch/torchcodec/pull/58)
 - [Audio decoding](https://github.com/pytorch/torchcodec/issues/85)
 
 Let us know if you have any feature requests by [opening an
diff --git a/benchmarks/decoders/benchmark_decoders.py b/benchmarks/decoders/benchmark_decoders.py
index 81a8c0ea..23f45dab 100644
--- a/benchmarks/decoders/benchmark_decoders.py
+++ b/benchmarks/decoders/benchmark_decoders.py
@@ -13,6 +13,7 @@
 
 from benchmark_decoders_library import (
     AbstractDecoder,
+    BatchParameters,
     DecordAccurate,
     DecordAccurateBatch,
     plot_data,
@@ -173,6 +174,7 @@ def main() -> None:
         num_sequential_frames_from_start=[1, 10, 100],
         min_runtime_seconds=args.bm_video_speed_min_run_seconds,
         benchmark_video_creation=args.bm_video_creation,
+        batch_parameters=BatchParameters(num_threads=8, batch_size=40),
     )
     plot_data(df_data, args.plot_path)
 
diff --git a/benchmarks/decoders/benchmark_decoders_library.py b/benchmarks/decoders/benchmark_decoders_library.py
index 8d9d8a92..ae07727f 100644
--- a/benchmarks/decoders/benchmark_decoders_library.py
+++ b/benchmarks/decoders/benchmark_decoders_library.py
@@ -3,6 +3,7 @@
 import subprocess
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor, wait
+from dataclasses import dataclass
 from itertools import product
 from pathlib import Path
 
@@ -479,6 +480,43 @@ def get_metadata(video_file_path: str) -> VideoStreamMetadata:
     return VideoDecoder(video_file_path).metadata
 
 
+@dataclass
+class BatchParameters:
+    num_threads: int  # passed as max_workers to the thread pool running a batch
+    batch_size: int  # number of calls submitted to the pool for one batch
+
+
+def run_batch_using_threads(
+    function,
+    *args,
+    batch_parameters: BatchParameters = BatchParameters(num_threads=8, batch_size=40),
+):
+    """Run function(*args) batch_size times on a pool; assert each result is truthy."""
+    # The with-block shuts the pool down even if a task raises or the assert fails.
+    with ThreadPoolExecutor(max_workers=batch_parameters.num_threads) as executor:
+        futures = [
+            executor.submit(function, *args) for _ in range(batch_parameters.batch_size)
+        ]
+        assert all(f.result() for f in futures)
+
+
+def convert_result_to_df_item(
+    result, decoder_name, video_file_path, num_samples, decode_pattern
+):
+    """Build one benchmark-results row (a plain dict) from a timing ``result``.
+
+    The fps_* entries are ``num_samples`` divided by the matching timing statistic.
+    """
+    # Keys are inserted in the same fixed order on every call.
+    row = {"decoder": decoder_name, "video": str(video_file_path)}
+    row.update(description=result.description, frame_count=num_samples)
+    row.update(median=result.median, iqr=result.iqr, type=decode_pattern)
+    timings = {"median": result.median, "p75": result._p75, "p25": result._p25}
+    for stat, timing in timings.items():
+        row[f"fps_{stat}"] = num_samples / timing
+    return row
+
+
 def run_benchmarks(
     decoder_dict: dict[str, AbstractDecoder],
     video_files_paths: list[Path],
@@ -486,6 +524,7 @@ def run_benchmarks(
     num_sequential_frames_from_start: list[int],
     min_runtime_seconds: float,
     benchmark_video_creation: bool,
+    batch_parameters: BatchParameters = None,
 ) -> list[dict[str, str | float | int]]:
     # Ensure that we have the same seed across benchmark runs.
     torch.manual_seed(0)
@@ -532,18 +571,44 @@ def run_benchmarks(
                 results.append(
                     seeked_result.blocked_autorange(min_run_time=min_runtime_seconds)
                 )
-                df_item = {}
-                df_item["decoder"] = decoder_name
-                df_item["video"] = str(video_file_path)
-                df_item["description"] = results[-1].description
-                df_item["frame_count"] = num_samples
-                df_item["median"] = results[-1].median
-                df_item["iqr"] = results[-1].iqr
-                df_item["type"] = f"{kind}:seek()+next()"
-                df_item["fps_median"] = num_samples / results[-1].median
-                df_item["fps_p75"] = num_samples / results[-1]._p75
-                df_item["fps_p25"] = num_samples / results[-1]._p25
-                df_data.append(df_item)
+                df_data.append(
+                    convert_result_to_df_item(
+                        results[-1],
+                        decoder_name,
+                        video_file_path,
+                        num_samples,
+                        f"{kind} seek()+next()",
+                    )
+                )
+
+                if batch_parameters:
+                    seeked_result = benchmark.Timer(
+                        stmt="run_batch_using_threads(decoder.get_frames_from_video, video_file, pts_list, batch_parameters=batch_parameters)",
+                        globals={
+                            "video_file": str(video_file_path),
+                            "pts_list": pts_list,
+                            "decoder": decoder,
+                            "run_batch_using_threads": run_batch_using_threads,
+                            "batch_parameters": batch_parameters,
+                        },
+                        label=f"video={video_file_path} {metadata_label}",
+                        sub_label=decoder_name,
+                        description=f"batch {kind} {num_samples} seek()+next()",
+                    )
+                    results.append(
+                        seeked_result.blocked_autorange(
+                            min_run_time=min_runtime_seconds
+                        )
+                    )
+                    df_data.append(
+                        convert_result_to_df_item(
+                            results[-1],
+                            decoder_name,
+                            video_file_path,
+                            num_samples * batch_parameters.batch_size,
+                            f"batch {kind} seek()+next()",
+                        )
+                    )
 
             for num_consecutive_nexts in num_sequential_frames_from_start:
                 consecutive_frames_result = benchmark.Timer(
@@ -562,18 +627,44 @@ def run_benchmarks(
                         min_run_time=min_runtime_seconds
                     )
                 )
-                df_item = {}
-                df_item["decoder"] = decoder_name
-                df_item["video"] = str(video_file_path)
-                df_item["description"] = results[-1].description
-                df_item["frame_count"] = num_consecutive_nexts
-                df_item["median"] = results[-1].median
-                df_item["iqr"] = results[-1].iqr
-                df_item["type"] = "next()"
-                df_item["fps_median"] = num_consecutive_nexts / results[-1].median
-                df_item["fps_p75"] = num_consecutive_nexts / results[-1]._p75
-                df_item["fps_p25"] = num_consecutive_nexts / results[-1]._p25
-                df_data.append(df_item)
+                df_data.append(
+                    convert_result_to_df_item(
+                        results[-1],
+                        decoder_name,
+                        video_file_path,
+                        num_consecutive_nexts,
+                        f"{num_consecutive_nexts} next()",
+                    )
+                )
+
+                if batch_parameters:
+                    consecutive_frames_result = benchmark.Timer(
+                        stmt="run_batch_using_threads(decoder.get_consecutive_frames_from_video, video_file, consecutive_frames_to_extract, batch_parameters=batch_parameters)",
+                        globals={
+                            "video_file": str(video_file_path),
+                            "consecutive_frames_to_extract": num_consecutive_nexts,
+                            "decoder": decoder,
+                            "run_batch_using_threads": run_batch_using_threads,
+                            "batch_parameters": batch_parameters,
+                        },
+                        label=f"video={video_file_path} {metadata_label}",
+                        sub_label=decoder_name,
+                        description=f"batch {num_consecutive_nexts} next()",
+                    )
+                    results.append(
+                        consecutive_frames_result.blocked_autorange(
+                            min_run_time=min_runtime_seconds
+                        )
+                    )
+                    df_data.append(
+                        convert_result_to_df_item(
+                            results[-1],
+                            decoder_name,
+                            video_file_path,
+                            num_consecutive_nexts * batch_parameters.batch_size,
+                            f"batch {num_consecutive_nexts} next()",
+                        )
+                    )
 
         first_video_file_path = video_files_paths[0]
         if benchmark_video_creation:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 20d6db90..c882dc48 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -29,7 +29,7 @@ We achieve these capabilities through:
      .. grid-item-card:: :octicon:`file-code;1em`
         Installation instructions
         :img-top: _static/img/card-background.svg
-        :link: install_instructions.html
+        :link: https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec
         :link-type: url
 
         How to install TorchCodec
@@ -50,6 +50,14 @@ We achieve these capabilities through:
 
         How to sample video clips
 
+     .. grid-item-card:: :octicon:`file-code;1em`
+        GPU decoding using TorchCodec
+        :img-top: _static/img/card-background.svg
+        :link: generated_examples/basic_cuda_example.html
+        :link-type: url
+
+        A simple example demonstrating CUDA GPU decoding
+
 .. toctree::
    :maxdepth: 1
    :caption: TorchCodec documentation
@@ -63,7 +71,7 @@ We achieve these capabilities through:
    :caption: Examples and tutorials
    :hidden:
 
-   install_instructions
+   Installation instructions <https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec>
    generated_examples/index
 
 
diff --git a/docs/source/install_instructions.rst b/docs/source/install_instructions.rst
deleted file mode 100644
index 1242c24b..00000000
--- a/docs/source/install_instructions.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Installation Instructions
-=========================
-
-.. note::
-    TorchCodec is only available on Linux for now. We plan to support other
-    platforms in the future.
-
-There are three steps to installing TorchCodec:
-
-1. Install the latest stable version of PyTorch following the
-   `official instructions <https://pytorch.org/get-started/locally/>`_. TorchCodec
-   requires `PyTorch 2.4 <https://pytorch.org/docs/2.4/>`_.
-
-2. Install FFmpeg, if it's not already installed. Your Linux distribution probably
-   comes with FFmpeg pre-installed. TorchCodec supports all major FFmpeg versions
-   in [4, 7]. If FFmpeg is not already installed, or you need a later version, install
-   it with:
-
-   .. code:: bash
-
-      conda install ffmpeg
-      # or
-      conda install ffmpeg -c conda-forge
-3. Install TorchCodec:
-
-   .. code:: bash
-
-      pip install torchcodec
-
-Note that installation instructions may slightly change over time. The most
-up-to-date instructions should be available from the `README
-<https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec>`_.
diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py
new file mode 100644
index 00000000..5ff85e8e
--- /dev/null
+++ b/examples/basic_cuda_example.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Accelerated video decoding on GPUs with CUDA and NVDEC
+================================================================
+
+TorchCodec can use supported Nvidia hardware (see support matrix
+`here <https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new>`_) to speed-up
+video decoding. This is called "CUDA Decoding" and it uses Nvidia's
+`NVDEC hardware decoder <https://developer.nvidia.com/video-codec-sdk>`_
+and CUDA kernels to respectively decompress and convert to RGB.
+CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for
+subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves
+the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before
+running the transform steps. Encoded packets are often much smaller than decoded frames so
+CUDA decoding also uses less PCI-e bandwidth.
+
+CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios:
+
+#. You are decoding a large resolution video
+#. You are decoding a large batch of videos that's saturating the CPU
+#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors
+   after decoding
+#. Your CPU is saturated and you want to free it up for other work
+
+
+Here are situations where CUDA Decoding may not make sense:
+
+#. You want bit-exact results compared to CPU Decoding
+#. You have small resolution videos and the PCI-e transfer latency is large
+#. Your GPU is already busy and CPU is not
+
+It's best to experiment with CUDA Decoding to see if it improves your use-case. With
+TorchCodec you can simply pass in a device parameter to the
+:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding.
+
+
+In order to use CUDA Decoding you will need the following installed in your environment:
+
+#. An Nvidia GPU that supports decoding the video format you want to decode. See
+   the support matrix `here <https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new>`_
+#. `CUDA-enabled pytorch <https://pytorch.org/get-started/locally/>`_
+#. FFmpeg binaries that support
+   `NVDEC-enabled <https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html>`_
+   codecs
+#. libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit)
+
+
+FFmpeg versions 5, 6 and 7 from conda-forge are built with
+`NVDEC support <https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html>`_
+and you can install them with conda. For example, to install FFmpeg version 7:
+
+
+.. code-block:: bash
+
+    conda install ffmpeg=7 -c conda-forge
+    conda install libnpp cuda-nvrtc -c nvidia
+
+
+"""
+
+# %%
+# Checking if Pytorch has CUDA enabled
+# -------------------------------------
+#
+# .. note::
+#
+#    This tutorial requires FFmpeg libraries compiled with CUDA support.
+#
+#
+import torch
+
+print(f"{torch.__version__=}")
+print(f"{torch.cuda.is_available()=}")
+print(f"{torch.cuda.get_device_properties(0)=}")
+
+
+# %%
+# Downloading the video
+# -------------------------------------
+#
+# We will use the following video which has the following properties:
+#
+# - Codec: H.264
+# - Resolution: 960x540
+# - FPS: 29.97
+# - Pixel format: YUV420P
+#
+# .. raw:: html
+#
+#    <video style="max-width: 100%" controls>
+#      <source src="https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" type="video/mp4">
+#    </video>
+import urllib.request
+
+video_file = "video.mp4"
+urllib.request.urlretrieve(
+    "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4",
+    video_file,
+)
+
+
+# %%
+# CUDA Decoding using VideoDecoder
+# -------------------------------------
+#
+# To use CUDA decoder, you need to pass in a cuda device to the decoder.
+#
+from torchcodec.decoders import VideoDecoder
+
+decoder = VideoDecoder(video_file, device="cuda")
+frame = decoder[0]
+
+# %%
+#
+# The video frames are decoded and returned as tensor of NCHW format.
+
+print(frame.shape, frame.dtype)
+
+# %%
+#
+# The video frames are left on the GPU memory.
+
+print(frame.data.device)
+
+
+# %%
+# Visualizing Frames
+# -------------------------------------
+#
+# Let's look at the frames decoded by CUDA decoder and compare them
+# against equivalent results from the CPU decoders.
+timestamps = [12, 19, 45, 131, 180]
+cpu_decoder = VideoDecoder(video_file, device="cpu")
+cuda_decoder = VideoDecoder(video_file, device="cuda")
+cpu_frames = cpu_decoder.get_frames_played_at(timestamps).data
+cuda_frames = cuda_decoder.get_frames_played_at(timestamps).data
+
+
+def plot_cpu_and_cuda_frames(cpu_frames: torch.Tensor, cuda_frames: torch.Tensor):
+    """Plot CPU-decoded (left) and CUDA-decoded (right) frames, one pair per row."""
+    try:
+        import matplotlib.pyplot as plt
+        from torchvision.transforms.v2.functional import to_pil_image
+    except ImportError:
+        print("Cannot plot, please run `pip install torchvision matplotlib`")
+        return
+    n_rows = len(cpu_frames)  # derived from the argument, not a module global
+    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
+    fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0], squeeze=False)
+    for i in range(n_rows):
+        axes[i][0].imshow(to_pil_image(cpu_frames[i].to("cpu")))
+        axes[i][1].imshow(to_pil_image(cuda_frames[i].to("cpu")))
+    axes[0][0].set_title("CPU decoder", fontsize=24)
+    axes[0][1].set_title("CUDA decoder", fontsize=24)
+    plt.setp(axes, xticks=[], yticks=[])
+    plt.tight_layout()
+
+
+plot_cpu_and_cuda_frames(cpu_frames, cuda_frames)
+
+# %%
+#
+# They look visually similar to the human eye but there may be subtle
+# differences because CUDA math is not bit-exact with respect to CPU math.
+#
+cpu_frames_on_gpu = cpu_frames.to("cuda")  # copy to the GPU once, reuse below
+frames_equal = torch.equal(cpu_frames_on_gpu, cuda_frames)
+abs_diff = torch.abs(cpu_frames_on_gpu.float() - cuda_frames.float())  # computed once
+mean_abs_diff = torch.mean(abs_diff)
+max_abs_diff = torch.max(abs_diff)
+print(f"{frames_equal=}")
+print(f"{mean_abs_diff=}")
+print(f"{max_abs_diff=}")
diff --git a/src/torchcodec/_frame.py b/src/torchcodec/_frame.py
index b9e0fd57..a8fc7f5b 100644
--- a/src/torchcodec/_frame.py
+++ b/src/torchcodec/_frame.py
@@ -62,6 +62,10 @@ class FrameBatch(Iterable):
     or 5D for sequences of clips, as returned by the :ref:`samplers
     <sphx_glr_generated_examples_sampling.py>`. When ``data`` is 4D (resp.  5D)
     the ``pts_seconds`` and ``duration_seconds`` tensors are 1D (resp. 2D).
+
+    .. note::
+        The ``pts_seconds`` and ``duration_seconds`` Tensors are always returned
+        on CPU, even if ``data`` is on GPU.
     """
 
     data: Tensor
diff --git a/src/torchcodec/decoders/_core/video_decoder_ops.py b/src/torchcodec/decoders/_core/video_decoder_ops.py
index d4102ae5..d3f8e9a6 100644
--- a/src/torchcodec/decoders/_core/video_decoder_ops.py
+++ b/src/torchcodec/decoders/_core/video_decoder_ops.py
@@ -42,10 +42,13 @@ def load_torchcodec_extension():
         + "\n[end of libtorchcodec loading traceback]."
     )
     raise RuntimeError(
-        """Could not load libtorchcodec. Likely causes:
+        f"""Could not load libtorchcodec. Likely causes:
           1. FFmpeg is not properly installed in your environment. We support
-             verisons 4, 5, 6 and 7.
-          2. PyTorch 2.4 is not properly installed in your environment.
+             versions 4, 5, 6 and 7.
+          2. The PyTorch version ({torch.__version__}) is not compatible with
+             this version of TorchCodec. Refer to the version compatibility
+             table:
+             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
           3. Another runtime dependency; see exceptions below.
         The following exceptions were raised as we tried to load libtorchcodec:
         """