|
| 1 | +# Copyright (c) Meta Platforms, Inc. and affiliates. |
| 2 | +# All rights reserved. |
| 3 | +# |
| 4 | +# This source code is licensed under the BSD-style license found in the |
| 5 | +# LICENSE file in the root directory of this source tree. |
| 6 | + |
| 7 | +""" |
| 8 | +=================================================================== |
| 9 | +Streaming data through file-like support |
| 10 | +=================================================================== |
| 11 | +
|
| 12 | +In this example, we will show how to decode streaming data. That is, when files |
| 13 | +do not reside locally, we will show how to only download the data segments that |
| 14 | +are needed to decode the frames you care about. We accomplish this capability |
| 15 | +with Python |
| 16 | +`file-like objects <https://docs.python.org/3/glossary.html#term-file-like-object>`_. |
| 17 | +Our example uses a video file, so we use the :class:`~torchcodec.decoders.VideoDecoder` |
| 18 | +class to decode it. But all of the lessons here also apply to audio files and the |
| 19 | +:class:`~torchcodec.decoders.AudioDecoder` class as well.""" |
| 20 | + |
| 21 | +# %% |
| 22 | +# First, a bit of boilerplate. We define two functions: one to download content |
| 23 | +# from a given URL, and another to time the execution of a given function. |
| 24 | + |
| 25 | + |
| 26 | +import torch |
| 27 | +import requests |
| 28 | +from time import perf_counter_ns |
| 29 | + |
| 30 | + |
def get_url_content(url):
    """Download *url* and return the raw response body as bytes.

    Raises RuntimeError when the server answers with anything but HTTP 200.
    """
    response = requests.get(url, headers={"User-Agent": ""})
    if response.status_code == 200:
        return response.content
    raise RuntimeError(f"Failed to download video. {response.status_code = }.")
| 36 | + |
| 37 | + |
def bench(f, average_over=10, warmup=2):
    """Benchmark callable ``f`` and print its timing statistics.

    Runs ``warmup`` untimed calls first (to exclude one-time costs such as
    caches filling up), then times ``average_over`` calls and prints the
    median and standard deviation in milliseconds.
    """
    for _ in range(warmup):
        f()

    times = []
    for _ in range(average_over):
        start = perf_counter_ns()
        f()
        end = perf_counter_ns()
        times.append(end - start)

    times = torch.tensor(times) * 1e-6  # ns to ms
    std = times.std().item()
    med = times.median().item()
    # Fix: report the unit on the std-dev too, matching the median.
    print(f"{med = :.2f}ms +- {std:.2f}ms")
| 53 | + |
| 54 | + |
| 55 | +# %% |
| 56 | +# Performance: downloading first vs. streaming |
| 57 | +# -------------------------------------------- |
| 58 | +# |
| 59 | +# We are going to investigate the cost of having to download an entire video |
| 60 | +# before decoding any frames versus being able to stream the video's data |
# while decoding. To demonstrate an extreme case, we're going to always decode
| 62 | +# just the first frame of the video, while we vary how we get that video's |
| 63 | +# data. |
| 64 | +# |
| 65 | +# The video we're going to use in this tutorial is publicly available on the |
| 66 | +# internet. We perform an initial download of it so that we can understand |
| 67 | +# its size and content: |
| 68 | + |
from torchcodec.decoders import VideoDecoder

# Publicly-hosted NASA video used throughout this tutorial.
nasa_url = "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4"

# One up-front download: it lets us inspect the video's size and metadata, and
# the in-memory bytes serve as the zero-download-cost baseline for the
# benchmarks below.
pre_downloaded_raw_video_bytes = get_url_content(nasa_url)
decoder = VideoDecoder(pre_downloaded_raw_video_bytes)

print(f"Video size in MB: {len(pre_downloaded_raw_video_bytes) // 1024 // 1024}")
print(decoder.metadata)
| 78 | + |
| 79 | +# %% |
| 80 | +# We can see that the video is about 253 MB, has the resolution 1920x1080, is |
| 81 | +# about 30 frames per second and is almost 3 and a half minutes long. As we |
| 82 | +# only want to decode the first frame, we would clearly benefit from not having |
| 83 | +# to download the entire video! |
| 84 | +# |
| 85 | +# Let's first test three scenarios: |
| 86 | +# |
| 87 | +# 1. Decode from the *existing* video we just downloaded. This is our baseline |
| 88 | +# performance, as we've reduced the downloading cost to 0. |
| 89 | +# 2. Download the entire video before decoding. This is the worst case |
| 90 | +# that we want to avoid. |
# 3. Provide the URL directly to the :class:`~torchcodec.decoders.VideoDecoder` class, which will pass
| 92 | +# the URL on to FFmpeg. Then FFmpeg will decide how much of the video to |
| 93 | +# download before decoding. |
| 94 | +# |
| 95 | +# Note that in our scenarios, we are always setting the ``seek_mode`` parameter of |
| 96 | +# the :class:`~torchcodec.decoders.VideoDecoder` class to ``"approximate"``. We do |
| 97 | +# this to avoid scanning the entire video during initialization, which would |
| 98 | +# require downloading the entire video even if we only want to decode the first |
| 99 | +# frame. See :ref:`sphx_glr_generated_examples_approximate_mode.py` for more. |
| 100 | + |
| 101 | + |
def decode_from_existing_download():
    """Baseline: decode frame 0 from bytes that are already in memory."""
    in_memory_decoder = VideoDecoder(
        source=pre_downloaded_raw_video_bytes, seek_mode="approximate"
    )
    return in_memory_decoder[0]
| 108 | + |
| 109 | + |
def download_before_decode():
    """Worst case: download the entire video, then decode frame 0."""
    freshly_downloaded = get_url_content(nasa_url)
    return VideoDecoder(source=freshly_downloaded, seek_mode="approximate")[0]
| 117 | + |
| 118 | + |
def direct_url_to_ffmpeg():
    """Hand the URL straight to FFmpeg (via VideoDecoder) and decode frame 0."""
    return VideoDecoder(source=nasa_url, seek_mode="approximate")[0]
| 125 | + |
| 126 | + |
# Run each scenario through the benchmark harness, separating the reports
# with a blank line.
scenarios = [
    ("Decode from existing download:", decode_from_existing_download),
    ("Download before decode:", download_before_decode),
    ("Direct url to FFmpeg:", direct_url_to_ffmpeg),
]
for index, (label, scenario) in enumerate(scenarios):
    if index > 0:
        print()
    print(label)
    bench(scenario)
| 137 | + |
| 138 | +# %% |
| 139 | +# Decoding the already downloaded video is clearly the fastest. Having to |
| 140 | +# download the entire video each time we want to decode just the first frame |
| 141 | +# is many times slower than decoding an existing video. Providing a direct URL |
| 142 | +# is much better, but we're still probably downloading more than we need to. |
| 143 | +# |
| 144 | +# We can do better, and the way how is to use a file-like object which |
| 145 | +# implements its own read and seek methods that only download data from a URL as |
| 146 | +# needed. Rather than implementing our own, we can use such objects from the |
| 147 | +# `fsspec <https://github.com/fsspec/filesystem_spec>`_ module that provides |
| 148 | +# `Filesystem interfaces for Python <https://filesystem-spec.readthedocs.io/en/latest/?badge=latest>`_. |
| 149 | +# Note that using these capabilities from the fsspec library also requires the |
| 150 | +# `aiohttp <https://docs.aiohttp.org/en/stable/>`_ module. You can install both with |
| 151 | +# `pip install fsspec aiohttp`. |
| 152 | + |
| 153 | +import fsspec |
| 154 | + |
| 155 | + |
def stream_while_decode():
    """Decode frame 0 while streaming the data through an fsspec file-like."""
    # The `client_kwargs` are passed down to the aiohttp module's client
    # session; we need to indicate that we need to trust the environment
    # settings for proxy configuration. Depending on your environment, you may
    # not need this setting.
    http_kwargs = {"trust_env": True}
    with fsspec.open(nasa_url, client_kwargs=http_kwargs) as file_like:
        streaming_decoder = VideoDecoder(file_like, seek_mode="approximate")
        return streaming_decoder[0]
| 164 | + |
| 165 | + |
# Benchmark the streaming approach so we can compare it against the three
# scenarios measured above.
print("Stream while decode: ")
bench(stream_while_decode)
| 168 | + |
| 169 | +# %% |
| 170 | +# Streaming the data through a file-like object is much faster than |
# downloading the video first. And not only is it faster than
| 172 | +# providing a direct URL, it's more general. :class:`~torchcodec.decoders.VideoDecoder` supports |
| 173 | +# direct URLs because the underlying FFmpeg functions support them. But the |
| 174 | +# kinds of protocols supported are determined by what that version of FFmpeg |
| 175 | +# supports. A file-like object can adapt any kind of resource, including ones |
| 176 | +# that are specific to your own infrastructure and are unknown to FFmpeg. |
| 177 | + |
| 178 | + |
| 179 | +# %% |
| 180 | +# How it works |
| 181 | +# ------------ |
| 182 | +# In Python, a `file-like object <https://docs.python.org/3/glossary.html#term-file-like-object>`_ |
| 183 | +# is any object that exposes special methods for reading, writing and seeking. |
| 184 | +# While such methods are obviously file oriented, it's not required that |
| 185 | +# a file-like object is backed by an actual file. As far as Python is concerned, |
| 186 | +# if an object acts like a file, it's a file. This is a powerful concept, as |
| 187 | +# it enables libraries that read or write data to assume a file-like interface. |
| 188 | +# Other libraries that present novel resources can then be easily used by |
| 189 | +# providing a file-like wrapper for their resource. |
| 190 | +# |
| 191 | +# For our case, we only need the read and seek methods for decoding. The exact |
| 192 | +# method signature needed is in the example below. Rather than wrap a novel |
| 193 | +# resource, we demonstrate this capability by wrapping an actual file while |
| 194 | +# counting how often each method is called. |
| 195 | + |
from pathlib import Path
import tempfile

# Create a local file to interact with: write the already-downloaded bytes
# into a fresh temporary directory (cleaned up at the end of the tutorial).
temp_dir = tempfile.mkdtemp()
nasa_video_path = Path(temp_dir) / "nasa_video.mp4"
with open(nasa_video_path, "wb") as f:
    f.write(pre_downloaded_raw_video_bytes)
| 204 | + |
| 205 | + |
| 206 | +# A file-like class that is backed by an actual file, but it intercepts reads |
| 207 | +# and seeks to maintain counts. |
class FileOpCounter:
    """File-like wrapper that counts the reads and seeks passing through it.

    It is backed by an actual open file object, but intercepts ``read`` and
    ``seek`` — the only two methods the decoder needs — to maintain counts.
    """

    def __init__(self, file):
        self._file = file
        self.num_reads = 0  # number of read() calls observed so far
        self.num_seeks = 0  # number of seek() calls observed so far

    def read(self, size: int) -> bytes:
        self.num_reads += 1
        return self._file.read(size)

    # Fix: file.seek() returns the new absolute position as an int, not bytes.
    def seek(self, offset: int, whence: int) -> int:
        self.num_seeks += 1
        return self._file.seek(offset, whence)
| 221 | + |
| 222 | + |
| 223 | +# Let's now get a file-like object from our class defined above, providing it a |
| 224 | +# reference to the file we created. We pass our file-like object to the decoder |
| 225 | +# rather than the file itself. |
| 226 | +file_op_counter = FileOpCounter(open(nasa_video_path, "rb")) |
| 227 | +counter_decoder = VideoDecoder(file_op_counter, seek_mode="approximate") |
| 228 | + |
| 229 | +print("Decoder initialization required " |
| 230 | + f"{file_op_counter.num_reads} reads and " |
| 231 | + f"{file_op_counter.num_seeks} seeks.") |
| 232 | + |
| 233 | +init_reads = file_op_counter.num_reads |
| 234 | +init_seeks = file_op_counter.num_seeks |
| 235 | + |
| 236 | +first_frame = counter_decoder[0] |
| 237 | + |
| 238 | +print("Decoding the first frame required " |
| 239 | + f"{file_op_counter.num_reads - init_reads} additional reads and " |
| 240 | + f"{file_op_counter.num_seeks - init_seeks} additional seeks.") |
| 241 | + |
| 242 | +# %% |
| 243 | +# While we defined a simple class primarily for demonstration, it's actually |
| 244 | +# useful for diagnosing how much reading and seeking are required for different |
| 245 | +# decoding operations. We've also introduced a mystery that we should answer: |
| 246 | +# why does *initializing* the decoder take more reads and seeks than decoding |
| 247 | +# the first frame? The answer is that in our decoder implementation, we're |
| 248 | +# actually calling a special |
| 249 | +# `FFmpeg function <https://ffmpeg.org/doxygen/6.1/group__lavf__decoding.html#gad42172e27cddafb81096939783b157bb>`_ |
| 250 | +# that decodes the first few frames to return more robust metadata. |
| 251 | +# |
| 252 | +# It's also worth noting that the Python file-like interface is only half of |
| 253 | +# the story. FFmpeg also has its own mechanism for directing reads and seeks |
# during decoding to user-defined functions. The
| 255 | +# :class:`~torchcodec.decoders.VideoDecoder` object does the work of |
| 256 | +# connecting the Python methods you define to FFmpeg. All you have to do is |
| 257 | +# define your methods in Python, and we do the rest. |
| 258 | + |
| 259 | +# %% |
| 260 | +# Performance: local file path vs. local file-like object |
| 261 | +# ------------------------------------------------------- |
| 262 | +# |
| 263 | +# Since we have a local file defined, let's do a bonus performance test. We now |
| 264 | +# have two means of providing a local file to :class:`~torchcodec.decoders.VideoDecoder`: |
| 265 | +# |
| 266 | +# 1. Through a *path*, where the :class:`~torchcodec.decoders.VideoDecoder` |
| 267 | +# object will then do the work of opening the local file at that path. |
| 268 | +# 2. Through a *file-like object*, where you open the file yourself and provide |
| 269 | +# the file-like object to :class:`~torchcodec.decoders.VideoDecoder`. |
| 270 | +# |
| 271 | +# An obvious question is: which is faster? The code below tests that question. |
| 272 | + |
| 273 | + |
def decode_from_existing_file_path():
    """Decode frame 0, letting VideoDecoder open the local file path itself."""
    return VideoDecoder(nasa_video_path, seek_mode="approximate")[0]
| 277 | + |
| 278 | + |
def decode_from_existing_open_file_object():
    """Decode frame 0 from a file object that we opened ourselves."""
    with open(nasa_video_path, "rb") as video_file:
        return VideoDecoder(video_file, seek_mode="approximate")[0]
| 283 | + |
| 284 | + |
# Benchmark both ways of handing a local file to VideoDecoder: by path and by
# an already-open file object.
print("Decode from existing file path:")
bench(decode_from_existing_file_path)
print()

print("Decode from existing open file object:")
bench(decode_from_existing_open_file_object)
| 291 | + |
| 292 | +# %% |
| 293 | +# Thankfully, the answer is both means of decoding from a local file take about |
| 294 | +# the same amount of time. This result means that in your own code, you can use |
# whichever method is more convenient. What this result implies is that the
| 296 | +# cost of actually reading and copying data dominates the cost of calling Python |
| 297 | +# methods while decoding. |
| 298 | + |
| 299 | +# %% |
| 300 | +# Finally, let's clean up the local resources we created. |
import shutil
# Remove the temporary directory (and the local video file inside it).
shutil.rmtree(temp_dir)
| 303 | +# %% |
0 commit comments