Skip to content

Commit fda54c9

Browse files
authored
File like tutorial (#638)
1 parent e70a98b commit fda54c9

File tree

4 files changed

+311
-4
lines changed

4 files changed

+311
-4
lines changed

docs/requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ sphinx-tabs
66
matplotlib
77
torchvision
88
ipython
9+
fsspec
10+
aiohttp
911
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

examples/file_like.py

+303
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
===================================================================
9+
Streaming data through file-like support
10+
===================================================================
11+
12+
In this example, we will show how to decode streaming data. That is, when files
13+
do not reside locally, we will show how to only download the data segments that
14+
are needed to decode the frames you care about. We accomplish this capability
15+
with Python
16+
`file-like objects <https://docs.python.org/3/glossary.html#term-file-like-object>`_.
17+
Our example uses a video file, so we use the :class:`~torchcodec.decoders.VideoDecoder`
18+
class to decode it. But all of the lessons here also apply to audio files and the
19+
:class:`~torchcodec.decoders.AudioDecoder` class as well."""
20+
21+
# %%
22+
# First, a bit of boilerplate. We define two functions: one to download content
23+
# from a given URL, and another to time the execution of a given function.
24+
25+
26+
import torch
27+
import requests
28+
from time import perf_counter_ns
29+
30+
31+
def get_url_content(url):
    """Download *url* and return the raw response body as ``bytes``.

    Raises:
        RuntimeError: if the server responds with anything other than HTTP 200.
    """
    response = requests.get(url, headers={"User-Agent": ""})
    if response.status_code == 200:
        return response.content
    raise RuntimeError(f"Failed to download video. {response.status_code = }.")
36+
37+
38+
def bench(f, average_over=10, warmup=2):
    """Time callable *f* and print the median +- std duration in milliseconds.

    *f* is invoked ``warmup`` times first (results discarded), then
    ``average_over`` times while measuring each call with a nanosecond clock.
    """
    for _ in range(warmup):
        f()

    samples_ns = []
    for _ in range(average_over):
        begin = perf_counter_ns()
        f()
        samples_ns.append(perf_counter_ns() - begin)

    # Convert nanoseconds to milliseconds before aggregating.
    samples_ms = torch.tensor(samples_ns) * 1e-6
    std = samples_ms.std().item()
    med = samples_ms.median().item()
    print(f"{med = :.2f}ms +- {std:.2f}")
53+
54+
55+
# %%
56+
# Performance: downloading first vs. streaming
57+
# --------------------------------------------
58+
#
59+
# We are going to investigate the cost of having to download an entire video
60+
# before decoding any frames versus being able to stream the video's data
61+
# while decoding. To demonstrate an extreme case, we're going to always decode
62+
# just the first frame of the video, while we vary how we get that video's
63+
# data.
64+
#
65+
# The video we're going to use in this tutorial is publicly available on the
66+
# internet. We perform an initial download of it so that we can understand
67+
# its size and content:
68+
69+
from torchcodec.decoders import VideoDecoder
70+
71+
nasa_url = "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4"
72+
73+
pre_downloaded_raw_video_bytes = get_url_content(nasa_url)
74+
decoder = VideoDecoder(pre_downloaded_raw_video_bytes)
75+
76+
print(f"Video size in MB: {len(pre_downloaded_raw_video_bytes) // 1024 // 1024}")
77+
print(decoder.metadata)
78+
79+
# %%
80+
# We can see that the video is about 253 MB, has the resolution 1920x1080, is
81+
# about 30 frames per second and is almost 3 and a half minutes long. As we
82+
# only want to decode the first frame, we would clearly benefit from not having
83+
# to download the entire video!
84+
#
85+
# Let's first test three scenarios:
86+
#
87+
# 1. Decode from the *existing* video we just downloaded. This is our baseline
88+
# performance, as we've reduced the downloading cost to 0.
89+
# 2. Download the entire video before decoding. This is the worst case
90+
# that we want to avoid.
91+
# 3. Provide the URL directly to the :class:`~torchcodec.decoders.VideoDecoder` class, which will pass
92+
# the URL on to FFmpeg. Then FFmpeg will decide how much of the video to
93+
# download before decoding.
94+
#
95+
# Note that in our scenarios, we are always setting the ``seek_mode`` parameter of
96+
# the :class:`~torchcodec.decoders.VideoDecoder` class to ``"approximate"``. We do
97+
# this to avoid scanning the entire video during initialization, which would
98+
# require downloading the entire video even if we only want to decode the first
99+
# frame. See :ref:`sphx_glr_generated_examples_approximate_mode.py` for more.
100+
101+
102+
def decode_from_existing_download():
    """Decode the first frame from video bytes already held in memory."""
    return VideoDecoder(
        source=pre_downloaded_raw_video_bytes,
        seek_mode="approximate",
    )[0]
108+
109+
110+
def download_before_decode():
    """Download the entire video first, then decode its first frame."""
    fresh_bytes = get_url_content(nasa_url)
    decoder = VideoDecoder(source=fresh_bytes, seek_mode="approximate")
    return decoder[0]
117+
118+
119+
def direct_url_to_ffmpeg():
    """Hand the URL straight to FFmpeg (via the decoder) and get frame 0."""
    decoder = VideoDecoder(source=nasa_url, seek_mode="approximate")
    first_frame = decoder[0]
    return first_frame
125+
126+
127+
print("Decode from existing download:")
128+
bench(decode_from_existing_download)
129+
print()
130+
131+
print("Download before decode:")
132+
bench(download_before_decode)
133+
print()
134+
135+
print("Direct url to FFmpeg:")
136+
bench(direct_url_to_ffmpeg)
137+
138+
# %%
139+
# Decoding the already downloaded video is clearly the fastest. Having to
140+
# download the entire video each time we want to decode just the first frame
141+
# is many times slower than decoding an existing video. Providing a direct URL
142+
# is much better, but we're still probably downloading more than we need to.
143+
#
144+
# We can do better, and the way how is to use a file-like object which
145+
# implements its own read and seek methods that only download data from a URL as
146+
# needed. Rather than implementing our own, we can use such objects from the
147+
# `fsspec <https://github.com/fsspec/filesystem_spec>`_ module that provides
148+
# `Filesystem interfaces for Python <https://filesystem-spec.readthedocs.io/en/latest/?badge=latest>`_.
149+
# Note that using these capabilities from the fsspec library also requires the
150+
# `aiohttp <https://docs.aiohttp.org/en/stable/>`_ module. You can install both with
151+
# `pip install fsspec aiohttp`.
152+
153+
import fsspec
154+
155+
156+
def stream_while_decode():
    """Decode frame 0, downloading bytes over HTTP only as the decoder asks.

    The ``client_kwargs`` are forwarded to the aiohttp client session;
    trusting the environment picks up any proxy configuration. Depending on
    your environment, you may not need this setting.
    """
    with fsspec.open(nasa_url, client_kwargs={'trust_env': True}) as file_like:
        return VideoDecoder(file_like, seek_mode="approximate")[0]
164+
165+
166+
print("Stream while decode: ")
167+
bench(stream_while_decode)
168+
169+
# %%
170+
# Streaming the data through a file-like object is much faster than
171+
# downloading the video first. And not only is it also faster than
172+
# providing a direct URL, it's more general. :class:`~torchcodec.decoders.VideoDecoder` supports
173+
# direct URLs because the underlying FFmpeg functions support them. But the
174+
# kinds of protocols supported are determined by what that version of FFmpeg
175+
# supports. A file-like object can adapt any kind of resource, including ones
176+
# that are specific to your own infrastructure and are unknown to FFmpeg.
177+
178+
179+
# %%
180+
# How it works
181+
# ------------
182+
# In Python, a `file-like object <https://docs.python.org/3/glossary.html#term-file-like-object>`_
183+
# is any object that exposes special methods for reading, writing and seeking.
184+
# While such methods are obviously file oriented, it's not required that
185+
# a file-like object is backed by an actual file. As far as Python is concerned,
186+
# if an object acts like a file, it's a file. This is a powerful concept, as
187+
# it enables libraries that read or write data to assume a file-like interface.
188+
# Other libraries that present novel resources can then be easily used by
189+
# providing a file-like wrapper for their resource.
190+
#
191+
# For our case, we only need the read and seek methods for decoding. The exact
192+
# method signature needed is in the example below. Rather than wrap a novel
193+
# resource, we demonstrate this capability by wrapping an actual file while
194+
# counting how often each method is called.
195+
196+
from pathlib import Path
197+
import tempfile
198+
199+
# Create a local file to interact with.
200+
temp_dir = tempfile.mkdtemp()
201+
nasa_video_path = Path(temp_dir) / "nasa_video.mp4"
202+
with open(nasa_video_path, "wb") as f:
203+
f.write(pre_downloaded_raw_video_bytes)
204+
205+
206+
# A file-like class that is backed by an actual file, but it intercepts reads
207+
# and seeks to maintain counts.
208+
class FileOpCounter:
    """File-like wrapper that counts read and seek calls.

    Backed by an actual open file object; every ``read``/``seek`` is
    forwarded to the wrapped file while incrementing a counter, so we can
    observe how often the decoder touches the data.
    """

    def __init__(self, file):
        self._file = file
        self.num_reads = 0
        self.num_seeks = 0

    def read(self, size: int) -> bytes:
        """Read up to ``size`` bytes from the underlying file."""
        self.num_reads += 1
        return self._file.read(size)

    def seek(self, offset: int, whence: int) -> int:
        """Seek the underlying file; returns the new absolute position.

        Note: ``file.seek()`` returns an ``int`` (the new offset), not
        ``bytes`` — the original annotation was incorrect.
        """
        self.num_seeks += 1
        return self._file.seek(offset, whence)
221+
222+
223+
# Let's now get a file-like object from our class defined above, providing it a
224+
# reference to the file we created. We pass our file-like object to the decoder
225+
# rather than the file itself.
226+
file_op_counter = FileOpCounter(open(nasa_video_path, "rb"))
227+
counter_decoder = VideoDecoder(file_op_counter, seek_mode="approximate")
228+
229+
print("Decoder initialization required "
230+
f"{file_op_counter.num_reads} reads and "
231+
f"{file_op_counter.num_seeks} seeks.")
232+
233+
init_reads = file_op_counter.num_reads
234+
init_seeks = file_op_counter.num_seeks
235+
236+
first_frame = counter_decoder[0]
237+
238+
print("Decoding the first frame required "
239+
f"{file_op_counter.num_reads - init_reads} additional reads and "
240+
f"{file_op_counter.num_seeks - init_seeks} additional seeks.")
241+
242+
# %%
243+
# While we defined a simple class primarily for demonstration, it's actually
244+
# useful for diagnosing how much reading and seeking are required for different
245+
# decoding operations. We've also introduced a mystery that we should answer:
246+
# why does *initializing* the decoder take more reads and seeks than decoding
247+
# the first frame? The answer is that in our decoder implementation, we're
248+
# actually calling a special
249+
# `FFmpeg function <https://ffmpeg.org/doxygen/6.1/group__lavf__decoding.html#gad42172e27cddafb81096939783b157bb>`_
250+
# that decodes the first few frames to return more robust metadata.
251+
#
252+
# It's also worth noting that the Python file-like interface is only half of
253+
# the story. FFmpeg also has its own mechanism for directing reads and seeks
254+
# during decoding to user-defined functions. The
255+
# :class:`~torchcodec.decoders.VideoDecoder` object does the work of
256+
# connecting the Python methods you define to FFmpeg. All you have to do is
257+
# define your methods in Python, and we do the rest.
258+
259+
# %%
260+
# Performance: local file path vs. local file-like object
261+
# -------------------------------------------------------
262+
#
263+
# Since we have a local file defined, let's do a bonus performance test. We now
264+
# have two means of providing a local file to :class:`~torchcodec.decoders.VideoDecoder`:
265+
#
266+
# 1. Through a *path*, where the :class:`~torchcodec.decoders.VideoDecoder`
267+
# object will then do the work of opening the local file at that path.
268+
# 2. Through a *file-like object*, where you open the file yourself and provide
269+
# the file-like object to :class:`~torchcodec.decoders.VideoDecoder`.
270+
#
271+
# An obvious question is: which is faster? The code below tests that question.
272+
273+
274+
def decode_from_existing_file_path():
    """Decode frame 0 by giving the decoder a local file *path*."""
    return VideoDecoder(nasa_video_path, seek_mode="approximate")[0]
277+
278+
279+
def decode_from_existing_open_file_object():
    """Decode frame 0 by opening the file ourselves and passing the object."""
    with open(nasa_video_path, "rb") as file:
        first_frame = VideoDecoder(file, seek_mode="approximate")[0]
    return first_frame
283+
284+
285+
print("Decode from existing file path:")
286+
bench(decode_from_existing_file_path)
287+
print()
288+
289+
print("Decode from existing open file object:")
290+
bench(decode_from_existing_open_file_object)
291+
292+
# %%
293+
# Thankfully, the answer is both means of decoding from a local file take about
294+
# the same amount of time. This result means that in your own code, you can use
295+
# whichever method is more convenient. What this result implies is that the
296+
# cost of actually reading and copying data dominates the cost of calling Python
297+
# methods while decoding.
298+
299+
# %%
300+
# Finally, let's clean up the local resources we created.
301+
import shutil
302+
shutil.rmtree(temp_dir)
303+
# %%

src/torchcodec/decoders/_audio_decoder.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ class AudioDecoder:
3232
- If ``Pathlib.path``: a path to a local video or audio file.
3333
- If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
3434
- If file-like object: we read video data from the object on demand. The object must
35-
expose the methods ``read(self, size: int) -> bytes`` and
36-
``seek(self, offset: int, whence: int) -> bytes``. Read more in TODO_FILE_LIKE_TUTORIAL.
35+
expose the methods ``read(self, size: int) -> bytes`` and
36+
``seek(self, offset: int, whence: int) -> int``. Read more in:
37+
:ref:`sphx_glr_generated_examples_file_like.py`.
3738
stream_index (int, optional): Specifies which stream in the file to decode samples from.
3839
Note that this index is absolute across all media types. If left unspecified, then
3940
the :term:`best stream` is used.

src/torchcodec/decoders/_video_decoder.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ class VideoDecoder:
2828
- If ``Pathlib.path``: a path to a local video file.
2929
- If ``bytes`` object or ``torch.Tensor``: the raw encoded video data.
3030
- If file-like object: we read video data from the object on demand. The object must
31-
expose the methods ``read(self, size: int) -> bytes`` and
32-
``seek(self, offset: int, whence: int) -> bytes``. Read more in TODO_FILE_LIKE_TUTORIAL.
31+
expose the methods ``read(self, size: int) -> bytes`` and
32+
``seek(self, offset: int, whence: int) -> int``. Read more in:
33+
:ref:`sphx_glr_generated_examples_file_like.py`.
3334
stream_index (int, optional): Specifies which stream in the video to decode frames from.
3435
Note that this index is absolute across all media types. If left unspecified, then
3536
the :term:`best stream` is used.

0 commit comments

Comments
 (0)