apache · alamb · Apr 26, 2026 · May 13, 2026 · Jun 19, 2026 · Jun 23, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml
@@ -60,7 +60,7 @@ futures = { workspace = true }
 insta = { workspace = true }
 log = { workspace = true }
 mimalloc = { version = "0.1", default-features = false }
-object_store = { workspace = true, features = ["aws", "http"] }
+object_store = { workspace = true, features = ["aws", "fs", "http"] }
 prost = { workspace = true }
 rand = { workspace = true }
 serde = { version = "1", features = ["derive"] }

diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
@@ -93,6 +93,7 @@ cargo run --example dataframe -- dataframe
 | catalog                 | [`data_io/catalog.rs`](examples/data_io/catalog.rs)                                       | Register tables into a custom catalog                                     |
 | in_memory_object_store  | [`data_io/in_memory_object_store.rs`](examples/data_io/in_memory_object_store.rs)         | Read CSV from an in-memory object store (pattern applies to JSON/Parquet) |
 | json_shredding          | [`data_io/json_shredding.rs`](examples/data_io/json_shredding.rs)                         | Implement filter rewriting for JSON shredding                             |
+| object_store_spill      | [`data_io/object_store_spill.rs`](examples/data_io/object_store_spill.rs)                 | Use ObjectStore-backed spill files                                        |
 | parquet_adv_idx         | [`data_io/parquet_advanced_index.rs`](examples/data_io/parquet_advanced_index.rs)         | Create a secondary index across multiple parquet files                    |
 | parquet_emb_idx         | [`data_io/parquet_embedded_index.rs`](examples/data_io/parquet_embedded_index.rs)         | Store a custom index inside Parquet files                                 |
 | parquet_enc             | [`data_io/parquet_encrypted.rs`](examples/data_io/parquet_encrypted.rs)                   | Read & write encrypted Parquet files                                      |

diff --git a/datafusion-examples/examples/data_io/main.rs b/datafusion-examples/examples/data_io/main.rs
@@ -21,7 +21,7 @@
 //!
 //! ## Usage
 //! ```bash
-//! cargo run --example data_io -- [all|catalog|in_memory_object_store|json_shredding|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
+//! cargo run --example data_io -- [all|catalog|in_memory_object_store|json_shredding|object_store_spill|parquet_adv_idx|parquet_emb_idx|parquet_enc_with_kms|parquet_enc|parquet_exec_visitor|parquet_idx|query_http_csv|remote_catalog]
 //! ```
 //!
 //! Each subcommand runs a corresponding example:
@@ -36,6 +36,9 @@
 //! - `json_shredding`
 //!   (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
 //!
+//! - `object_store_spill`
+//!   (file: object_store_spill.rs, desc: Use ObjectStore-backed spill files)
+//!
 //! - `parquet_adv_idx`
 //!   (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files)
 //!
@@ -66,6 +69,7 @@
 mod catalog;
 mod in_memory_object_store;
 mod json_shredding;
+mod object_store_spill;
 mod parquet_advanced_index;
 mod parquet_embedded_index;
 mod parquet_encrypted;
@@ -87,6 +91,7 @@ enum ExampleKind {
     Catalog,
     InMemoryObjectStore,
     JsonShredding,
+    ObjectStoreSpill,
     ParquetAdvIdx,
     ParquetEmbIdx,
     ParquetEnc,
@@ -118,6 +123,9 @@ impl ExampleKind {
                 in_memory_object_store::in_memory_object_store().await?
             }
             ExampleKind::JsonShredding => json_shredding::json_shredding().await?,
+            ExampleKind::ObjectStoreSpill => {
+                object_store_spill::object_store_spill().await?
+            }
             ExampleKind::ParquetAdvIdx => {
                 parquet_advanced_index::parquet_advanced_index().await?
             }

diff --git a/datafusion-examples/examples/data_io/object_store_spill.rs b/datafusion-examples/examples/data_io/object_store_spill.rs
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! See `main.rs` for how to run it.
+//!
+//! [`object_store_spill`] shows how to use the [`TempFileFactory`] API to configure
+//! DataFusion to spill intermediate results to remote storage when it exceeds
+//! its memory limits.
+use std::future::Future;
+use std::io::Write;
+use std::path::Path as StdPath;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use bytes::Bytes;
+use datafusion::common::Result;
+use datafusion::error::DataFusionError;
+use datafusion::execution::disk_manager::DiskManagerBuilder;
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::execution::{SpillFile, SpillWriter, TempFileFactory};
+use datafusion::prelude::{SessionConfig, SessionContext};
+use futures::{Stream, StreamExt, stream};
+use object_store::local::LocalFileSystem;
+use object_store::path::Path;
+use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+use tempfile::tempdir;
+
+/// Demonstrates configuring DataFusion with spill files backed by an ObjectStore.
+pub async fn object_store_spill() -> Result<()> {
+    // Use LocalFileSystem to simplify running the example. A real system would
+    // use S3, GCS, Azure, or some other ObjectStore for remote spills.
+    let tmp_dir = tempdir()?;
+    let store: Arc<dyn ObjectStore> =
+        Arc::new(LocalFileSystem::new_with_prefix(tmp_dir.path())?);
+
+    // Create the custom TempFileFactory that will create spill files in the ObjectStore.
+    let temp_file_factory = Arc::new(ObjectStoreTempFileFactory::new(store));
+    let disk_manager_builder =
+        DiskManagerBuilder::default().with_temp_file_factory(temp_file_factory.clone());
+    let runtime = RuntimeEnvBuilder::new()
+        // Configure DataFusion with the factory
+        .with_disk_manager_builder(disk_manager_builder)
+        // and set a small memory limit so the ORDER BY query below spills.
+        .with_memory_limit(1024 * 1024, 1.0)
+        .build_arc()?;
+
+    // Configure a SessionContext for running queries, with a single partition
+    // and no sort spill reservation to make the example deterministic and keep
+    // the spill behavior easy to observe.
+    let config = SessionConfig::new()
+        .with_sort_spill_reservation_bytes(0)
+        .with_sort_in_place_threshold_bytes(0)
+        .with_target_partitions(1);
+    let ctx = SessionContext::new_with_config_rt(config, Arc::clone(&runtime));
+
+    // Run an SQL query that sorts a large amount of data; the low memory
+    // limit set above forces the sort to spill.
+    let row_count = 10_000_000;
+    let mut stream = ctx
+        .sql(&format!(
+            "SELECT * FROM generate_series(1, {row_count}) AS t(v) ORDER BY v DESC"
+        ))
+        .await?
+        .execute_stream()
+        .await?;
+
+    // Drive the query to completion, and verify output
+    let mut output_rows = 0;
+    while let Some(batch) = stream.next().await {
+        output_rows += batch?.num_rows();
+    }
+
+    assert_eq!(output_rows, row_count as usize);
+    assert!(
+        temp_file_factory.created_files() > 0,
+        "expected the custom TempFileFactory to be used for spilling"
+    );
+
+    Ok(())
+}
+
+/// Creates spill files backed by an [`ObjectStore`].
+///
+/// DataFusion calls this factory whenever an operator needs a new temporary
+/// file for spilling. A remote deployment would use the same pattern with an
+/// S3, GCS, Azure, or other remote ObjectStore implementation.
+struct ObjectStoreTempFileFactory {
+    /// ObjectStore used for all spill file reads and writes.
+    store: Arc<dyn ObjectStore>,
+    /// Monotonic counter used to create unique object paths.
+    counter: AtomicU64,
+    /// Counts how many spill files DataFusion requested from this factory.
+    created_files: AtomicU64,
+}
+
+impl ObjectStoreTempFileFactory {
+    /// Create a new spill file factory that stores spill data in `store`.
+    fn new(store: Arc<dyn ObjectStore>) -> Self {
+        Self {
+            store,
+            counter: AtomicU64::new(0),
+            created_files: AtomicU64::new(0),
+        }
+    }
+
+    /// Return the number of spill files created through this factory.
+    fn created_files(&self) -> u64 {
+        self.created_files.load(Ordering::Relaxed)
+    }
+}
+
+// The DataFusion TempFileFactory trait requires unwind-safety. This example
+// only stores atomics and an ObjectStore handle, so it is unwind safe.
+impl std::panic::RefUnwindSafe for ObjectStoreTempFileFactory {}
+impl std::panic::UnwindSafe for ObjectStoreTempFileFactory {}
+
+impl TempFileFactory for ObjectStoreTempFileFactory {
+    /// Create one logical spill file at a unique ObjectStore location.
+    fn create_temp_file(&self, description: &str) -> Result<Arc<dyn SpillFile>> {
+        let id = self.counter.fetch_add(1, Ordering::Relaxed);
+        self.created_files.fetch_add(1, Ordering::Relaxed);
+        let description = sanitize_path_part(description);
+        let location = Path::from(format!("spill/{description}-{id}.bin"));
+
+        // Return a SpillFile implementation that reads and writes this ObjectStore path.
+        Ok(Arc::new(ObjectStoreSpillFile {
+            store: Arc::clone(&self.store),
+            location,
+            size: Arc::new(AtomicU64::new(0)),
+        }))
+    }
+}
+
+/// Logical spill file stored at an ObjectStore path.
+///
+/// DataFusion writes spill data by calling [`SpillFile::open_writer`] and reads
+/// it back by calling [`SpillFile::read_stream`].
+struct ObjectStoreSpillFile {
+    /// ObjectStore containing the spill object.
+    store: Arc<dyn ObjectStore>,
+    /// ObjectStore path for this spill object.
+    location: Path,
+    /// Last committed object size, updated when the writer finishes.
+    size: Arc<AtomicU64>,
+}
+
+impl SpillFile for ObjectStoreSpillFile {
+    /// Return a local filesystem path when one exists.
+    fn path(&self) -> Option<&StdPath> {
+        // Remote ObjectStores do not have a local OS path.
+        None
+    }
+
+    /// Return the size of the uploaded object.
+    fn size(&self) -> Option<u64> {
+        // Return the last committed size, which this example tracks after upload.
+        Some(self.size.load(Ordering::Relaxed))
+    }
+
+    /// Read the spill file contents as an async byte stream.
+    fn read_stream(&self) -> Result<Pin<Box<dyn Stream<Item = Result<Bytes>> + Send>>> {
+        // Defer the ObjectStore read until DataFusion polls the returned stream.
+        let store = Arc::clone(&self.store);
+        let location = self.location.clone();
+
+        Ok(Box::pin(stream::once(async move {
+            Ok(store.get(&location).await?.bytes().await?)
+        })))
+    }
+
+    /// Open a synchronous writer for this spill file.
+    fn open_writer(&self) -> Result<Box<dyn SpillWriter>> {
+        // Create a writer that buffers bytes and uploads them on finish.
+        Ok(Box::new(ObjectStoreSpillWriter {
+            store: Arc::clone(&self.store),
+            location: self.location.clone(),
+            size: Arc::clone(&self.size),
+            buffer: Vec::new(),
+        }))
+    }
+}
+
+/// Spill writer that adapts DataFusion's synchronous write API to ObjectStore.
+///
+/// This simple example buffers bytes in memory and uploads them in
+/// [`SpillWriter::finish`]. A production remote implementation should consider
+/// multipart or streaming uploads.
+struct ObjectStoreSpillWriter {
+    /// ObjectStore that receives the final spill object.
+    store: Arc<dyn ObjectStore>,
+    /// ObjectStore path to upload to.
+    location: Path,
+    /// Shared size field on the corresponding [`ObjectStoreSpillFile`].
+    size: Arc<AtomicU64>,
+    /// Buffered spill bytes waiting to be uploaded.
+    buffer: Vec<u8>,
+}
+
+impl Write for ObjectStoreSpillWriter {
+    /// Append bytes to the in-memory buffer.
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        // Buffer bytes written through the synchronous Write API.
+        self.buffer.extend_from_slice(buf);
+        Ok(buf.len())
+    }
+
+    /// No-op because data is committed in [`SpillWriter::finish`].
+    fn flush(&mut self) -> std::io::Result<()> {
+        Ok(())
+    }
+}
+
+impl SpillWriter for ObjectStoreSpillWriter {
+    /// Upload buffered bytes to ObjectStore and mark the spill file complete.
+    fn finish(&mut self) -> Result<()> {
+        // Move the buffered bytes into the upload future.
+        let store = Arc::clone(&self.store);
+        let location = self.location.clone();
+        let data = std::mem::take(&mut self.buffer);
+        let size = data.len() as u64;
+
+        // This simple example buffers the spill and uploads it on finish.
+        // Production remote stores should consider multipart or streaming uploads.
+        block_on_object_store(async move {
+            store
+                .put(&location, PutPayload::from_bytes(data.into()))
+                .await?;
+            Ok(())
+        })?;
+
+        self.size.store(size, Ordering::Relaxed);
+        Ok(())
+    }
+}
+
+/// Run an async ObjectStore operation from DataFusion's synchronous spill API.
+/// TODO remove this. Note: `block_in_place` panics on a current-thread Tokio runtime.
+fn block_on_object_store<T>(future: impl Future<Output = Result<T>>) -> Result<T> {
+    if let Ok(handle) = tokio::runtime::Handle::try_current() {
+        tokio::task::block_in_place(|| handle.block_on(future))
+    } else {
+        tokio::runtime::Runtime::new()
+            .map_err(DataFusionError::IoError)?
+            .block_on(future)
+    }
+}
+
+/// Convert a query-provided spill description into an ObjectStore-safe path component.
+///
+/// For example, `"Sort Spill: partition 0"` becomes `"Sort_Spill__partition_0"`.
+fn sanitize_path_part(value: &str) -> String {
+    value
+        .chars()
+        .map(|c| if c.is_ascii_alphanumeric() { c } else { '_' })
+        .collect()
+}
diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml
@@ -55,6 +55,7 @@ sql = []
 arrow = { workspace = true }
 arrow-buffer = { workspace = true }
 async-trait = { workspace = true }
+bytes = { workspace = true }
 dashmap = { workspace = true }
 datafusion-common = { workspace = true, default-features = false }
 datafusion-expr = { workspace = true, default-features = false }
@@ -66,7 +67,11 @@ parking_lot = { workspace = true }
 parquet = { workspace = true, optional = true }
 rand = { workspace = true }
 tempfile = { workspace = true }
+tokio = { workspace = true }
+tokio-util = { workspace = true, features = ["io"] }
 url = { workspace = true }
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
+tokio = { workspace = true, features = ["fs"] }
 
 [dev-dependencies]
 chrono = { workspace = true }