
Commit ee1954a

[persist] refactor Blob impl for Azure for higher performance (#31127)
This refactors the impl of `Blob` for Azure in a way that should be faster. The `BlobClient` we use from the `azure_storage_blob` crate returns a `Stream` that, when `await`-ed, sends a ranged GET request for a chunk of a blob. This PR refactors our impl so that we await each ranged request in a `tokio::task`, which increases the concurrency at which we fetch chunks of a `Part`. It also refactors how we handle the case when the `content-length` header is missing, and adds metrics so we can track how often this occurs.

### Motivation

Maybe progress against https://github.com/MaterializeInc/database-issues/issues/8892

### Checklist

- [ ] This PR has adequate test coverage / QA involvement has been duly considered. ([trigger-ci for additional test/nightly runs](https://trigger-ci.dev.materialize.com/))
- [ ] This PR has an associated up-to-date [design doc](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/README.md), is a design doc ([template](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/00000000_template.md)), or is sufficiently small to not require a design. <!-- Reference the design in the description. -->
- [ ] If this PR evolves [an existing `$T ⇔ Proto$T` mapping](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/command-and-response-binary-encoding.md) (possibly in a backwards-incompatible way), then it is tagged with a `T-proto` label.
- [ ] If this PR will require changes to cloud orchestration or tests, there is a companion cloud PR to account for those changes that is tagged with the release-blocker label ([example](https://github.com/MaterializeInc/cloud/pull/5021)). <!-- Ask in #team-cloud on Slack if you need help preparing the cloud PR. -->
- [ ] If this PR includes major [user-facing behavior changes](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/guide-changes.md#what-changes-require-a-release-note), I have pinged the relevant PM to schedule a changelog post.
1 parent 3b86bc3 commit ee1954a
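A minimal sketch of the concurrency pattern described above, under simplifying assumptions: the `fetch_chunk` helper, the chunk sizes, and the byte payloads below are hypothetical stand-ins, not the real `azure_storage_blobs` response types or Materialize's `SegmentedBytes`/lgalloc machinery. It shows how queueing one future per ranged request into a `FuturesOrdered` lets the chunks be fetched concurrently while still yielding them in order for reassembly.

```rust
use futures_util::stream::{FuturesOrdered, StreamExt};

/// Hypothetical stand-in for awaiting the body of one ranged GET response.
async fn fetch_chunk(offset: u64, len: u64) -> Result<Vec<u8>, String> {
    // A real implementation would await the network response here; we just
    // fabricate `len` bytes so the sketch stays self-contained.
    let _ = offset;
    Ok(vec![0u8; len as usize])
}

/// Fetch a blob of `total_len` bytes in `chunk_len`-sized ranged requests,
/// driving the per-chunk futures concurrently but preserving their order.
async fn get_blob(total_len: u64, chunk_len: u64) -> Result<Vec<u8>, String> {
    let mut requests = FuturesOrdered::new();
    let mut offset = 0;
    while offset < total_len {
        let len = chunk_len.min(total_len - offset);
        requests.push_back(fetch_chunk(offset, len));
        offset += len;
    }

    // `FuturesOrdered` polls every queued future as the stream is polled, so
    // the chunks are fetched concurrently, but `next()` yields results in the
    // order they were pushed.
    let mut blob = Vec::with_capacity(total_len as usize);
    while let Some(chunk) = requests.next().await {
        blob.extend_from_slice(&chunk?);
    }
    Ok(blob)
}

#[tokio::main]
async fn main() {
    let blob = get_blob(4096, 1024).await.expect("sketch should not fail");
    println!("fetched {} bytes in ordered chunks", blob.len());
}
```

Using `FuturesOrdered` (rather than `FuturesUnordered` or hand-rolled joins) keeps reassembly trivial, since chunks come back in request order even if later ranges finish first.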

File tree: 1 file changed, +69 -42 lines
src/persist/src/azure.rs

Lines changed: 69 additions & 42 deletions
@@ -12,13 +12,15 @@
 use std::fmt::Debug;
 use std::sync::Arc;
 
-use anyhow::anyhow;
+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use azure_core::StatusCode;
 use azure_identity::create_default_credential;
 use azure_storage::{prelude::*, CloudLocation, EMULATOR_ACCOUNT};
+use azure_storage_blobs::blob::operations::GetBlobResponse;
 use azure_storage_blobs::prelude::*;
 use bytes::Bytes;
+use futures_util::stream::FuturesOrdered;
 use futures_util::StreamExt;
 use tracing::{info, warn};
 use url::Url;
@@ -185,29 +187,12 @@ impl Blob for AzureBlob {
     async fn get(&self, key: &str) -> Result<Option<SegmentedBytes>, ExternalError> {
         let path = self.get_path(key);
         let blob = self.client.blob_client(path);
-        let mut segments: Vec<MaybeLgBytes> = vec![];
-
-        // TODO: the default chunk size is 1MB. We have not tried tuning it,
-        // but making this configurable / running some benchmarks could be
-        // valuable.
-        let mut stream = blob.get().into_stream();
-        while let Some(value) = stream.next().await {
-            let response = match value {
-                Ok(v) => v,
-                Err(e) => {
-                    if let Some(e) = e.as_http_error() {
-                        if e.status() == StatusCode::NotFound {
-                            return Ok(None);
-                        }
-                    }
-
-                    return Err(ExternalError::from(anyhow!(
-                        "Azure blob get error: {:?}",
-                        e
-                    )));
-                }
-            };
 
+        /// Fetch the body of a single [`GetBlobResponse`].
+        async fn fetch_chunk(
+            response: GetBlobResponse,
+            metrics: S3BlobMetrics,
+        ) -> Result<MaybeLgBytes, ExternalError> {
             let content_length = response.blob.properties.content_length;
 
             // Here we're being quite defensive. If `content_length` comes back
@@ -216,35 +201,86 @@ impl Blob for AzureBlob {
             // buffer into lgalloc.
             let mut buffer = match content_length {
                 1.. => {
-                    let region = self
-                        .metrics
+                    let region = metrics
                         .lgbytes
                         .persist_azure
                         .new_region(usize::cast_from(content_length));
                     PreSizedBuffer::Sized(region)
                 }
-                0 => PreSizedBuffer::Unknown(Vec::new()),
+                0 => PreSizedBuffer::Unknown(SegmentedBytes::new()),
             };
 
             let mut body = response.data;
             while let Some(value) = body.next().await {
                 let value = value.map_err(|e| {
                     ExternalError::from(anyhow!("Azure blob get body error: {}", e))
                 })?;
-                buffer.extend_from_slice(&value);
+
+                match &mut buffer {
+                    PreSizedBuffer::Sized(region) => region.extend_from_slice(&value),
+                    PreSizedBuffer::Unknown(segments) => segments.push(value),
+                }
             }
 
             // Spill our bytes to lgalloc, if they aren't already.
-            let lg_bytes = match buffer {
+            let lgbytes = match buffer {
                 PreSizedBuffer::Sized(region) => LgBytes::from(Arc::new(region)),
-                PreSizedBuffer::Unknown(buffer) => {
-                    self.metrics.lgbytes.persist_azure.try_mmap(buffer)
+                // Now that we've collected all of the segments, we know the size of our region.
+                PreSizedBuffer::Unknown(segments) => {
+                    let mut region = metrics.lgbytes.persist_azure.new_region(segments.len());
+                    for segment in segments.into_segments() {
+                        region.extend_from_slice(segment.as_ref());
+                    }
+                    LgBytes::from(Arc::new(region))
                 }
             };
-            segments.push(MaybeLgBytes::LgBytes(lg_bytes));
+
+            // Report if the content-length header didn't match the number of
+            // bytes we read from the network.
+            if content_length != u64::cast_from(lgbytes.len()) {
+                metrics.get_invalid_resp.inc();
+            }
+
+            Ok(MaybeLgBytes::LgBytes(lgbytes))
         }
 
-        Ok(Some(SegmentedBytes::from(segments)))
+        let mut requests = FuturesOrdered::new();
+        // TODO: the default chunk size is 1MB. We have not tried tuning it,
+        // but making this configurable / running some benchmarks could be
+        // valuable.
+        let mut stream = blob.get().into_stream();
+
+        while let Some(value) = stream.next().await {
+            // Return early if any of the individual fetch requests return an error.
+            let response = match value {
+                Ok(v) => v,
+                Err(e) => {
+                    if let Some(e) = e.as_http_error() {
+                        if e.status() == StatusCode::NotFound {
+                            return Ok(None);
+                        }
+                    }
+
+                    return Err(ExternalError::from(anyhow!(
+                        "Azure blob get error: {:?}",
+                        e
+                    )));
+                }
+            };
+
+            // Drive all of the fetch requests concurrently.
+            let metrics = self.metrics.clone();
+            requests.push_back(fetch_chunk(response, metrics));
+        }
+
+        // Await on all of our chunks.
+        let mut segments = SegmentedBytes::with_capacity(requests.len());
+        while let Some(body) = requests.next().await {
+            let segment = body.context("azure get body err")?;
+            segments.push(segment);
+        }
+
+        Ok(Some(segments))
     }
 
     async fn list_keys_and_metadata(
@@ -343,16 +379,7 @@ impl Blob for AzureBlob {
 /// that as we read bytes off the network.
 enum PreSizedBuffer {
     Sized(MetricsRegion<u8>),
-    Unknown(Vec<u8>),
-}
-
-impl PreSizedBuffer {
-    fn extend_from_slice(&mut self, slice: &[u8]) {
-        match self {
-            PreSizedBuffer::Sized(region) => region.extend_from_slice(slice),
-            PreSizedBuffer::Unknown(buffer) => buffer.extend_from_slice(slice),
-        }
-    }
+    Unknown(SegmentedBytes),
 }
 
 #[cfg(test)]
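The other half of the change is the defensive `content-length` handling shown in the diff: pre-size the buffer only when the header reports a non-zero length, otherwise collect segments and allocate once the true size is known, and bump a metric whenever the header disagrees with the bytes actually read. Below is a simplified, self-contained sketch of that pattern, assuming plain `Vec`s and a static counter in place of the real `SegmentedBytes`, lgalloc-backed regions, and `get_invalid_resp` metric.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Hypothetical stand-in for the `get_invalid_resp` counter: how often the
/// advertised `content-length` disagreed with the bytes actually read.
static INVALID_RESPONSES: AtomicU64 = AtomicU64::new(0);

/// Simplified analogue of the diff's `PreSizedBuffer`.
enum PreSizedBuffer {
    /// `content-length` was non-zero, so we can allocate up front.
    Sized(Vec<u8>),
    /// `content-length` was 0 (treated as possibly missing); collect segments
    /// first and allocate once the true size is known.
    Unknown(Vec<Vec<u8>>),
}

fn read_body(content_length: u64, body: impl IntoIterator<Item = Vec<u8>>) -> Vec<u8> {
    let mut buffer = match content_length {
        1.. => PreSizedBuffer::Sized(Vec::with_capacity(content_length as usize)),
        0 => PreSizedBuffer::Unknown(Vec::new()),
    };

    for segment in body {
        match &mut buffer {
            PreSizedBuffer::Sized(buf) => buf.extend_from_slice(&segment),
            PreSizedBuffer::Unknown(segments) => segments.push(segment),
        }
    }

    let bytes = match buffer {
        PreSizedBuffer::Sized(buf) => buf,
        // Only now do we know how big the blob really is.
        PreSizedBuffer::Unknown(segments) => {
            let len: usize = segments.iter().map(|s| s.len()).sum();
            let mut buf = Vec::with_capacity(len);
            for segment in segments {
                buf.extend_from_slice(&segment);
            }
            buf
        }
    };

    // Record how often the header disagreed with (or omitted) the real length.
    if content_length != bytes.len() as u64 {
        INVALID_RESPONSES.fetch_add(1, Ordering::Relaxed);
    }
    bytes
}

fn main() {
    // Pretend the header claimed 16 bytes while the body really carries 15.
    let body = vec![vec![1u8; 10], vec![2u8; 5]];
    let bytes = read_body(16, body);
    assert_eq!(bytes.len(), 15);
    assert_eq!(INVALID_RESPONSES.load(Ordering::Relaxed), 1);
}
```

Treating a `content-length` of 0 as possibly missing errs on the side of caution: a genuinely empty blob simply takes the slower collect-then-copy path, while a missing or wrong header is surfaced through the counter rather than silently producing a mis-sized allocation.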
