
Commit ee1954a

[persist] refactor Blob impl for Azure for higher performance (#31127)
This refactors the impl of `Blob` for Azure in a way that should be faster. The `BlobClient` we use from the `azure_storage_blob` crate returns a `Stream` that, when `await`-ed, sends a ranged GET request for a chunk of a blob. This PR refactors our impl so that we await each ranged request in a `tokio::task`, which increases the concurrency at which we fetch chunks of a `Part`. It also refactors how we handle the case when the `content-length` header is missing, and adds metrics so we can track how often this occurs.

### Motivation

Maybe progress against https://github.com/MaterializeInc/database-issues/issues/8892

### Checklist

- [ ] This PR has adequate test coverage / QA involvement has been duly considered. ([trigger-ci for additional test/nightly runs](https://trigger-ci.dev.materialize.com/))
- [ ] This PR has an associated up-to-date [design doc](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/README.md), is a design doc ([template](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/00000000_template.md)), or is sufficiently small to not require a design. <!-- Reference the design in the description. -->
- [ ] If this PR evolves [an existing `$T ⇔ Proto$T` mapping](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/command-and-response-binary-encoding.md) (possibly in a backwards-incompatible way), then it is tagged with a `T-proto` label.
- [ ] If this PR will require changes to cloud orchestration or tests, there is a companion cloud PR to account for those changes that is tagged with the release-blocker label ([example](https://github.com/MaterializeInc/cloud/pull/5021)). <!-- Ask in #team-cloud on Slack if you need help preparing the cloud PR. -->
- [ ] If this PR includes major [user-facing behavior changes](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/guide-changes.md#what-changes-require-a-release-note), I have pinged the relevant PM to schedule a changelog post.
1 parent 3b86bc3 commit ee1954a
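A minimal sketch of the concurrency pattern described above, under simplifying assumptions: the `fetch_chunk` helper, the chunk sizes, and the byte payloads below are hypothetical stand-ins, not the real `azure_storage_blobs` response types or Materialize's `SegmentedBytes`/lgalloc machinery. It shows how queueing one future per ranged request into a `FuturesOrdered` lets the chunks be fetched concurrently while still yielding them in order for reassembly.

```rust
use futures_util::stream::{FuturesOrdered, StreamExt};

/// Hypothetical stand-in for awaiting the body of one ranged GET response.
async fn fetch_chunk(offset: u64, len: u64) -> Result<Vec<u8>, String> {
    // A real implementation would await the network response here; we just
    // fabricate `len` bytes so the sketch stays self-contained.
    let _ = offset;
    Ok(vec![0u8; len as usize])
}

/// Fetch a blob of `total_len` bytes in `chunk_len`-sized ranged requests,
/// driving the per-chunk futures concurrently but preserving their order.
async fn get_blob(total_len: u64, chunk_len: u64) -> Result<Vec<u8>, String> {
    let mut requests = FuturesOrdered::new();
    let mut offset = 0;
    while offset < total_len {
        let len = chunk_len.min(total_len - offset);
        requests.push_back(fetch_chunk(offset, len));
        offset += len;
    }

    // `FuturesOrdered` polls every queued future as the stream is polled, so
    // the chunks are fetched concurrently, but `next()` yields results in the
    // order they were pushed.
    let mut blob = Vec::with_capacity(total_len as usize);
    while let Some(chunk) = requests.next().await {
        blob.extend_from_slice(&chunk?);
    }
    Ok(blob)
}

#[tokio::main]
async fn main() {
    let blob = get_blob(4096, 1024).await.expect("sketch should not fail");
    println!("fetched {} bytes in ordered chunks", blob.len());
}
```

Using `FuturesOrdered` (rather than `FuturesUnordered` or hand-rolled joins) keeps reassembly trivial, since chunks come back in request order even if later ranges finish first.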

File tree: 1 file changed, +69 -42 lines
src/persist/src/azure.rs

Lines changed: 69 additions & 42 deletions
@@ -12,13 +12,15 @@
 use std::fmt::Debug;
 use std::sync::Arc;
 
-use anyhow::anyhow;
+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use azure_core::StatusCode;
 use azure_identity::create_default_credential;
 use azure_storage::{prelude::*, CloudLocation, EMULATOR_ACCOUNT};
+use azure_storage_blobs::blob::operations::GetBlobResponse;
 use azure_storage_blobs::prelude::*;
 use bytes::Bytes;
+use futures_util::stream::FuturesOrdered;
 use futures_util::StreamExt;
 use tracing::{info, warn};
 use url::Url;
@@ -185,29 +187,12 @@ impl Blob for AzureBlob {
     async fn get(&self, key: &str) -> Result<Option<SegmentedBytes>, ExternalError> {
         let path = self.get_path(key);
         let blob = self.client.blob_client(path);
-        let mut segments: Vec<MaybeLgBytes> = vec![];
-
-        // TODO: the default chunk size is 1MB. We have not tried tuning it,
-        // but making this configurable / running some benchmarks could be
-        // valuable.
-        let mut stream = blob.get().into_stream();
-        while let Some(value) = stream.next().await {
-            let response = match value {
-                Ok(v) => v,
-                Err(e) => {
-                    if let Some(e) = e.as_http_error() {
-                        if e.status() == StatusCode::NotFound {
-                            return Ok(None);
-                        }
-                    }
-
-                    return Err(ExternalError::from(anyhow!(
-                        "Azure blob get error: {:?}",
-                        e
-                    )));
-                }
-            };
 
+        /// Fetch the body of a single [`GetBlobResponse`].
+        async fn fetch_chunk(
+            response: GetBlobResponse,
+            metrics: S3BlobMetrics,
+        ) -> Result<MaybeLgBytes, ExternalError> {
             let content_length = response.blob.properties.content_length;
 
             // Here we're being quite defensive. If `content_length` comes back
@@ -216,35 +201,86 @@ impl Blob for AzureBlob {
             // buffer into lgalloc.
             let mut buffer = match content_length {
                 1.. => {
-                    let region = self
-                        .metrics
+                    let region = metrics
                         .lgbytes
                         .persist_azure
                         .new_region(usize::cast_from(content_length));
                     PreSizedBuffer::Sized(region)
                 }
-                0 => PreSizedBuffer::Unknown(Vec::new()),
+                0 => PreSizedBuffer::Unknown(SegmentedBytes::new()),
             };
 
             let mut body = response.data;
             while let Some(value) = body.next().await {
                 let value = value.map_err(|e| {
                     ExternalError::from(anyhow!("Azure blob get body error: {}", e))
                 })?;
-                buffer.extend_from_slice(&value);
+
+                match &mut buffer {
+                    PreSizedBuffer::Sized(region) => region.extend_from_slice(&value),
+                    PreSizedBuffer::Unknown(segments) => segments.push(value),
+                }
             }
 
             // Spill our bytes to lgalloc, if they aren't already.
-            let lg_bytes = match buffer {
+            let lgbytes = match buffer {
                 PreSizedBuffer::Sized(region) => LgBytes::from(Arc::new(region)),
-                PreSizedBuffer::Unknown(buffer) => {
-                    self.metrics.lgbytes.persist_azure.try_mmap(buffer)
+                // Now that we've collected all of the segments, we know the size of our region.
+                PreSizedBuffer::Unknown(segments) => {
+                    let mut region = metrics.lgbytes.persist_azure.new_region(segments.len());
+                    for segment in segments.into_segments() {
+                        region.extend_from_slice(segment.as_ref());
+                    }
+                    LgBytes::from(Arc::new(region))
                 }
             };
-            segments.push(MaybeLgBytes::LgBytes(lg_bytes));
+
+            // Report if the content-length header didn't match the number of
+            // bytes we read from the network.
+            if content_length != u64::cast_from(lgbytes.len()) {
+                metrics.get_invalid_resp.inc();
+            }
+
+            Ok(MaybeLgBytes::LgBytes(lgbytes))
         }
 
-        Ok(Some(SegmentedBytes::from(segments)))
+        let mut requests = FuturesOrdered::new();
+        // TODO: the default chunk size is 1MB. We have not tried tuning it,
+        // but making this configurable / running some benchmarks could be
+        // valuable.
+        let mut stream = blob.get().into_stream();
+
+        while let Some(value) = stream.next().await {
+            // Return early if any of the individual fetch requests return an error.
+            let response = match value {
+                Ok(v) => v,
+                Err(e) => {
+                    if let Some(e) = e.as_http_error() {
+                        if e.status() == StatusCode::NotFound {
+                            return Ok(None);
+                        }
+                    }
+
+                    return Err(ExternalError::from(anyhow!(
+                        "Azure blob get error: {:?}",
+                        e
+                    )));
+                }
+            };
+
+            // Drive all of the fetch requests concurrently.
+            let metrics = self.metrics.clone();
+            requests.push_back(fetch_chunk(response, metrics));
+        }
+
+        // Await on all of our chunks.
+        let mut segments = SegmentedBytes::with_capacity(requests.len());
+        while let Some(body) = requests.next().await {
+            let segment = body.context("azure get body err")?;
+            segments.push(segment);
+        }
+
+        Ok(Some(segments))
     }
 
     async fn list_keys_and_metadata(
@@ -343,16 +379,7 @@ impl Blob for AzureBlob {
 /// that as we read bytes off the network.
 enum PreSizedBuffer {
     Sized(MetricsRegion<u8>),
-    Unknown(Vec<u8>),
-}
-
-impl PreSizedBuffer {
-    fn extend_from_slice(&mut self, slice: &[u8]) {
-        match self {
-            PreSizedBuffer::Sized(region) => region.extend_from_slice(slice),
-            PreSizedBuffer::Unknown(buffer) => buffer.extend_from_slice(slice),
-        }
-    }
+    Unknown(SegmentedBytes),
 }
 
 #[cfg(test)]
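The other half of the change is the defensive `content-length` handling shown in the diff: pre-size the buffer only when the header reports a non-zero length, otherwise collect segments and allocate once the true size is known, and bump a metric whenever the header disagrees with the bytes actually read. Below is a simplified, self-contained sketch of that pattern, assuming plain `Vec`s and a static counter in place of the real `SegmentedBytes`, lgalloc-backed regions, and `get_invalid_resp` metric.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Hypothetical stand-in for the `get_invalid_resp` counter: how often the
/// advertised `content-length` disagreed with the bytes actually read.
static INVALID_RESPONSES: AtomicU64 = AtomicU64::new(0);

/// Simplified analogue of the diff's `PreSizedBuffer`.
enum PreSizedBuffer {
    /// `content-length` was non-zero, so we can allocate up front.
    Sized(Vec<u8>),
    /// `content-length` was 0 (treated as possibly missing); collect segments
    /// first and allocate once the true size is known.
    Unknown(Vec<Vec<u8>>),
}

fn read_body(content_length: u64, body: impl IntoIterator<Item = Vec<u8>>) -> Vec<u8> {
    let mut buffer = match content_length {
        1.. => PreSizedBuffer::Sized(Vec::with_capacity(content_length as usize)),
        0 => PreSizedBuffer::Unknown(Vec::new()),
    };

    for segment in body {
        match &mut buffer {
            PreSizedBuffer::Sized(buf) => buf.extend_from_slice(&segment),
            PreSizedBuffer::Unknown(segments) => segments.push(segment),
        }
    }

    let bytes = match buffer {
        PreSizedBuffer::Sized(buf) => buf,
        // Only now do we know how big the blob really is.
        PreSizedBuffer::Unknown(segments) => {
            let len: usize = segments.iter().map(|s| s.len()).sum();
            let mut buf = Vec::with_capacity(len);
            for segment in segments {
                buf.extend_from_slice(&segment);
            }
            buf
        }
    };

    // Record how often the header disagreed with (or omitted) the real length.
    if content_length != bytes.len() as u64 {
        INVALID_RESPONSES.fetch_add(1, Ordering::Relaxed);
    }
    bytes
}

fn main() {
    // Pretend the header claimed 16 bytes while the body really carries 15.
    let body = vec![vec![1u8; 10], vec![2u8; 5]];
    let bytes = read_body(16, body);
    assert_eq!(bytes.len(), 15);
    assert_eq!(INVALID_RESPONSES.load(Ordering::Relaxed), 1);
}
```

Treating a `content-length` of 0 as possibly missing errs on the side of caution: a genuinely empty blob simply takes the slower collect-then-copy path, while a missing or wrong header is surfaced through the counter rather than silently producing a mis-sized allocation.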
