Skip to content

Commit 6b4acd0

Browse files
don't enumerate index_uid when requesting splits to gc (#5489)
* don't enumerate index when listing splits for gc
* add tests for get splits of all indexes
* no-op minor refactoring
* Explicitly post filtering splits no-op refactoring.
* CR comments

---------

Co-authored-by: Paul Masurel <[email protected]>
1 parent a2c5f1d commit 6b4acd0

File tree

13 files changed

+392
-114
lines changed

13 files changed

+392
-114
lines changed

quickwit/quickwit-index-management/src/garbage_collection.rs

Lines changed: 42 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ use futures::{Future, StreamExt};
2727
use itertools::Itertools;
2828
use quickwit_common::metrics::IntCounter;
2929
use quickwit_common::pretty::PrettySample;
30-
use quickwit_common::Progress;
30+
use quickwit_common::{rate_limited_info, Progress};
3131
use quickwit_metastore::{
3232
ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitInfo,
3333
SplitMetadata, SplitState,
@@ -122,8 +122,8 @@ pub async fn run_garbage_collect(
122122

123123
let index_uids: Vec<IndexUid> = indexes.keys().cloned().collect();
124124

125-
let Some(list_splits_query_for_index_uids) =
126-
ListSplitsQuery::try_from_index_uids(index_uids.clone())
125+
// TODO maybe we want to do a ListSplitsQuery::for_all_indexes and post-filter ourselves here
126+
let Some(list_splits_query_for_index_uids) = ListSplitsQuery::try_from_index_uids(index_uids)
127127
else {
128128
return Ok(SplitRemovalInfo::default());
129129
};
@@ -187,7 +187,6 @@ pub async fn run_garbage_collect(
187187
OffsetDateTime::now_utc().unix_timestamp() - deletion_grace_period.as_secs() as i64;
188188

189189
Ok(delete_splits_marked_for_deletion_several_indexes(
190-
index_uids,
191190
updated_before_timestamp,
192191
metastore,
193192
indexes,
@@ -221,20 +220,15 @@ async fn delete_splits(
221220
)
222221
.await
223222
} else {
224-
error!(
225-
"we are trying to GC without knowing the storage, this shouldn't \
226-
happen"
223+
// in practice this can happen if the index was created between the start of
224+
// the run and now, and one of its splits has already expired, which likely
225+
// means a very long gc run, or if we run gc on a single index from the cli.
226+
quickwit_common::rate_limited_warn!(
227+
limit_per_min = 2,
228+
index_uid=%index_uid,
229+
"we are trying to GC without knowing the storage",
227230
);
228-
Err(DeleteSplitsError {
229-
successes: Vec::new(),
230-
storage_error: None,
231-
storage_failures: splits_metadata_to_delete
232-
.into_iter()
233-
.map(|split| split.as_split_info())
234-
.collect(),
235-
metastore_error: None,
236-
metastore_failures: Vec::new(),
237-
})
231+
Ok(Vec::new())
238232
}
239233
}
240234
})
@@ -304,11 +298,12 @@ async fn list_splits_metadata(
304298
/// Removes any splits marked for deletion which haven't been
305299
/// updated after `updated_before_timestamp` in batches of 1000 splits.
306300
///
301+
/// Only splits from index_uids in the `storages` map will be deleted.
302+
///
307303
/// The aim of this is to spread the load out across a longer period
308304
/// rather than short, heavy bursts on the metastore and storage system itself.
309-
#[instrument(skip(index_uids, storages, metastore, progress_opt, metrics), fields(num_indexes=%index_uids.len()))]
305+
#[instrument(skip(storages, metastore, progress_opt, metrics), fields(num_indexes=%storages.len()))]
310306
async fn delete_splits_marked_for_deletion_several_indexes(
311-
index_uids: Vec<IndexUid>,
312307
updated_before_timestamp: i64,
313308
metastore: MetastoreServiceClient,
314309
storages: HashMap<IndexUid, Arc<dyn Storage>>,
@@ -317,18 +312,22 @@ async fn delete_splits_marked_for_deletion_several_indexes(
317312
) -> SplitRemovalInfo {
318313
let mut split_removal_info = SplitRemovalInfo::default();
319314

320-
let Some(list_splits_query) = ListSplitsQuery::try_from_index_uids(index_uids) else {
321-
error!("failed to create list splits query. this should never happen");
322-
return split_removal_info;
323-
};
315+
// we ask for all indexes because the query is more efficient and we almost always want all
316+
// indexes anyway. The exception is when garbage collecting a single index from the commandline.
317+
// In this case, we will log a bunch of warn. i (trinity) consider it worth the more generic
318+
// code which needs fewer special case while testing, but we could check index_uids len if we
319+
// think it's a better idea.
320+
let list_splits_query = ListSplitsQuery::for_all_indexes();
324321

325322
let mut list_splits_query = list_splits_query
326323
.with_split_state(SplitState::MarkedForDeletion)
327324
.with_update_timestamp_lte(updated_before_timestamp)
328325
.with_limit(DELETE_SPLITS_BATCH_SIZE)
329326
.sort_by_index_uid();
330327

331-
loop {
328+
let mut splits_to_delete_possibly_remaining = true;
329+
330+
while splits_to_delete_possibly_remaining {
332331
let splits_metadata_to_delete: Vec<SplitMetadata> = match protect_future(
333332
progress_opt,
334333
list_splits_metadata(&metastore, &list_splits_query),
@@ -342,19 +341,32 @@ async fn delete_splits_marked_for_deletion_several_indexes(
342341
}
343342
};
344343

344+
// We page through the list of splits to delete using a limit and a `search_after` trick.
345+
// To detect if this is the last page, we check if the number of splits is less than the
346+
// limit.
347+
assert!(splits_metadata_to_delete.len() <= DELETE_SPLITS_BATCH_SIZE);
348+
splits_to_delete_possibly_remaining =
349+
splits_metadata_to_delete.len() == DELETE_SPLITS_BATCH_SIZE;
350+
345351
// set split after which to search for the next loop
346352
let Some(last_split_metadata) = splits_metadata_to_delete.last() else {
347353
break;
348354
};
349355
list_splits_query = list_splits_query.after_split(last_split_metadata);
350356

351-
let num_splits_to_delete = splits_metadata_to_delete.len();
357+
let mut splits_metadata_to_delete_per_index: HashMap<IndexUid, Vec<SplitMetadata>> =
358+
HashMap::with_capacity(storages.len());
352359

353-
let splits_metadata_to_delete_per_index: HashMap<IndexUid, Vec<SplitMetadata>> =
354-
splits_metadata_to_delete
355-
.into_iter()
356-
.map(|meta| (meta.index_uid.clone(), meta))
357-
.into_group_map();
360+
for meta in splits_metadata_to_delete {
361+
if !storages.contains_key(&meta.index_uid) {
362+
rate_limited_info!(limit_per_min=6, index_uid=?meta.index_uid, "split not listed in storage map: skipping");
363+
continue;
364+
}
365+
splits_metadata_to_delete_per_index
366+
.entry(meta.index_uid.clone())
367+
.or_default()
368+
.push(meta);
369+
}
358370

359371
// ignore return we continue either way
360372
let _: Result<(), ()> = delete_splits(
@@ -366,12 +378,6 @@ async fn delete_splits_marked_for_deletion_several_indexes(
366378
&mut split_removal_info,
367379
)
368380
.await;
369-
370-
if num_splits_to_delete < DELETE_SPLITS_BATCH_SIZE {
371-
// stop the gc if this was the last batch
372-
// we are guaranteed to make progress due to .after_split()
373-
break;
374-
}
375381
}
376382

377383
split_removal_info

quickwit/quickwit-indexing/src/actors/indexing_service.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1764,14 +1764,14 @@ mod tests {
17641764
.expect_list_splits()
17651765
.withf(|request| {
17661766
let list_splits_query = request.deserialize_list_splits_query().unwrap();
1767-
list_splits_query.index_uids == [("test-index-0", 0)]
1767+
list_splits_query.index_uids.unwrap() == [("test-index-0", 0)]
17681768
})
17691769
.return_once(|_request| Ok(ServiceStream::empty()));
17701770
mock_metastore
17711771
.expect_list_splits()
17721772
.withf(|request| {
17731773
let list_splits_query = request.deserialize_list_splits_query().unwrap();
1774-
list_splits_query.index_uids == [("test-index-1", 0), ("test-index-2", 0)]
1774+
list_splits_query.index_uids.unwrap() == [("test-index-1", 0), ("test-index-2", 0)]
17751775
})
17761776
.return_once(|_request| {
17771777
let splits = vec![Split {

quickwit/quickwit-indexing/src/actors/merge_pipeline.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -613,7 +613,7 @@ mod tests {
613613
.times(1)
614614
.withf(move |list_splits_request| {
615615
let list_split_query = list_splits_request.deserialize_list_splits_query().unwrap();
616-
assert_eq!(list_split_query.index_uids, &[index_uid.clone()]);
616+
assert_eq!(list_split_query.index_uids, Some(vec![index_uid.clone()]));
617617
assert_eq!(
618618
list_split_query.split_states,
619619
vec![quickwit_metastore::SplitState::Published]

quickwit/quickwit-janitor/src/actors/garbage_collector.rs

Lines changed: 29 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -302,10 +302,13 @@ mod tests {
302302
.times(2)
303303
.returning(move |list_splits_request| {
304304
let query = list_splits_request.deserialize_list_splits_query().unwrap();
305-
assert_eq!(query.index_uids[0], index_uid_clone,);
306305
let splits = match query.split_states[0] {
307-
SplitState::Staged => make_splits("test-index", &["a"], SplitState::Staged),
306+
SplitState::Staged => {
307+
assert_eq!(query.index_uids.unwrap()[0], index_uid_clone);
308+
make_splits("test-index", &["a"], SplitState::Staged)
309+
}
308310
SplitState::MarkedForDeletion => {
311+
assert!(query.index_uids.is_none());
309312
let expected_deletion_timestamp = OffsetDateTime::now_utc()
310313
.unix_timestamp()
311314
- split_deletion_grace_period().as_secs() as i64;
@@ -394,14 +397,19 @@ mod tests {
394397
.times(2)
395398
.returning(|list_splits_request| {
396399
let query = list_splits_request.deserialize_list_splits_query().unwrap();
397-
assert_eq!(&query.index_uids[0].index_id, "test-index");
398400
let splits = match query.split_states[0] {
399-
SplitState::Staged => make_splits("test-index", &["a"], SplitState::Staged),
400-
SplitState::MarkedForDeletion => make_splits(
401-
"test-index",
402-
&["a", "b", "c"],
403-
SplitState::MarkedForDeletion,
404-
),
401+
SplitState::Staged => {
402+
assert_eq!(&query.index_uids.unwrap()[0].index_id, "test-index");
403+
make_splits("test-index", &["a"], SplitState::Staged)
404+
}
405+
SplitState::MarkedForDeletion => {
406+
assert!(query.index_uids.is_none());
407+
make_splits(
408+
"test-index",
409+
&["a", "b", "c"],
410+
SplitState::MarkedForDeletion,
411+
)
412+
}
405413
_ => panic!("only Staged and MarkedForDeletion expected."),
406414
};
407415
let splits = ListSplitsResponse::try_from_splits(splits).unwrap();
@@ -469,10 +477,13 @@ mod tests {
469477
.times(6)
470478
.returning(|list_splits_request| {
471479
let query = list_splits_request.deserialize_list_splits_query().unwrap();
472-
assert_eq!(&query.index_uids[0].index_id, "test-index");
473480
let splits = match query.split_states[0] {
474-
SplitState::Staged => make_splits("test-index", &["a"], SplitState::Staged),
481+
SplitState::Staged => {
482+
assert_eq!(&query.index_uids.unwrap()[0].index_id, "test-index");
483+
make_splits("test-index", &["a"], SplitState::Staged)
484+
}
475485
SplitState::MarkedForDeletion => {
486+
assert!(&query.index_uids.is_none());
476487
make_splits("test-index", &["a", "b"], SplitState::MarkedForDeletion)
477488
}
478489
_ => panic!("only Staged and MarkedForDeletion expected."),
@@ -633,11 +644,6 @@ mod tests {
633644
.times(3)
634645
.returning(|list_splits_request| {
635646
let query = list_splits_request.deserialize_list_splits_query().unwrap();
636-
assert_eq!(query.index_uids.len(), 2);
637-
assert!(["test-index-1", "test-index-2"]
638-
.contains(&query.index_uids[0].index_id.as_ref()));
639-
assert!(["test-index-1", "test-index-2"]
640-
.contains(&query.index_uids[1].index_id.as_ref()));
641647
let splits_ids_string: Vec<String> =
642648
(0..8000).map(|seq| format!("split-{seq:04}")).collect();
643649
let splits_ids: Vec<&str> = splits_ids_string
@@ -646,11 +652,18 @@ mod tests {
646652
.collect();
647653
let mut splits = match query.split_states[0] {
648654
SplitState::Staged => {
655+
let index_uids = query.index_uids.unwrap();
656+
assert_eq!(index_uids.len(), 2);
657+
assert!(["test-index-1", "test-index-2"]
658+
.contains(&index_uids[0].index_id.as_ref()));
659+
assert!(["test-index-1", "test-index-2"]
660+
.contains(&index_uids[1].index_id.as_ref()));
649661
let mut splits = make_splits("test-index-1", &["a"], SplitState::Staged);
650662
splits.append(&mut make_splits("test-index-2", &["a"], SplitState::Staged));
651663
splits
652664
}
653665
SplitState::MarkedForDeletion => {
666+
assert!(query.index_uids.is_none());
654667
assert_eq!(query.limit, Some(10_000));
655668
let mut splits =
656669
make_splits("test-index-1", &splits_ids, SplitState::MarkedForDeletion);

quickwit/quickwit-janitor/src/actors/retention_policy_executor.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -459,7 +459,7 @@ mod tests {
459459
.returning(|list_splits_request| {
460460
let query = list_splits_request.deserialize_list_splits_query().unwrap();
461461
assert_eq!(query.split_states, &[SplitState::Published]);
462-
let splits = match query.index_uids[0].index_id.as_ref() {
462+
let splits = match query.index_uids.unwrap()[0].index_id.as_ref() {
463463
"index-1" => {
464464
vec![
465465
make_split("split-1", Some(1000..=5000)),

0 commit comments

Comments
 (0)