Skip to content

Commit 22ef190

Browse files
committed
feat: global delete support, DeleteFileIndex
1 parent a3b34b2 commit 22ef190

File tree

2 files changed

+88
-80
lines changed

2 files changed

+88
-80
lines changed

crates/iceberg/src/delete_file_index.rs

+77-79
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use futures::StreamExt;
2828
use crate::runtime::spawn;
2929
use crate::scan::{DeleteFileContext, FileScanTaskDeleteFile};
3030
use crate::spec::{DataContentType, DataFile, Struct};
31+
use crate::{Error, ErrorKind, Result};
3132

3233
/// Index of delete files
3334
#[derive(Clone, Debug)]
@@ -47,7 +48,10 @@ struct PopulatedDeleteFileIndex {
4748
global_deletes: Vec<Arc<DeleteFileContext>>,
4849
eq_deletes_by_partition: HashMap<Struct, Vec<Arc<DeleteFileContext>>>,
4950
pos_deletes_by_partition: HashMap<Struct, Vec<Arc<DeleteFileContext>>>,
50-
pos_deletes_by_path: HashMap<String, Vec<Arc<DeleteFileContext>>>,
51+
// TODO: do we need this?
52+
// pos_deletes_by_path: HashMap<String, Vec<Arc<DeleteFileContext>>>,
53+
54+
// TODO: Deletion Vector support
5155
}
5256

5357
impl DeleteFileIndex {
@@ -75,7 +79,7 @@ impl DeleteFileIndex {
7579

7680
/// Gets all the delete files that apply to the specified data file.
7781
///
78-
/// Returns a future that resolves to a Vec<FileScanTaskDeleteFile>
82+
/// Returns a future that resolves to a Result<Vec<FileScanTaskDeleteFile>>
7983
pub(crate) fn get_deletes_for_data_file<'a>(
8084
&self,
8185
data_file: &'a DataFile,
@@ -95,60 +99,41 @@ impl PopulatedDeleteFileIndex {
9599
HashMap::default();
96100
let mut pos_deletes_by_partition: HashMap<Struct, Vec<Arc<DeleteFileContext>>> =
97101
HashMap::default();
98-
let mut pos_deletes_by_path: HashMap<String, Vec<Arc<DeleteFileContext>>> =
99-
HashMap::default();
100102

101-
files.into_iter().for_each(|del_file_ctx| {
102-
let arc_del_file_ctx = Arc::new(del_file_ctx);
103-
match arc_del_file_ctx.manifest_entry.content_type() {
104-
DataContentType::PositionDeletes => {
105-
// TODO: implement logic from ContentFileUtil.referencedDataFile
106-
// see https://github.com/apache/iceberg/blob/cdf748e8e5537f13d861aa4c617a51f3e11dc97c/core/src/main/java/org/apache/iceberg/util/ContentFileUtil.java#L54
107-
let referenced_data_file_path = "TODO".to_string();
108-
109-
pos_deletes_by_path
110-
.entry(referenced_data_file_path)
111-
.and_modify(|entry| {
112-
entry.push(arc_del_file_ctx.clone());
113-
})
114-
.or_insert(vec![arc_del_file_ctx.clone()]);
115-
116-
pos_deletes_by_partition
117-
.entry(
118-
arc_del_file_ctx
119-
.manifest_entry
120-
.data_file()
121-
.partition()
122-
.clone(),
123-
)
124-
.and_modify(|entry| {
125-
entry.push(arc_del_file_ctx.clone());
126-
})
127-
.or_insert(vec![arc_del_file_ctx.clone()]);
128-
}
129-
DataContentType::EqualityDeletes => {
130-
eq_deletes_by_partition
131-
.entry(
132-
arc_del_file_ctx
133-
.manifest_entry
134-
.data_file()
135-
.partition()
136-
.clone(),
137-
)
138-
.and_modify(|entry| {
139-
entry.push(arc_del_file_ctx.clone());
140-
})
141-
.or_insert(vec![arc_del_file_ctx.clone()]);
103+
let mut global_deletes: Vec<Arc<DeleteFileContext>> = vec![];
104+
105+
files.into_iter().for_each(|ctx| {
106+
let arc_ctx = Arc::new(ctx);
107+
108+
let partition = arc_ctx.manifest_entry.data_file().partition();
109+
110+
// The spec states that "Equality delete files stored with an unpartitioned spec are applied as global deletes".
111+
if partition.fields().is_empty() {
112+
// TODO: confirm we're good to skip here if we encounter a pos del
113+
if arc_ctx.manifest_entry.content_type() != DataContentType::PositionDeletes {
114+
global_deletes.push(arc_ctx);
115+
return;
142116
}
143-
_ => unreachable!(),
144117
}
118+
119+
let destination_map = match arc_ctx.manifest_entry.content_type() {
120+
DataContentType::PositionDeletes => &mut pos_deletes_by_partition,
121+
DataContentType::EqualityDeletes => &mut eq_deletes_by_partition,
122+
_ => unreachable!(),
123+
};
124+
125+
destination_map
126+
.entry(partition.clone())
127+
.and_modify(|entry| {
128+
entry.push(arc_ctx.clone());
129+
})
130+
.or_insert(vec![arc_ctx.clone()]);
145131
});
146132

147133
PopulatedDeleteFileIndex {
148-
global_deletes: vec![],
134+
global_deletes,
149135
eq_deletes_by_partition,
150136
pos_deletes_by_partition,
151-
pos_deletes_by_path,
152137
}
153138
}
154139

@@ -158,33 +143,47 @@ impl PopulatedDeleteFileIndex {
158143
data_file: &DataFile,
159144
seq_num: Option<i64>,
160145
) -> Vec<FileScanTaskDeleteFile> {
161-
let mut deletes_queue = vec![];
162-
163-
if let Some(deletes) = self.pos_deletes_by_path.get(data_file.file_path()) {
164-
deletes_queue.extend(deletes.iter());
165-
}
166-
167-
if let Some(deletes) = self.pos_deletes_by_partition.get(data_file.partition()) {
168-
deletes_queue.extend(deletes.iter());
169-
}
146+
let mut results = vec![];
170147

171-
if let Some(deletes) = self.eq_deletes_by_partition.get(data_file.partition()) {
172-
deletes_queue.extend(deletes.iter());
173-
}
174-
175-
deletes_queue
148+
self.global_deletes
176149
.iter()
150+
// filter that returns true if the provided delete file's sequence number is **greater than or equal to** `seq_num`
177151
.filter(|&delete| {
178152
seq_num
179-
.map(|seq_num| delete.manifest_entry.sequence_number() > Some(seq_num))
153+
.map(|seq_num| delete.manifest_entry.sequence_number() >= Some(seq_num))
180154
.unwrap_or_else(|| true)
181155
})
182-
.map(|delete| FileScanTaskDeleteFile {
183-
file_path: delete.manifest_entry.file_path().to_string(),
184-
file_type: delete.manifest_entry.content_type(),
185-
partition_spec_id: delete.partition_spec_id,
186-
})
187-
.collect()
156+
.for_each(|delete| results.push(delete.as_ref().into()));
157+
158+
if let Some(deletes) = self.eq_deletes_by_partition.get(data_file.partition()) {
159+
deletes
160+
.iter()
161+
// filter that returns true if the provided delete file's sequence number is **greater than or equal to** `seq_num`
162+
.filter(|&delete| {
163+
seq_num
164+
.map(|seq_num| delete.manifest_entry.sequence_number() >= Some(seq_num))
165+
.unwrap_or_else(|| true)
166+
})
167+
.for_each(|delete| results.push(delete.as_ref().into()));
168+
}
169+
170+
// TODO: the spec states that:
171+
// "The data file's file_path is equal to the delete file's referenced_data_file if it is non-null".
172+
// we're not yet doing that here. The referenced data file's name will also be present in the positional
173+
// delete file's file path column.
174+
if let Some(deletes) = self.pos_deletes_by_partition.get(data_file.partition()) {
175+
deletes
176+
.iter()
177+
// filter that returns true if the provided delete file's sequence number is **greater than** `seq_num`
178+
.filter(|&delete| {
179+
seq_num
180+
.map(|seq_num| delete.manifest_entry.sequence_number() > Some(seq_num))
181+
.unwrap_or_else(|| true)
182+
})
183+
.for_each(|delete| results.push(delete.as_ref().into()));
184+
}
185+
186+
results
188187
}
189188
}
190189

@@ -196,18 +195,17 @@ pub(crate) struct DeletesForDataFile<'a> {
196195
}
197196

198197
impl Future for DeletesForDataFile<'_> {
199-
type Output = Vec<FileScanTaskDeleteFile>;
198+
type Output = Result<Vec<FileScanTaskDeleteFile>>;
200199

201200
fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Self::Output> {
202-
let Ok(guard) = self.state.try_read() else {
203-
return Poll::Pending;
204-
};
205-
206-
match guard.deref() {
207-
DeleteFileIndexState::Populated(idx) => {
208-
Poll::Ready(idx.get_deletes_for_data_file(self.data_file, self.seq_num))
209-
}
210-
_ => Poll::Pending,
201+
match self.state.try_read() {
202+
Ok(guard) => match guard.deref() {
203+
DeleteFileIndexState::Populated(idx) => Poll::Ready(Ok(
204+
idx.get_deletes_for_data_file(self.data_file, self.seq_num)
205+
)),
206+
_ => Poll::Pending,
207+
},
208+
Err(err) => Poll::Ready(Err(Error::new(ErrorKind::Unexpected, err.to_string()))),
211209
}
212210
}
213211
}

crates/iceberg/src/scan.rs

+11-1
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ impl ManifestEntryContext {
684684
self.manifest_entry.data_file(),
685685
self.manifest_entry.sequence_number(),
686686
)
687-
.await
687+
.await?
688688
} else {
689689
vec![]
690690
};
@@ -1088,6 +1088,16 @@ pub(crate) struct DeleteFileContext {
10881088
pub(crate) partition_spec_id: i32,
10891089
}
10901090

1091+
impl From<&DeleteFileContext> for FileScanTaskDeleteFile {
1092+
fn from(ctx: &DeleteFileContext) -> Self {
1093+
FileScanTaskDeleteFile {
1094+
file_path: ctx.manifest_entry.file_path().to_string(),
1095+
file_type: ctx.manifest_entry.content_type(),
1096+
partition_spec_id: ctx.partition_spec_id,
1097+
}
1098+
}
1099+
}
1100+
10911101
impl FileScanTask {
10921102
/// Returns the data file path of this file scan task.
10931103
pub fn data_file_path(&self) -> &str {

0 commit comments

Comments
 (0)