@@ -370,17 +370,19 @@ impl ArrowReader {
370
370
/// as having been deleted by a positional delete, taking into account any row groups that have
371
371
/// been skipped entirely by the filter predicate
372
372
fn build_deletes_row_selection (
373
- row_group_metadata : & [ RowGroupMetaData ] ,
373
+ row_group_metadata_list : & [ RowGroupMetaData ] ,
374
374
selected_row_groups : & Option < Vec < usize > > ,
375
- mut positional_deletes : & DeleteVector ,
375
+ positional_deletes : & DeleteVector ,
376
376
) -> Result < RowSelection > {
377
377
let mut results: Vec < RowSelector > = Vec :: new ( ) ;
378
378
let mut selected_row_groups_idx = 0 ;
379
- let mut current_page_base_idx: u64 = 0 ;
379
+ let mut current_row_group_base_idx: u64 = 0 ;
380
+ let mut delete_vector_iter = positional_deletes. iter ( ) ;
381
+ let mut next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
380
382
381
- for ( idx, row_group_metadata) in row_group_metadata . iter ( ) . enumerate ( ) {
382
- let page_num_rows = row_group_metadata. num_rows ( ) as u64 ;
383
- let next_page_base_idx = current_page_base_idx + page_num_rows ;
383
+ for ( idx, row_group_metadata) in row_group_metadata_list . iter ( ) . enumerate ( ) {
384
+ let row_group_num_rows = row_group_metadata. num_rows ( ) as u64 ;
385
+ let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows ;
384
386
385
387
// if row group selection is enabled,
386
388
if let Some ( selected_row_groups) = selected_row_groups {
@@ -397,36 +399,37 @@ impl ArrowReader {
397
399
} else {
398
400
// advance the positional-delete iterator to the end of the skipped row group
399
401
// and fetch the next pending delete index from it
400
- positional_deletes. remove_range ( current_page_base_idx..next_page_base_idx) ;
402
+ delete_vector_iter. advance_to ( next_row_group_base_idx) ;
403
+ next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
401
404
402
405
// still increment the current row group base index but then skip to the next row group
403
406
// in the file
404
- current_page_base_idx += page_num_rows ;
407
+ current_row_group_base_idx += row_group_num_rows ;
405
408
continue ;
406
409
}
407
410
}
408
411
409
- let mut next_deleted_row_idx = match positional_deletes . min ( ) {
412
+ let mut next_deleted_row_idx = match next_deleted_row_idx_opt {
410
413
Some ( next_deleted_row_idx) => {
411
- // if the index of the next deleted row is beyond this page , add a selection for
412
- // the remainder of this page and skip to the next page
413
- if next_deleted_row_idx >= next_page_base_idx {
414
- results. push ( RowSelector :: select ( page_num_rows as usize ) ) ;
414
+ // if the index of the next deleted row is beyond this row group , add a selection for
415
+ // the remainder of this row group and skip to the next row group
416
+ if next_deleted_row_idx >= next_row_group_base_idx {
417
+ results. push ( RowSelector :: select ( row_group_num_rows as usize ) ) ;
415
418
continue ;
416
419
}
417
420
418
421
next_deleted_row_idx
419
422
}
420
423
421
- // If there are no more pos deletes, add a selector for the entirety of this page .
424
+ // If there are no more pos deletes, add a selector for the entirety of this row group .
422
425
_ => {
423
- results. push ( RowSelector :: select ( page_num_rows as usize ) ) ;
426
+ results. push ( RowSelector :: select ( row_group_num_rows as usize ) ) ;
424
427
continue ;
425
428
}
426
429
} ;
427
430
428
- let mut current_idx = current_page_base_idx ;
429
- ' chunks: while next_deleted_row_idx < next_page_base_idx {
431
+ let mut current_idx = current_row_group_base_idx ;
432
+ ' chunks: while next_deleted_row_idx < next_row_group_base_idx {
430
433
// `select` all rows that precede the next delete index
431
434
if current_idx < next_deleted_row_idx {
432
435
let run_length = next_deleted_row_idx - current_idx;
@@ -437,18 +440,18 @@ impl ArrowReader {
437
440
// `skip` all consecutive deleted rows in the current row group
438
441
let mut run_length = 0 ;
439
442
while next_deleted_row_idx == current_idx
440
- && next_deleted_row_idx < next_page_base_idx
443
+ && next_deleted_row_idx < next_row_group_base_idx
441
444
{
442
445
run_length += 1 ;
443
446
current_idx += 1 ;
444
- positional_deletes. remove ( next_deleted_row_idx) ;
445
447
446
- next_deleted_row_idx = match positional_deletes. min ( ) {
448
+ next_deleted_row_idx_opt = delete_vector_iter. next ( ) ;
449
+ next_deleted_row_idx = match next_deleted_row_idx_opt {
447
450
Some ( next_deleted_row_idx) => next_deleted_row_idx,
448
451
_ => {
449
452
// We've processed the final positional delete.
450
453
// Conclude the skip and then break so that we select the remaining
451
- // rows in the page and move on to the next row group
454
+ // rows in the row group and move on to the next row group
452
455
results. push ( RowSelector :: skip ( run_length) ) ;
453
456
break ' chunks;
454
457
}
@@ -457,13 +460,13 @@ impl ArrowReader {
457
460
results. push ( RowSelector :: skip ( run_length) ) ;
458
461
}
459
462
460
- if current_idx < next_page_base_idx {
463
+ if current_idx < next_row_group_base_idx {
461
464
results. push ( RowSelector :: select (
462
- ( next_page_base_idx - current_idx) as usize ,
465
+ ( next_row_group_base_idx - current_idx) as usize ,
463
466
) ) ;
464
467
}
465
468
466
- current_page_base_idx += page_num_rows ;
469
+ current_row_group_base_idx += row_group_num_rows ;
467
470
}
468
471
469
472
Ok ( results. into ( ) )
@@ -1400,18 +1403,19 @@ mod tests {
1400
1403
use arrow_array:: { ArrayRef , RecordBatch , StringArray } ;
1401
1404
use arrow_schema:: { DataType , Field , Schema as ArrowSchema , TimeUnit } ;
1402
1405
use futures:: TryStreamExt ;
1406
+ use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
1403
1407
use parquet:: arrow:: { ArrowWriter , ProjectionMask } ;
1404
1408
use parquet:: basic:: Compression ;
1405
- use parquet:: file:: properties:: WriterProperties ;
1406
- use parquet:: arrow:: arrow_reader:: { RowSelection , RowSelector } ;
1407
1409
use parquet:: file:: metadata:: { ColumnChunkMetaData , RowGroupMetaData } ;
1410
+ use parquet:: file:: properties:: WriterProperties ;
1408
1411
use parquet:: schema:: parser:: parse_message_type;
1409
- use tempfile:: TempDir ;
1410
1412
use parquet:: schema:: types:: { SchemaDescPtr , SchemaDescriptor } ;
1411
1413
use roaring:: RoaringTreemap ;
1414
+ use tempfile:: TempDir ;
1412
1415
1413
1416
use crate :: arrow:: reader:: { CollectFieldIdVisitor , PARQUET_FIELD_ID_META_KEY } ;
1414
1417
use crate :: arrow:: { ArrowReader , ArrowReaderBuilder } ;
1418
+ use crate :: delete_vector:: DeleteVector ;
1415
1419
use crate :: expr:: visitors:: bound_predicate_visitor:: visit;
1416
1420
use crate :: expr:: { Bind , Predicate , Reference } ;
1417
1421
use crate :: io:: FileIO ;
@@ -1758,16 +1762,14 @@ message schema {
1758
1762
2999 , // single item at end of selected rg3 (1)
1759
1763
3000 , // single item at start of skipped rg4
1760
1764
] ) ;
1761
-
1762
- let positional_deletes = DeleteVector {
1763
- inner : positional_deletes
1764
- } ;
1765
+
1766
+ let positional_deletes = DeleteVector :: new ( positional_deletes) ;
1765
1767
1766
1768
// using selected row groups 1 and 3
1767
1769
let result = ArrowReader :: build_deletes_row_selection (
1768
1770
& row_groups_metadata,
1769
1771
& selected_row_groups,
1770
- positional_deletes. clone ( ) ,
1772
+ & positional_deletes,
1771
1773
)
1772
1774
. unwrap ( ) ;
1773
1775
@@ -1791,7 +1793,7 @@ message schema {
1791
1793
let result = ArrowReader :: build_deletes_row_selection (
1792
1794
& row_groups_metadata,
1793
1795
& None ,
1794
- positional_deletes,
1796
+ & positional_deletes,
1795
1797
)
1796
1798
. unwrap ( ) ;
1797
1799
0 commit comments