@@ -328,11 +328,44 @@ impl DeleteFileManager {
     ///
     /// Returns a map of data file path to a delete vector
     async fn parse_positional_deletes_record_batch_stream(
-        stream: ArrowRecordBatchStream,
+        mut stream: ArrowRecordBatchStream,
     ) -> Result<HashMap<String, RoaringTreemap>> {
-        // TODO
+        let mut result: HashMap<String, RoaringTreemap> = HashMap::default();
+
+        while let Some(batch) = stream.next().await {
+            let batch = batch?;
+            let schema = batch.schema();
+            let columns = batch.columns();
+
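+            // Per the Iceberg spec, a positional delete file carries a
+            // file_path (string) column followed by a pos (long) column.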
+            let Some(file_paths) = columns[0].as_any().downcast_ref::<StringArray>() else {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "Could not downcast file paths array to StringArray",
+                ));
+            };
+            let Some(positions) = columns[1].as_any().downcast_ref::<Int64Array>() else {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "Could not downcast positions array to Int64Array",
+                ));
+            };
+
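+            // Both columns are required, so a null in either one means the
+            // delete file is malformed.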
+            for (file_path, pos) in file_paths.iter().zip(positions.iter()) {
+                let (Some(file_path), Some(pos)) = (file_path, pos) else {
+                    return Err(Error::new(
+                        ErrorKind::DataInvalid,
+                        "null values in delete file",
+                    ));
+                };
+
+                result
+                    .entry(file_path.to_string())
+                    .or_default()
+                    .insert(pos as u64);
+            }
+        }

-        Ok(HashMap::default())
+        Ok(result)
     }

     /// Parses record batch streams from individual equality delete files
@@ -452,27 +485,67 @@ mod tests {
             .load_deletes(&file_scan_tasks[0].deletes, file_io, 5)
             .await
             .unwrap();
+
+        let result = delete_file_manager
+            .get_delete_vector_for_task(&file_scan_tasks[0])
+            .unwrap();
+        assert_eq!(result.len(), 3); // pos dels from pos del file 1 and 2
+
+        let result = delete_file_manager
+            .get_delete_vector_for_task(&file_scan_tasks[1])
+            .unwrap();
+        assert_eq!(result.len(), 3); // pos dels from pos del file 3
     }

     fn setup(table_location: &Path) -> Vec<FileScanTask> {
         let data_file_schema = Arc::new(Schema::builder().build().unwrap());
         let positional_delete_schema = create_pos_del_schema();

-        let file_path_values = vec![format!("{}/1.parquet", table_location.to_str().unwrap()); 8];
-        let pos_values = vec![0, 1, 3, 5, 6, 8, 1022, 1023];
-
-        let file_path_col = Arc::new(StringArray::from_iter_values(file_path_values));
-        let pos_col = Arc::new(Int64Array::from_iter_values(pos_values));
+        let mut file_path_values = vec![];
+        let mut pos_values = vec![];
+
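+        // Build three positional-delete batches: two targeting 1.parquet
+        // and one targeting 2.parquet.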
+        file_path_values.push(vec![
+            format!("{}/1.parquet", table_location.to_str().unwrap());
+            3
+        ]);
+        pos_values.push(vec![0, 1, 3]);
+
+        file_path_values.push(vec![
+            format!("{}/1.parquet", table_location.to_str().unwrap());
+            3
+        ]);
+        pos_values.push(vec![5, 6, 8]);
+
+        file_path_values.push(vec![
+            format!("{}/2.parquet", table_location.to_str().unwrap());
+            3
+        ]);
+        pos_values.push(vec![1022, 1023, 1024]);
+        // 9 rows in total pos deleted across 3 files

         let props = WriterProperties::builder()
             .set_compression(Compression::SNAPPY)
             .build();

         for n in 1..=3 {
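+            // pop() hands back the most recently pushed batch first, so the
+            // batches are written out in reverse of the order built above.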
+            let file_path_col = Arc::new(StringArray::from_iter_values(
+                file_path_values.pop().unwrap(),
+            ));
+            let pos_col = Arc::new(Int64Array::from_iter_values(pos_values.pop().unwrap()));
+
             let positional_deletes_to_write =
                 RecordBatch::try_new(positional_delete_schema.clone(), vec![
                     file_path_col.clone(),
-                    pos_col.clone(),
+                    pos_col,
                 ])
                 .unwrap();
@@ -520,7 +593,7 @@ mod tests {
                 start: 0,
                 length: 0,
                 record_count: None,
-                data_file_path: "".to_string(),
+                data_file_path: format!("{}/1.parquet", table_location.to_str().unwrap()),
                 data_file_content: DataContentType::Data,
                 data_file_format: DataFileFormat::Parquet,
                 schema: data_file_schema.clone(),
@@ -532,13 +605,13 @@ mod tests {
                 start: 0,
                 length: 0,
                 record_count: None,
-                data_file_path: "".to_string(),
+                data_file_path: format!("{}/2.parquet", table_location.to_str().unwrap()),
                 data_file_content: DataContentType::Data,
                 data_file_format: DataFileFormat::Parquet,
                 schema: data_file_schema.clone(),
                 project_field_ids: vec![],
                 predicate: None,
-                deletes: vec![pos_del_2, pos_del_3],
+                deletes: vec![pos_del_3],
             },
         ];