14
14
15
15
use std:: collections:: BTreeMap ;
16
16
use std:: collections:: HashMap ;
17
+ use std:: collections:: HashSet ;
17
18
use std:: mem;
18
19
use std:: sync:: Arc ;
19
20
20
21
use chrono:: Utc ;
22
+ use databend_common_catalog:: table:: Table ;
21
23
use databend_common_catalog:: table_context:: TableContext ;
22
24
use databend_common_exception:: Result ;
25
+ use databend_common_expression:: types:: DataType ;
23
26
use databend_common_expression:: Column ;
24
27
use databend_common_expression:: ColumnId ;
28
+ use databend_common_expression:: ComputedExpr ;
25
29
use databend_common_expression:: DataBlock ;
26
30
use databend_common_expression:: FieldIndex ;
27
31
use databend_common_expression:: TableField ;
28
32
use databend_common_expression:: TableSchema ;
29
33
use databend_common_expression:: TableSchemaRef ;
34
+ use databend_common_expression:: Value ;
35
+ use databend_common_expression:: ORIGIN_BLOCK_ROW_NUM_COLUMN_ID ;
30
36
use databend_common_io:: constants:: DEFAULT_BLOCK_BUFFER_SIZE ;
31
37
use databend_common_native:: write:: NativeWriter ;
38
+ use databend_storages_common_index:: BloomIndex ;
32
39
use databend_storages_common_index:: BloomIndexBuilder ;
40
+ use databend_storages_common_index:: Index ;
41
+ use databend_storages_common_index:: RangeIndex ;
33
42
use databend_storages_common_table_meta:: meta:: BlockMeta ;
34
43
use databend_storages_common_table_meta:: meta:: ColumnMeta ;
35
44
use databend_storages_common_table_meta:: meta:: TableMetaTimestamps ;
@@ -39,6 +48,7 @@ use parquet::basic::Encoding;
39
48
use parquet:: file:: properties:: EnabledStatistics ;
40
49
use parquet:: file:: properties:: WriterProperties ;
41
50
51
+ use crate :: io:: create_inverted_index_builders;
42
52
use crate :: io:: write:: stream:: cluster_statistics:: ClusterStatisticsBuilder ;
43
53
use crate :: io:: write:: stream:: cluster_statistics:: ClusterStatisticsState ;
44
54
use crate :: io:: write:: stream:: column_statistics:: ColumnStatisticsState ;
@@ -51,6 +61,7 @@ use crate::io::TableMetaLocationGenerator;
51
61
use crate :: io:: WriteSettings ;
52
62
use crate :: operations:: column_parquet_metas;
53
63
use crate :: FuseStorageFormat ;
64
+ use crate :: FuseTable ;
54
65
55
66
pub enum BlockWriterImpl {
56
67
Arrow ( ArrowWriter < Vec < u8 > > ) ,
@@ -215,7 +226,7 @@ impl StreamBlockBuilder {
215
226
} )
216
227
}
217
228
218
- pub fn write ( & mut self , block : DataBlock , schema : & TableSchemaRef ) -> Result < ( ) > {
229
+ pub fn write ( & mut self , block : DataBlock ) -> Result < ( ) > {
219
230
if block. is_empty ( ) {
220
231
return Ok ( ( ) ) ;
221
232
}
@@ -225,15 +236,17 @@ impl StreamBlockBuilder {
225
236
}
226
237
227
238
let block = self . cluster_stats_state . add_block ( block) ?;
228
- self . column_stats_state . add_block ( schema, & block) ?;
239
+ self . column_stats_state
240
+ . add_block ( & self . properties . source_schema , & block) ?;
229
241
self . bloom_index_builder . add_block ( & block) ?;
230
242
for writer in self . inverted_index_writers . iter_mut ( ) {
231
- writer. add_block ( schema , & block) ?;
243
+ writer. add_block ( & self . properties . source_schema , & block) ?;
232
244
}
233
245
234
246
self . row_count += block. num_rows ( ) ;
235
247
self . block_size += block. estimate_block_size ( ) ;
236
- self . block_writer . write ( block, schema) ?;
248
+ self . block_writer
249
+ . write ( block, & self . properties . source_schema ) ?;
237
250
Ok ( ( ) )
238
251
}
239
252
@@ -325,3 +338,71 @@ pub struct StreamBlockProperties {
325
338
inverted_index_builders : Vec < InvertedIndexBuilder > ,
326
339
table_meta_timestamps : TableMetaTimestamps ,
327
340
}
341
+
342
+ impl StreamBlockProperties {
343
+ pub fn try_create (
344
+ ctx : Arc < dyn TableContext > ,
345
+ table : & FuseTable ,
346
+ table_meta_timestamps : TableMetaTimestamps ,
347
+ do_append : bool ,
348
+ ) -> Result < Arc < Self > > {
349
+ // remove virtual computed fields.
350
+ let mut fields = table
351
+ . schema ( )
352
+ . fields ( )
353
+ . iter ( )
354
+ . filter ( |f| !matches ! ( f. computed_expr( ) , Some ( ComputedExpr :: Virtual ( _) ) ) )
355
+ . cloned ( )
356
+ . collect :: < Vec < _ > > ( ) ;
357
+ if !do_append {
358
+ // add stream fields.
359
+ for stream_column in table. stream_columns ( ) . iter ( ) {
360
+ fields. push ( stream_column. table_field ( ) ) ;
361
+ }
362
+ }
363
+ let source_schema = Arc :: new ( TableSchema {
364
+ fields,
365
+ ..table. schema ( ) . as_ref ( ) . clone ( )
366
+ } ) ;
367
+
368
+ let bloom_columns_map = table
369
+ . bloom_index_cols
370
+ . bloom_index_fields ( source_schema. clone ( ) , BloomIndex :: supported_type) ?;
371
+ let bloom_column_ids = bloom_columns_map
372
+ . values ( )
373
+ . map ( |v| v. column_id ( ) )
374
+ . collect :: < HashSet < _ > > ( ) ;
375
+
376
+ let inverted_index_builders = create_inverted_index_builders ( & table. table_info . meta ) ;
377
+
378
+ let cluster_stats_builder = ClusterStatisticsBuilder :: try_create ( table, ctx. clone ( ) ) ?;
379
+
380
+ let mut stats_columns = vec ! [ ] ;
381
+ let mut distinct_columns = vec ! [ ] ;
382
+ let leaf_fields = source_schema. leaf_fields ( ) ;
383
+ for field in leaf_fields. iter ( ) {
384
+ let column_id = field. column_id ( ) ;
385
+ if RangeIndex :: supported_type ( & DataType :: from ( field. data_type ( ) ) )
386
+ && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID
387
+ {
388
+ stats_columns. push ( column_id) ;
389
+ if !bloom_column_ids. contains ( & column_id) {
390
+ distinct_columns. push ( column_id) ;
391
+ }
392
+ }
393
+ }
394
+
395
+ Ok ( Arc :: new ( StreamBlockProperties {
396
+ ctx,
397
+ meta_locations : table. meta_location_generator ( ) . clone ( ) ,
398
+ source_schema,
399
+ write_settings : table. get_write_settings ( ) ,
400
+ cluster_stats_builder,
401
+ stats_columns,
402
+ distinct_columns,
403
+ bloom_columns_map,
404
+ inverted_index_builders,
405
+ table_meta_timestamps,
406
+ } ) )
407
+ }
408
+ }
0 commit comments