Skip to content

Commit a0e48a0

Browse files
committed
fix
1 parent 1e8045e commit a0e48a0

File tree

13 files changed

+230
-202
lines changed

13 files changed

+230
-202
lines changed

src/query/service/src/test_kits/block_writer.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@ use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE;
2121
use databend_common_io::constants::DEFAULT_BLOCK_INDEX_BUFFER_SIZE;
2222
use databend_common_sql::BloomIndexColumns;
2323
use databend_common_storages_fuse::io::serialize_block;
24-
use databend_common_storages_fuse::io::BloomIndexBuilder;
2524
use databend_common_storages_fuse::io::TableMetaLocationGenerator;
2625
use databend_common_storages_fuse::io::WriteSettings;
2726
use databend_common_storages_fuse::FuseStorageFormat;
2827
use databend_storages_common_blocks::blocks_to_parquet;
2928
use databend_storages_common_index::BloomIndex;
29+
use databend_storages_common_index::BloomIndexBuilder;
3030
use databend_storages_common_table_meta::meta::BlockMeta;
3131
use databend_storages_common_table_meta::meta::ClusterStatistics;
3232
use databend_storages_common_table_meta::meta::Compression;

src/query/service/tests/it/storages/fuse/mod.rs

-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
#![allow(clippy::too_many_arguments)]
1616
mod bloom_index_meta_size;
17-
mod bloom_pruner;
1817
mod conflict;
1918
mod io;
2019
mod meta;

src/query/storages/common/index/src/bloom_index.rs

+180
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
use std::collections::BTreeMap;
1516
use std::collections::HashMap;
1617
use std::ops::ControlFlow;
18+
use std::ops::Deref;
1719
use std::sync::Arc;
1820

1921
use databend_common_ast::Span;
@@ -24,9 +26,11 @@ use databend_common_expression::eval_function;
2426
use databend_common_expression::expr::*;
2527
use databend_common_expression::types::boolean::BooleanDomain;
2628
use databend_common_expression::types::nullable::NullableDomain;
29+
use databend_common_expression::types::AnyType;
2730
use databend_common_expression::types::Bitmap;
2831
use databend_common_expression::types::Buffer;
2932
use databend_common_expression::types::DataType;
33+
use databend_common_expression::types::MapType;
3034
use databend_common_expression::types::NullableType;
3135
use databend_common_expression::types::Number;
3236
use databend_common_expression::types::NumberDataType;
@@ -35,16 +39,20 @@ use databend_common_expression::types::ValueType;
3539
use databend_common_expression::visit_expr;
3640
use databend_common_expression::BlockEntry;
3741
use databend_common_expression::Column;
42+
use databend_common_expression::ColumnBuilder;
3843
use databend_common_expression::ColumnId;
3944
use databend_common_expression::ConstantFolder;
4045
use databend_common_expression::DataBlock;
4146
use databend_common_expression::Domain;
4247
use databend_common_expression::Expr;
4348
use databend_common_expression::ExprVisitor;
49+
use databend_common_expression::FieldIndex;
4450
use databend_common_expression::FunctionContext;
4551
use databend_common_expression::Scalar;
52+
use databend_common_expression::ScalarRef;
4653
use databend_common_expression::TableDataType;
4754
use databend_common_expression::TableField;
55+
use databend_common_expression::TableSchema;
4856
use databend_common_expression::TableSchemaRef;
4957
use databend_common_expression::Value;
5058
use databend_common_functions::BUILTIN_FUNCTIONS;
@@ -59,8 +67,11 @@ use serde::Serialize;
5967
use super::eliminate_cast::is_injective_cast;
6068
use crate::eliminate_cast::cast_const;
6169
use crate::filters::BlockBloomFilterIndexVersion;
70+
use crate::filters::BlockFilter;
6271
use crate::filters::Filter;
72+
use crate::filters::FilterBuilder;
6373
use crate::filters::V2BloomBlock;
74+
use crate::filters::Xor8Builder;
6475
use crate::filters::Xor8Filter;
6576
use crate::statistics_to_domain;
6677
use crate::Index;
@@ -439,6 +450,175 @@ impl BloomIndex {
439450
}
440451
}
441452

453+
pub struct BloomIndexBuilder {
454+
func_ctx: FunctionContext,
455+
columns: Vec<ColumnXor8Builder>,
456+
}
457+
458+
struct ColumnXor8Builder {
459+
index: FieldIndex,
460+
field: TableField,
461+
builder: Xor8Builder,
462+
}
463+
464+
impl BloomIndexBuilder {
465+
pub fn create(
466+
func_ctx: FunctionContext,
467+
bloom_columns_map: BTreeMap<FieldIndex, TableField>,
468+
) -> Self {
469+
let columns = bloom_columns_map
470+
.iter()
471+
.map(|(&index, field)| ColumnXor8Builder {
472+
index,
473+
field: field.clone(),
474+
builder: Xor8Builder::create(),
475+
})
476+
.collect();
477+
Self { func_ctx, columns }
478+
}
479+
480+
pub fn add_block(&mut self, block: &DataBlock) -> Result<()> {
481+
if block.is_empty() {
482+
return Err(ErrorCode::BadArguments("block is empty"));
483+
}
484+
if block.num_columns() == 0 {
485+
return Ok(());
486+
}
487+
488+
let mut keys_to_remove = Vec::with_capacity(self.columns.len());
489+
for (index, bloom_index_column) in self.columns.iter_mut().enumerate() {
490+
let field_type = &block.get_by_offset(bloom_index_column.index).data_type;
491+
if !Xor8Filter::supported_type(field_type) {
492+
keys_to_remove.push(index);
493+
continue;
494+
}
495+
496+
let column = match &block.get_by_offset(bloom_index_column.index).value {
497+
Value::Scalar(s) => {
498+
let builder = ColumnBuilder::repeat(&s.as_ref(), 1, field_type);
499+
builder.build()
500+
}
501+
Value::Column(c) => c.clone(),
502+
};
503+
504+
let (column, data_type) = match field_type.remove_nullable() {
505+
DataType::Map(box inner_ty) => {
506+
// Add bloom filter for the value of map type
507+
let map_column = if field_type.is_nullable() {
508+
let nullable_column =
509+
NullableType::<MapType<AnyType, AnyType>>::try_downcast_column(&column)
510+
.unwrap();
511+
nullable_column.column
512+
} else {
513+
MapType::<AnyType, AnyType>::try_downcast_column(&column).unwrap()
514+
};
515+
let column = map_column.underlying_column().values;
516+
517+
let DataType::Tuple(kv_tys) = inner_ty else {
518+
unreachable!();
519+
};
520+
let val_type = kv_tys[1].clone();
521+
// Extract JSON value of string type to create bloom index,
522+
// other types of JSON value will be ignored.
523+
if val_type.remove_nullable() == DataType::Variant {
524+
let mut builder = ColumnBuilder::with_capacity(
525+
&DataType::Nullable(Box::new(DataType::String)),
526+
column.len(),
527+
);
528+
for val in column.iter() {
529+
if let ScalarRef::Variant(v) = val {
530+
let raw_jsonb = RawJsonb::new(v);
531+
if let Ok(Some(str_val)) = raw_jsonb.as_str() {
532+
builder.push(ScalarRef::String(&str_val));
533+
continue;
534+
}
535+
}
536+
builder.push_default();
537+
}
538+
let str_column = builder.build();
539+
if BloomIndex::check_large_string(&str_column) {
540+
keys_to_remove.push(index);
541+
continue;
542+
}
543+
let str_type = DataType::Nullable(Box::new(DataType::String));
544+
(str_column, str_type)
545+
} else {
546+
if BloomIndex::check_large_string(&column) {
547+
keys_to_remove.push(index);
548+
continue;
549+
}
550+
(column, val_type)
551+
}
552+
}
553+
_ => {
554+
if BloomIndex::check_large_string(&column) {
555+
keys_to_remove.push(index);
556+
continue;
557+
}
558+
(column, field_type.clone())
559+
}
560+
};
561+
562+
let (column, validity) =
563+
BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?;
564+
565+
// create filter per column
566+
if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 {
567+
let validity = validity.unwrap();
568+
let it = column.deref().iter().zip(validity.iter()).map(
569+
|(v, b)| {
570+
if !b {
571+
&0
572+
} else {
573+
v
574+
}
575+
},
576+
);
577+
bloom_index_column.builder.add_digests(it);
578+
} else {
579+
bloom_index_column.builder.add_digests(column.deref());
580+
}
581+
}
582+
for k in keys_to_remove {
583+
self.columns.remove(k);
584+
}
585+
Ok(())
586+
}
587+
588+
pub fn finalize(mut self) -> Result<Option<BloomIndex>> {
589+
let mut column_distinct_count = HashMap::with_capacity(self.columns.len());
590+
let mut filters = Vec::with_capacity(self.columns.len());
591+
let mut filter_fields = Vec::with_capacity(self.columns.len());
592+
for column in self.columns.iter_mut() {
593+
let filter = column.builder.build()?;
594+
if let Some(len) = filter.len() {
595+
if !matches!(
596+
column.field.data_type().remove_nullable(),
597+
TableDataType::Map(_) | TableDataType::Variant
598+
) {
599+
column_distinct_count.insert(column.field.column_id, len);
600+
}
601+
}
602+
let filter_name =
603+
BloomIndex::build_filter_column_name(BlockFilter::VERSION, &column.field)?;
604+
filter_fields.push(TableField::new(&filter_name, TableDataType::Binary));
605+
filters.push(Arc::new(filter));
606+
}
607+
608+
if filter_fields.is_empty() {
609+
return Ok(None);
610+
}
611+
let filter_schema = Arc::new(TableSchema::new(filter_fields));
612+
Ok(Some(BloomIndex {
613+
func_ctx: self.func_ctx,
614+
version: BlockFilter::VERSION,
615+
filter_schema,
616+
filters,
617+
column_distinct_count,
618+
}))
619+
}
620+
}
621+
442622
struct Visitor<T: EqVisitor>(T);
443623

444624
impl<T> ExprVisitor<String> for Visitor<T>

src/query/storages/common/index/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ mod page_index;
2525
mod range_index;
2626

2727
pub use bloom_index::BloomIndex;
28+
pub use bloom_index::BloomIndexBuilder;
2829
pub use bloom_index::BloomIndexMeta;
2930
pub use bloom_index::FilterEvalResult;
3031
pub use eliminate_cast::eliminate_cast;

src/query/service/tests/it/storages/fuse/bloom_pruner.rs renamed to src/query/storages/common/index/tests/it/bloom_pruner.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2021 Datafuse Labs
1+
// Copyright 2022 Datafuse Labs
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -17,13 +17,13 @@ use std::collections::HashMap;
1717
use std::io::Write;
1818
use std::sync::Arc;
1919

20-
use databend_common_column::buffer::Buffer;
2120
use databend_common_expression::type_check;
2221
use databend_common_expression::type_check::check_function;
2322
use databend_common_expression::types::map::KvColumn;
2423
use databend_common_expression::types::map::KvPair;
2524
use databend_common_expression::types::AnyType;
2625
use databend_common_expression::types::ArrayColumn;
26+
use databend_common_expression::types::Buffer;
2727
use databend_common_expression::types::DataType;
2828
use databend_common_expression::types::DateType;
2929
use databend_common_expression::types::Int16Type;
@@ -51,17 +51,17 @@ use databend_common_expression::TableSchema;
5151
use databend_common_expression::Value;
5252
use databend_common_functions::test_utils::parse_raw_expr;
5353
use databend_common_functions::BUILTIN_FUNCTIONS;
54-
use databend_common_storages_fuse::io::BloomIndexBuilder;
5554
use databend_storages_common_index::filters::Xor8Filter;
5655
use databend_storages_common_index::BloomIndex;
56+
use databend_storages_common_index::BloomIndexBuilder;
5757
use databend_storages_common_index::FilterEvalResult;
5858
use databend_storages_common_index::Index;
5959
use databend_storages_common_table_meta::meta::ColumnStatistics;
6060
use goldenfile::Mint;
6161

6262
#[test]
6363
fn test_bloom_filter() {
64-
let mut mint = Mint::new("tests/it/storages/testdata");
64+
let mut mint = Mint::new("tests/it/testdata");
6565
let file = &mut mint.new_goldenfile("test_bloom_filter.txt").unwrap();
6666

6767
test_base(file);

src/query/storages/common/index/tests/it/main.rs

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#![feature(box_patterns)]
1616
#![allow(clippy::uninlined_format_args)]
1717

18+
mod bloom_pruner;
1819
mod eliminate_cast;
1920
mod page_pruner;
2021
mod range_pruner;

src/query/storages/fuse/src/io/mod.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,12 @@ pub use write::write_data;
4444
pub use write::BlockBuilder;
4545
pub use write::BlockSerialization;
4646
pub use write::BlockWriter;
47-
pub use write::BloomIndexBuilder;
4847
pub use write::BloomIndexRebuilder;
4948
pub use write::BloomIndexState;
5049
pub use write::CachedMetaWriter;
5150
pub use write::InvertedIndexBuilder;
5251
pub use write::InvertedIndexWriter;
5352
pub use write::MetaWriter;
53+
pub(crate) use write::StreamBlockBuilder;
54+
pub(crate) use write::StreamBlockProperties;
5455
pub use write::WriteSettings;

0 commit comments

Comments
 (0)