Skip to content

Commit 14ad3d0

Browse files
authored
Adding optional arg to BF.INSERT to allow users to check if their bloom filter can reach the desired size (#41)
* Adding optional arg to BF.INSERT to allow users to check if their bloom filter can reach the desired size Signed-off-by: zackcam <zackcam@amazon.com> * Fixing ATLEASTCAPACITY calculation as well as adding MAXCAPACITY functionality for info Signed-off-by: zackcam <zackcam@amazon.com> --------- Signed-off-by: zackcam <zackcam@amazon.com>
1 parent 20efd95 commit 14ad3d0

File tree

6 files changed

+288
-20
lines changed

6 files changed

+288
-20
lines changed

src/bloom/command_handler.rs

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,7 @@ pub fn bloom_filter_insert(ctx: &Context, input_args: &[ValkeyString]) -> Valkey
462462
true => (None, true),
463463
false => (Some(configs::FIXED_SEED), false),
464464
};
465+
let mut validate_scale_to = None;
465466
let mut nocreate = false;
466467
let mut items_provided = false;
467468
while idx < argc {
@@ -553,6 +554,23 @@ pub fn bloom_filter_insert(ctx: &Context, input_args: &[ValkeyString]) -> Valkey
553554
}
554555
};
555556
}
557+
"VALIDATESCALETO" => {
558+
if idx >= (argc - 1) {
559+
return Err(ValkeyError::WrongArity);
560+
}
561+
idx += 1;
562+
validate_scale_to = match input_args[idx].to_string_lossy().parse::<i64>() {
563+
Ok(num) if (BLOOM_CAPACITY_MIN..=BLOOM_CAPACITY_MAX).contains(&num) => {
564+
Some(num)
565+
}
566+
Ok(0) => {
567+
return Err(ValkeyError::Str(utils::CAPACITY_LARGER_THAN_0));
568+
}
569+
_ => {
570+
return Err(ValkeyError::Str(utils::BAD_CAPACITY));
571+
}
572+
};
573+
}
556574
"ITEMS" => {
557575
idx += 1;
558576
items_provided = true;
@@ -568,6 +586,26 @@ pub fn bloom_filter_insert(ctx: &Context, input_args: &[ValkeyString]) -> Valkey
568586
// When the `ITEMS` argument is provided, we expect additional item arg/s to be provided.
569587
return Err(ValkeyError::WrongArity);
570588
}
589+
// Check if we have a wanted capacity and calculate if we can reach that capacity. Using VALIDATESCALETO and NONSCALING options together is invalid.
590+
if let Some(scale_to) = validate_scale_to {
591+
if expansion == 0 {
592+
return Err(ValkeyError::Str(
593+
utils::NON_SCALING_AND_VALIDATE_SCALE_TO_IS_INVALID,
594+
));
595+
}
596+
match utils::BloomObject::calculate_max_scaled_capacity(
597+
capacity,
598+
fp_rate,
599+
scale_to,
600+
tightening_ratio,
601+
expansion,
602+
) {
603+
Ok(_) => (),
604+
Err(err) => {
605+
return Err(ValkeyError::Str(err.as_str()));
606+
}
607+
};
608+
}
571609
// If the filter does not exist, create one
572610
let filter_key = ctx.open_key_writable(filter_name);
573611
let value = match filter_key.get_value::<BloomObject>(&BLOOM_TYPE) {
@@ -678,12 +716,29 @@ pub fn bloom_filter_info(ctx: &Context, input_args: &[ValkeyString]) -> ValkeyRe
678716
"SIZE" => Ok(ValkeyValue::Integer(val.memory_usage() as i64)),
679717
"FILTERS" => Ok(ValkeyValue::Integer(val.num_filters() as i64)),
680718
"ITEMS" => Ok(ValkeyValue::Integer(val.cardinality())),
719+
"ERROR" => Ok(ValkeyValue::Float(val.fp_rate())),
681720
"EXPANSION" => {
682721
if val.expansion() == 0 {
683722
return Ok(ValkeyValue::Null);
684723
}
685724
Ok(ValkeyValue::Integer(val.expansion() as i64))
686725
}
726+
// Only calculate and expose MAXSCALEDCAPACITY for scaling bloom objects.
727+
"MAXSCALEDCAPACITY" if val.expansion() > 0 => {
728+
let max_capacity = match utils::BloomObject::calculate_max_scaled_capacity(
729+
val.starting_capacity(),
730+
val.fp_rate(),
731+
-1,
732+
val.tightening_ratio(),
733+
val.expansion(),
734+
) {
735+
Ok(result) => result,
736+
Err(err) => {
737+
return Err(ValkeyError::Str(err.as_str()));
738+
}
739+
};
740+
Ok(ValkeyValue::Integer(max_capacity))
741+
}
687742
_ => Err(ValkeyError::Str(utils::INVALID_INFO_VALUE)),
688743
}
689744
}
@@ -697,13 +752,31 @@ pub fn bloom_filter_info(ctx: &Context, input_args: &[ValkeyString]) -> ValkeyRe
697752
ValkeyValue::Integer(val.num_filters() as i64),
698753
ValkeyValue::SimpleStringStatic("Number of items inserted"),
699754
ValkeyValue::Integer(val.cardinality()),
755+
ValkeyValue::SimpleStringStatic("Error rate"),
756+
ValkeyValue::Float(val.fp_rate()),
700757
ValkeyValue::SimpleStringStatic("Expansion rate"),
701758
];
702759
if val.expansion() == 0 {
703760
result.push(ValkeyValue::Null);
704761
} else {
705762
result.push(ValkeyValue::Integer(val.expansion() as i64));
706763
}
764+
if val.expansion() != 0 {
765+
let max_capacity = match utils::BloomObject::calculate_max_scaled_capacity(
766+
val.starting_capacity(),
767+
val.fp_rate(),
768+
-1,
769+
val.tightening_ratio(),
770+
val.expansion(),
771+
) {
772+
Ok(result) => result,
773+
Err(err) => {
774+
return Err(ValkeyError::Str(err.as_str()));
775+
}
776+
};
777+
result.push(ValkeyValue::SimpleStringStatic("Max scaled capacity"));
778+
result.push(ValkeyValue::Integer(max_capacity));
779+
}
707780
Ok(ValkeyValue::Array(result))
708781
}
709782
_ => Err(ValkeyError::Str(utils::NOT_FOUND)),

src/bloom/data_type.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -86,16 +86,15 @@ impl ValkeyDataType for BloomObject {
8686
let Ok(capacity) = raw::load_unsigned(rdb) else {
8787
return None;
8888
};
89-
let new_fp_rate =
90-
match Self::calculate_fp_rate(fp_rate, num_filters as i32, tightening_ratio) {
91-
Ok(rate) => rate,
92-
Err(_) => {
93-
logging::log_warning(
94-
"Failed to restore bloom object: Reached max number of filters",
95-
);
96-
return None;
97-
}
98-
};
89+
let new_fp_rate = match Self::calculate_fp_rate(fp_rate, i as i32, tightening_ratio) {
90+
Ok(rate) => rate,
91+
Err(_) => {
92+
logging::log_warning(
93+
"Failed to restore bloom object: False positive degrades to 0 on scale out",
94+
);
95+
return None;
96+
}
97+
};
9998
let curr_filter_size = BloomFilter::compute_size(capacity as i64, new_fp_rate);
10099
let curr_object_size = BloomObject::compute_size(filters.capacity())
101100
+ filters_memory_usage

src/bloom/utils.rs

Lines changed: 164 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,20 @@ pub const ERROR_RATE_RANGE: &str = "ERR (0 < error rate range < 1)";
2929
pub const BAD_TIGHTENING_RATIO: &str = "ERR bad tightening ratio";
3030
pub const TIGHTENING_RATIO_RANGE: &str = "ERR (0 < tightening ratio range < 1)";
3131
pub const CAPACITY_LARGER_THAN_0: &str = "ERR (capacity should be larger than 0)";
32-
pub const MAX_NUM_SCALING_FILTERS: &str = "ERR bloom object reached max number of filters";
32+
pub const FALSE_POSITIVE_DEGRADES_TO_O: &str = "ERR false positive degrades to 0 on scale out";
3333
pub const UNKNOWN_ARGUMENT: &str = "ERR unknown argument received";
3434
pub const EXCEEDS_MAX_BLOOM_SIZE: &str = "ERR operation exceeds bloom object memory limit";
35+
pub const VALIDATE_SCALE_TO_EXCEEDS_MAX_SIZE: &str =
36+
"ERR provided VALIDATESCALETO causes bloom object to exceed memory limit";
37+
pub const MAX_NUM_SCALING_FILTERS: &str = "ERR bloom object reached max number of filters";
38+
pub const VALIDATE_SCALE_TO_FALSE_POSITIVE_INVALID: &str =
39+
"ERR provided VALIDATESCALETO causes false positive to degrade to 0";
3540
pub const KEY_EXISTS: &str = "BUSYKEY Target key name already exists.";
3641
pub const DECODE_BLOOM_OBJECT_FAILED: &str = "ERR bloom object decoding failed";
3742
pub const DECODE_UNSUPPORTED_VERSION: &str =
3843
"ERR bloom object decoding failed. Unsupported version";
44+
pub const NON_SCALING_AND_VALIDATE_SCALE_TO_IS_INVALID: &str =
45+
"ERR cannot use NONSCALING and VALIDATESCALETO options together";
3946
/// Logging Error messages
4047
pub const ENCODE_BLOOM_OBJECT_FAILED: &str = "Failed to encode bloom object.";
4148

@@ -49,6 +56,10 @@ pub enum BloomError {
4956
DecodeUnsupportedVersion,
5057
ErrorRateRange,
5158
BadExpansion,
59+
FalsePositiveReachesZero,
60+
BadCapacity,
61+
ValidateScaleToExceedsMaxSize,
62+
ValidateScaleToFalsePositiveInvalid,
5263
}
5364

5465
impl BloomError {
@@ -62,6 +73,12 @@ impl BloomError {
6273
BloomError::DecodeUnsupportedVersion => DECODE_UNSUPPORTED_VERSION,
6374
BloomError::ErrorRateRange => ERROR_RATE_RANGE,
6475
BloomError::BadExpansion => BAD_EXPANSION,
76+
BloomError::FalsePositiveReachesZero => FALSE_POSITIVE_DEGRADES_TO_O,
77+
BloomError::BadCapacity => BAD_CAPACITY,
78+
BloomError::ValidateScaleToExceedsMaxSize => VALIDATE_SCALE_TO_EXCEEDS_MAX_SIZE,
79+
BloomError::ValidateScaleToFalsePositiveInvalid => {
80+
VALIDATE_SCALE_TO_FALSE_POSITIVE_INVALID
81+
}
6582
}
6683
}
6784
}
@@ -241,6 +258,13 @@ impl BloomObject {
241258
.expect("Every BloomObject is expected to have at least one filter")
242259
.seed()
243260
}
261+
/// Return the starting capacity used by the Bloom object. This capacity is held within the first filter
262+
pub fn starting_capacity(&self) -> i64 {
263+
self.filters
264+
.first()
265+
.expect("Every BloomObject is expected to have at least one filter")
266+
.capacity()
267+
}
244268

245269
/// Return the expansion of the bloom object.
246270
pub fn expansion(&self) -> u32 {
@@ -311,8 +335,8 @@ impl BloomObject {
311335
let new_capacity = match filter.capacity.checked_mul(self.expansion.into()) {
312336
Some(new_capacity) => new_capacity,
313337
None => {
314-
// u32:max cannot be reached with 64MB memory usage limit per filter even with a high fp rate (e.g. 0.9).
315-
return Err(BloomError::MaxNumScalingFilters);
338+
// With a 128MB memory limit for a bloom object overall, it is not possible to reach u32::MAX capacity.
339+
return Err(BloomError::BadCapacity);
316340
}
317341
};
318342
// Reject the request, if the operation will result in creation of a filter of size greater than what is allowed.
@@ -366,7 +390,7 @@ impl BloomObject {
366390
) -> Result<f64, BloomError> {
367391
match fp_rate * tightening_ratio.powi(num_filters) {
368392
x if x > f64::MIN_POSITIVE => Ok(x),
369-
_ => Err(BloomError::MaxNumScalingFilters),
393+
_ => Err(BloomError::FalsePositiveReachesZero),
370394
}
371395
}
372396

@@ -455,6 +479,78 @@ impl BloomObject {
455479
_ => Err(BloomError::DecodeUnsupportedVersion),
456480
}
457481
}
482+
483+
/// This method is called from two different bloom commands: BF.INFO and BF.INSERT. The functionality varies slightly depending on which command it
484+
/// is called from. When called from BF.INFO, this method is used to find the maximum possible size that the bloom object could scale to
485+
/// without throwing an error. When called from BF.INSERT, this method is used to determine if it is possible to reach the provided `validate_scale_to`.
486+
///
487+
/// # Arguments
488+
///
489+
/// * `capacity` - The size of the initial filter in the bloom object.
490+
/// * `fp_rate` - the false positive rate for the bloom object
491+
/// * `validate_scale_to` - the capacity we check to see if it can scale to. If this method is called from BF.INFO this is set as -1 as we
492+
/// want to check the maximum size we could scale up to
493+
/// * `tightening_ratio` - The tightening ratio of the object
494+
/// * `expansion` - The expansion rate of the object
495+
///
496+
/// # Returns
497+
/// * i64 - The maximum capacity that can be reached if called from BF.INFO. If called from BF.INSERT, the total capacity reached once it became greater than or equal to `validate_scale_to`
498+
/// * BloomError - One of the two errors described below, or BadCapacity if multiplying the next filter's capacity by the expansion overflows:
499+
/// VALIDATE_SCALE_TO_EXCEEDS_MAX_SIZE: When scaling to the wanted capacity would go over the bloom object memory limit
500+
/// VALIDATE_SCALE_TO_FALSE_POSITIVE_INVALID: When scaling to the wanted capacity would cause the false positive rate to reach 0
501+
pub fn calculate_max_scaled_capacity(
502+
capacity: i64,
503+
fp_rate: f64,
504+
validate_scale_to: i64,
505+
tightening_ratio: f64,
506+
expansion: u32,
507+
) -> Result<i64, BloomError> {
508+
let mut curr_filter_capacity = capacity;
509+
let mut curr_total_capacity = 0;
510+
let mut curr_num_filters: u64 = 0;
511+
let mut filters_memory_usage = 0;
512+
while curr_total_capacity < validate_scale_to || validate_scale_to == -1 {
513+
// Check to see if scaling to the next filter will cause a degradation in FP to 0
514+
let curr_fp_rate = match BloomObject::calculate_fp_rate(
515+
fp_rate,
516+
curr_num_filters as i32,
517+
tightening_ratio,
518+
) {
519+
Ok(rate) => rate,
520+
Err(_) => {
521+
if validate_scale_to == -1 {
522+
return Ok(curr_total_capacity);
523+
}
524+
return Err(BloomError::ValidateScaleToFalsePositiveInvalid);
525+
}
526+
};
527+
// Check that if it scales to this number of filters that the object won't exceed the memory limit
528+
let curr_filter_size = BloomFilter::compute_size(curr_filter_capacity, curr_fp_rate);
529+
// For vectors of size < 4 the capacity of the vector is 4. However after that the capacity is always a power of two above or equal to the size
530+
let curr_object_size = BloomObject::compute_size(
531+
std::cmp::max(4, curr_num_filters).next_power_of_two() as usize,
532+
) + filters_memory_usage
533+
+ curr_filter_size;
534+
if !BloomObject::validate_size(curr_object_size) {
535+
if validate_scale_to == -1 {
536+
return Ok(curr_total_capacity);
537+
}
538+
return Err(BloomError::ValidateScaleToExceedsMaxSize);
539+
}
540+
// Update overall memory usage
541+
filters_memory_usage += curr_filter_size;
542+
curr_total_capacity += curr_filter_capacity;
543+
curr_filter_capacity = match curr_filter_capacity.checked_mul(expansion.into()) {
544+
Some(new_capacity) => new_capacity,
545+
None => {
546+
// With a 128MB memory limit for a bloom object overall, it is not possible to reach u32::MAX capacity.
547+
return Err(BloomError::BadCapacity);
548+
}
549+
};
550+
curr_num_filters += 1;
551+
}
552+
Ok(curr_total_capacity)
553+
}
458554
}
459555

460556
/// Structure representing a single bloom filter. 200 Bytes.
@@ -613,6 +709,7 @@ impl Drop for BloomFilter {
613709
#[cfg(test)]
614710
mod tests {
615711
use super::*;
712+
use crate::configs::TIGHTENING_RATIO_DEFAULT;
616713
use configs;
617714
use rand::{distributions::Alphanumeric, Rng};
618715
use rstest::rstest;
@@ -961,6 +1058,10 @@ mod tests {
9611058
let test_bloom_filter2 = BloomFilter::with_random_seed(0.5_f64, 1000_i64);
9621059
let test_seed2 = test_bloom_filter2.seed();
9631060
assert_ne!(test_seed2, configs::FIXED_SEED);
1061+
// Check that the random seed changes for each BloomFilter
1062+
let test_bloom_filter3 = BloomFilter::with_random_seed(0.5_f64, 1000_i64);
1063+
let test_seed3 = test_bloom_filter3.seed();
1064+
assert_ne!(test_seed2, test_seed3);
9641065
}
9651066

9661067
#[test]
@@ -979,6 +1080,65 @@ mod tests {
9791080
assert_eq!(result2.err(), Some(BloomError::ExceedsMaxBloomSize));
9801081
}
9811082

1083+
#[rstest]
1084+
#[case(1000, 0.01, 10000, 2, 15000)]
1085+
#[case(10000, 0.001, 100000, 4, 210000)]
1086+
#[case(50000, 0.0001, 500000, 3, 650000)]
1087+
#[case(100000, 0.00001, 1000000, 2, 1500000)]
1088+
#[case(100, 0.00001, 1000, 1, 1000)]
1089+
fn test_calculate_max_scaled_capacity(
1090+
#[case] capacity: i64,
1091+
#[case] fp_rate: f64,
1092+
#[case] validate_scale_to: i64,
1093+
#[case] expansion: u32,
1094+
#[case] resulting_size: i64,
1095+
) {
1096+
// Validate that max scaled capacity returns the correct capacity reached when a valid validate_scale_to is provided
1097+
let returned_size = BloomObject::calculate_max_scaled_capacity(
1098+
capacity,
1099+
fp_rate,
1100+
validate_scale_to,
1101+
TIGHTENING_RATIO_DEFAULT
1102+
.parse()
1103+
.expect("global config should always be 0.5"),
1104+
expansion,
1105+
);
1106+
assert_eq!(resulting_size, returned_size.unwrap());
1107+
// Test that with a -1 validate_scale_to the returned value will be the max capacity
1108+
let max_returned_size = BloomObject::calculate_max_scaled_capacity(
1109+
capacity,
1110+
fp_rate,
1111+
-1,
1112+
TIGHTENING_RATIO_DEFAULT
1113+
.parse()
1114+
.expect("global config should always be 0.5"),
1115+
expansion,
1116+
);
1117+
// Check that 1 more than the max will trigger the error cases
1118+
let failed_returned_size = BloomObject::calculate_max_scaled_capacity(
1119+
capacity,
1120+
fp_rate,
1121+
max_returned_size.unwrap() + 1,
1122+
TIGHTENING_RATIO_DEFAULT
1123+
.parse()
1124+
.expect("global config should always be 0.5"),
1125+
expansion,
1126+
);
1127+
if expansion == 1 {
1128+
// FP rate reaches 0 case
1129+
assert!(failed_returned_size
1130+
.unwrap_err()
1131+
.as_str()
1132+
.contains("provided VALIDATESCALETO causes false positive to degrade to 0"));
1133+
} else {
1134+
// Exceeds memory limit case
1135+
assert!(failed_returned_size
1136+
.unwrap_err()
1137+
.as_str()
1138+
.contains("provided VALIDATESCALETO causes bloom object to exceed memory limit"));
1139+
}
1140+
}
1141+
9821142
#[rstest(expansion, case::nonscaling(0), case::scaling(2))]
9831143
fn test_bf_encode_and_decode(expansion: u32) {
9841144
let mut bf =

0 commit comments

Comments
 (0)