diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index 44ab72c9cd5..b6b9a5922a9 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -410,6 +410,7 @@ Compaction::Compaction(
 void Compaction::PopulatePenultimateLevelOutputRange() {
   if (!SupportsPerKeyPlacement()) {
+    assert(keep_in_last_level_through_seqno_ == kMaxSequenceNumber);
     return;
   }
 
@@ -452,6 +453,27 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
   GetBoundaryInternalKeys(input_vstorage_, inputs_,
                           &penultimate_level_smallest_,
                           &penultimate_level_largest_, exclude_level);
+
+  if (penultimate_output_range_type_ !=
+      PenultimateOutputRangeType::kFullRange) {
+    // If not full range in penultimate level, must keep everything already
+    // in the last level there, because moving it back up might cause
+    // overlap/placement issues that are difficult to resolve properly in the
+    // presence of range deletes
+    SequenceNumber max_last_level_seqno = 0;
+    for (const auto& input_lvl : inputs_) {
+      if (input_lvl.level == output_level_) {
+        for (const auto& file : input_lvl.files) {
+          max_last_level_seqno =
+              std::max(max_last_level_seqno, file->fd.largest_seqno);
+        }
+      }
+    }
+
+    keep_in_last_level_through_seqno_ = max_last_level_seqno;
+  } else {
+    keep_in_last_level_through_seqno_ = 0;
+  }
 }
 
 Compaction::~Compaction() {
@@ -494,22 +516,29 @@ bool Compaction::OverlapPenultimateLevelOutputRange(
 }
 
 // key includes timestamp if user-defined timestamp is enabled.
-bool Compaction::WithinPenultimateLevelOutputRange(
-    const ParsedInternalKey& ikey) const {
-  if (!SupportsPerKeyPlacement()) {
-    return false;
-  }
+void Compaction::TEST_AssertWithinPenultimateLevelOutputRange(
+    const Slice& user_key, bool expect_failure) const {
+#ifdef NDEBUG
+  (void)user_key;
+  (void)expect_failure;
+#else
+  assert(SupportsPerKeyPlacement());
 
-  if (penultimate_level_smallest_.size() == 0 ||
-      penultimate_level_largest_.size() == 0) {
-    return false;
-  }
+  assert(penultimate_level_smallest_.size() > 0);
+  assert(penultimate_level_largest_.size() > 0);
 
-  const InternalKeyComparator* icmp = input_vstorage_->InternalComparator();
+  auto* cmp = input_vstorage_->user_comparator();
 
   // op_type of a key can change during compaction, e.g. Merge -> Put.
-  return icmp->CompareKeySeq(ikey, penultimate_level_smallest_.Encode()) >= 0 &&
-         icmp->CompareKeySeq(ikey, penultimate_level_largest_.Encode()) <= 0;
+  if (!(cmp->Compare(user_key, penultimate_level_smallest_.user_key()) >= 0)) {
+    assert(expect_failure);
+  } else if (!(cmp->Compare(user_key, penultimate_level_largest_.user_key()) <=
+               0)) {
+    assert(expect_failure);
+  } else {
+    assert(!expect_failure);
+  }
+#endif
 }
 
 bool Compaction::InputCompressionMatchesOutput() const {
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 49e0b3b2af9..d707b49c419 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -388,12 +388,12 @@ class Compaction {
   bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
                                           const Slice& largest_key) const;
 
-  // Return true if the key is within penultimate level output range for
-  // per_key_placement feature, which is safe to place the key to the
-  // penultimate level. different compaction strategy has different rules.
-  // If per_key_placement is not supported, always return false.
-  // key includes timestamp if user-defined timestamp is enabled.
-  bool WithinPenultimateLevelOutputRange(const ParsedInternalKey& ikey) const;
+  // For testing purposes, check that a key is within penultimate level
+  // output range for per_key_placement feature, which is safe to place the key
+  // to the penultimate level. Different compaction strategies have different
+  // rules. `user_key` includes timestamp if user-defined timestamp is enabled.
+  void TEST_AssertWithinPenultimateLevelOutputRange(
+      const Slice& user_key, bool expect_failure = false) const;
 
   CompactionReason compaction_reason() const { return compaction_reason_; }
 
@@ -456,6 +456,13 @@ class Compaction {
       const ImmutableOptions& immutable_options, const int start_level,
       const int output_level);
 
+  // If some data cannot be safely migrated "up" the LSM tree due to a change
+  // in the preclude_last_level_data_seconds setting, this indicates a sequence
+  // number for the newest data that must be kept in the last level.
+  SequenceNumber GetKeepInLastLevelThroughSeqno() const {
+    return keep_in_last_level_through_seqno_;
+  }
+
   // mark (or clear) all files that are being compacted
   void MarkFilesBeingCompacted(bool being_compacted) const;
 
@@ -605,6 +612,8 @@ class Compaction {
   // Blob garbage collection age cutoff.
   double blob_garbage_collection_age_cutoff_;
 
+  SequenceNumber keep_in_last_level_through_seqno_ = kMaxSequenceNumber;
+
   // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
   // means not supported.
   const int penultimate_level_;
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index e9d1482fa8b..dc441817c6c 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -38,8 +38,7 @@ CompactionIterator::CompactionIterator(
     const std::atomic<bool>* shutting_down,
     const std::shared_ptr<Logger> info_log,
    const std::string* full_history_ts_low,
-    const SequenceNumber preserve_time_min_seqno,
-    const SequenceNumber preclude_last_level_min_seqno)
+    std::optional<SequenceNumber> preserve_seqno_min)
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots,
           earliest_snapshot, earliest_write_conflict_snapshot, job_snapshot,
           snapshot_checker, env,
@@ -48,8 +47,7 @@ CompactionIterator::CompactionIterator(
           manual_compaction_canceled,
           compaction ? std::make_unique<RealCompaction>(compaction) : nullptr,
           must_count_input_entries, compaction_filter, shutting_down, info_log,
-          full_history_ts_low, preserve_time_min_seqno,
-          preclude_last_level_min_seqno) {}
+          full_history_ts_low, preserve_seqno_min) {}
 
 CompactionIterator::CompactionIterator(
     InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@@ -67,8 +65,7 @@ CompactionIterator::CompactionIterator(
     const std::atomic<bool>* shutting_down,
     const std::shared_ptr<Logger> info_log,
     const std::string* full_history_ts_low,
-    const SequenceNumber preserve_time_min_seqno,
-    const SequenceNumber preclude_last_level_min_seqno)
+    std::optional<SequenceNumber> preserve_seqno_min)
     : input_(input, cmp, must_count_input_entries),
       cmp_(cmp),
       merge_helper_(merge_helper),
@@ -109,10 +106,9 @@ CompactionIterator::CompactionIterator(
       current_key_committed_(false),
       cmp_with_history_ts_low_(0),
       level_(compaction_ == nullptr ? 0 : compaction_->level()),
-      preserve_time_min_seqno_(preserve_time_min_seqno),
-      preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+      preserve_seqno_after_(preserve_seqno_min.value_or(earliest_snapshot)) {
   assert(snapshots_ != nullptr);
-  assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+  assert(preserve_seqno_after_ <= earliest_snapshot_);
 
   if (compaction_ != nullptr) {
     level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
@@ -1017,7 +1013,6 @@ void CompactionIterator::NextFromInput() {
       } else {
         if (ikey_.sequence != 0) {
           iter_stats_.num_timed_put_swap_preferred_seqno++;
-          saved_seq_for_penul_check_ = ikey_.sequence;
           ikey_.sequence = preferred_seqno;
         }
         ikey_.type = kTypeValue;
@@ -1258,71 +1253,6 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
   }
 }
 
-void CompactionIterator::DecideOutputLevel() {
-  assert(compaction_->SupportsPerKeyPlacement());
-  output_to_penultimate_level_ = false;
-  // if the key is newer than the cutoff sequence or within the earliest
-  // snapshot, it should output to the penultimate level.
-  if (ikey_.sequence > preclude_last_level_min_seqno_ ||
-      ikey_.sequence > earliest_snapshot_) {
-    output_to_penultimate_level_ = true;
-  }
-
-#ifndef NDEBUG
-  // Could be overridden by unittest
-  PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence,
-                                 output_to_penultimate_level_);
-  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
-                           &context);
-  if (ikey_.sequence > earliest_snapshot_) {
-    output_to_penultimate_level_ = true;
-  }
-#endif  // NDEBUG
-
-  // saved_seq_for_penul_check_ is populated in `NextFromInput` when the
-  // entry's sequence number is non zero and validity context for output this
-  // entry is kSwapPreferredSeqno for use in `DecideOutputLevel`. It should be
-  // cleared out here unconditionally. Otherwise, it may end up getting consumed
-  // incorrectly by a different entry.
-  SequenceNumber seq_for_range_check =
-      (saved_seq_for_penul_check_.has_value() &&
-       saved_seq_for_penul_check_.value() != kMaxSequenceNumber)
-          ? saved_seq_for_penul_check_.value()
-          : ikey_.sequence;
-  saved_seq_for_penul_check_ = std::nullopt;
-  ParsedInternalKey ikey_for_range_check = ikey_;
-  if (seq_for_range_check != ikey_.sequence) {
-    ikey_for_range_check.sequence = seq_for_range_check;
-  }
-  if (output_to_penultimate_level_) {
-    // If it's decided to output to the penultimate level, but unsafe to do so,
-    // still output to the last level. For example, moving the data from a lower
-    // level to a higher level outside of the higher-level input key range is
-    // considered unsafe, because the key may conflict with higher-level SSTs
-    // not from this compaction.
-    // TODO: add statistic for declined output_to_penultimate_level
-    bool safe_to_penultimate_level =
-        compaction_->WithinPenultimateLevelOutputRange(ikey_for_range_check);
-    if (!safe_to_penultimate_level) {
-      output_to_penultimate_level_ = false;
-      // It could happen when disable/enable `last_level_temperature` while
-      // holding a snapshot. When `last_level_temperature` is not set
-      // (==kUnknown), the data newer than any snapshot is pushed to the last
-      // level, but when the per_key_placement feature is enabled on the fly,
-      // the data later than the snapshot has to be moved to the penultimate
-      // level, which may or may not be safe. So the user needs to make sure all
-      // snapshot is released before enabling `last_level_temperature` feature
-      // We will migrate the feature to `last_level_temperature` and maybe make
-      // it not dynamically changeable.
-      if (seq_for_range_check > earliest_snapshot_) {
-        status_ = Status::Corruption(
-            "Unsafe to store Seq later than snapshot in the last level if "
-            "per_key_placement is enabled");
-      }
-    }
-  }
-}
-
 void CompactionIterator::PrepareOutput() {
   if (Valid()) {
     if (LIKELY(!is_range_del_)) {
@@ -1331,13 +1261,6 @@ void CompactionIterator::PrepareOutput() {
       } else if (ikey_.type == kTypeBlobIndex) {
         GarbageCollectBlobIfNeeded();
       }
-
-      // For range del sentinel, we don't use it to cut files for bottommost
-      // compaction. So it should not make a difference which output level we
-      // decide.
-      if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
-        DecideOutputLevel();
-      }
     }
 
     // Zeroing out the sequence number leads to better compression.
@@ -1355,8 +1278,7 @@ void CompactionIterator::PrepareOutput() {
         !compaction_->allow_ingest_behind() && bottommost_level_ &&
         DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
         ikey_.type != kTypeMerge && current_key_committed_ &&
-        !output_to_penultimate_level_ &&
-        ikey_.sequence < preserve_time_min_seqno_ && !is_range_del_) {
+        ikey_.sequence <= preserve_seqno_after_ && !is_range_del_) {
       if (ikey_.type == kTypeDeletion ||
           (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
         ROCKS_LOG_FATAL(
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index b9b4766c4c3..3cf0ce0d3bd 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -117,10 +117,6 @@ class CompactionIterator {
     virtual const Compaction* real_compaction() const = 0;
 
     virtual bool SupportsPerKeyPlacement() const = 0;
-
-    // `key` includes timestamp if user-defined timestamp is enabled.
-    virtual bool WithinPenultimateLevelOutputRange(
-        const ParsedInternalKey&) const = 0;
   };
 
   class RealCompaction : public CompactionProxy {
@@ -184,14 +180,6 @@ class CompactionIterator {
       return compaction_->SupportsPerKeyPlacement();
     }
 
-    // Check if key is within penultimate level output range, to see if it's
-    // safe to output to the penultimate level for per_key_placement feature.
-    // `key` includes timestamp if user-defined timestamp is enabled.
-    bool WithinPenultimateLevelOutputRange(
-        const ParsedInternalKey& ikey) const override {
-      return compaction_->WithinPenultimateLevelOutputRange(ikey);
-    }
-
    private:
     const Compaction* compaction_;
   };
@@ -216,29 +204,29 @@ class CompactionIterator {
       const std::atomic<bool>* shutting_down = nullptr,
       const std::shared_ptr<Logger> info_log = nullptr,
       const std::string* full_history_ts_low = nullptr,
-      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
-      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+      std::optional<SequenceNumber> preserve_seqno_min = {});
 
   // Constructor with custom CompactionProxy, used for tests.
-  CompactionIterator(
-      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
-      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
-      SequenceNumber earliest_snapshot,
-      SequenceNumber earliest_write_conflict_snapshot,
-      SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
-      CompactionRangeDelAggregator* range_del_agg,
-      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
-      bool enforce_single_del_contracts,
-      const std::atomic<bool>& manual_compaction_canceled,
-      std::unique_ptr<CompactionProxy> compaction,
-      bool must_count_input_entries,
-      const CompactionFilter* compaction_filter = nullptr,
-      const std::atomic<bool>* shutting_down = nullptr,
-      const std::shared_ptr<Logger> info_log = nullptr,
-      const std::string* full_history_ts_low = nullptr,
-      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
-      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+  CompactionIterator(InternalIterator* input, const Comparator* cmp,
+                     MergeHelper* merge_helper, SequenceNumber last_sequence,
+                     std::vector<SequenceNumber>* snapshots,
+                     SequenceNumber earliest_snapshot,
+                     SequenceNumber earliest_write_conflict_snapshot,
+                     SequenceNumber job_snapshot,
+                     const SnapshotChecker* snapshot_checker, Env* env,
+                     bool report_detailed_time, bool expect_valid_internal_key,
+                     CompactionRangeDelAggregator* range_del_agg,
+                     BlobFileBuilder* blob_file_builder,
+                     bool allow_data_in_errors,
+                     bool enforce_single_del_contracts,
+                     const std::atomic<bool>& manual_compaction_canceled,
+                     std::unique_ptr<CompactionProxy> compaction,
+                     bool must_count_input_entries,
+                     const CompactionFilter* compaction_filter = nullptr,
+                     const std::atomic<bool>* shutting_down = nullptr,
+                     const std::shared_ptr<Logger> info_log = nullptr,
+                     const std::string* full_history_ts_low = nullptr,
+                     std::optional<SequenceNumber> preserve_seqno_min = {});
 
   ~CompactionIterator();
 
@@ -269,11 +257,6 @@ class CompactionIterator {
   const CompactionIterationStats& iter_stats() const { return iter_stats_; }
   bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
   uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
-  // If the current key should be placed on penultimate level, only valid if
-  // per_key_placement is supported
-  bool output_to_penultimate_level() const {
-    return output_to_penultimate_level_;
-  }
   Status InputStatus() const { return input_.status(); }
 
   bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
@@ -285,10 +268,6 @@ class CompactionIterator {
   // Do final preparations before presenting the output to the callee.
   void PrepareOutput();
 
-  // Decide the current key should be output to the last level or penultimate
-  // level, only call for compaction supports per key placement
-  void DecideOutputLevel();
-
   // Passes the output value to the blob file builder (if any), and replaces it
   // with the corresponding blob reference if it has been actually written to a
  // blob file (i.e. if it passed the value size check). Returns true if the
@@ -440,16 +419,6 @@ class CompactionIterator {
   // NextFromInput()).
   ParsedInternalKey ikey_;
 
-  // When a kTypeValuePreferredSeqno entry's preferred seqno is safely swapped
-  // in in this compaction, this field saves its original sequence number for
-  // range checking whether it's safe to be placed on the penultimate level.
-  // This is to ensure when such an entry happens to be the right boundary of
-  // penultimate safe range, it won't get excluded because with the preferred
-  // seqno swapped in, it's now larger than the right boundary (itself before
-  // the swap). This is safe to do, because preferred seqno is swapped in only
-  // when no entries with the same user key exist on lower levels and this entry
-  // is already visible in the earliest snapshot.
-  std::optional<SequenceNumber> saved_seq_for_penul_check_ = kMaxSequenceNumber;
   // Stores whether ikey_.user_key is valid. If set to false, the user key is
   // not compared against the current key in the underlying iterator.
   bool has_current_user_key_ = false;
@@ -507,17 +476,8 @@ class CompactionIterator {
   // just been zeroed out during bottommost compaction.
   bool last_key_seq_zeroed_{false};
 
-  // True if the current key should be output to the penultimate level if
-  // possible, compaction logic makes the final decision on which level to
-  // output to.
-  bool output_to_penultimate_level_{false};
-
-  // min seqno for preserving the time information.
-  const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
-
-  // min seqno to preclude the data from the last level, if the key seqno larger
-  // than this, it will be output to penultimate level
-  const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+  // Max seqno that can be zeroed out at last level (various reasons)
+  const SequenceNumber preserve_seqno_after_ = kMaxSequenceNumber;
 
   void AdvanceInputIter() { input_.Next(); }
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc
index c3a8a7574bf..974a4e1ff83 100644
--- a/db/compaction/compaction_iterator_test.cc
+++ b/db/compaction/compaction_iterator_test.cc
@@ -192,11 +192,6 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
     return supports_per_key_placement;
   }
 
-  bool WithinPenultimateLevelOutputRange(
-      const ParsedInternalKey& key) const override {
-    return (!key.user_key.starts_with("unsafe_pb"));
-  }
-
   bool key_not_exists_beyond_output_level = false;
 
   bool is_bottommost_level = false;
@@ -844,123 +839,6 @@ TEST_P(CompactionIteratorTest, ZeroSeqOfKeyAndSnapshot) {
 INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
                         testing::Values(true, false));
 
-class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
- public:
-  bool SupportsPerKeyPlacement() const override { return true; }
-};
-
-TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
-  std::atomic_uint64_t latest_cold_seq = 0;
-
-  SyncPoint::GetInstance()->SetCallBack(
-      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
-        auto context = static_cast<PerKeyPlacementContext*>(arg);
-        context->output_to_penultimate_level =
-            context->seq_num > latest_cold_seq;
-      });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  latest_cold_seq = 5;
-
-  InitIterators(
-      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
-       test::KeyStr("c", 5, kTypeValue)},
-      {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
-      nullptr, nullptr, true);
-  c_iter_->SeekToFirst();
-  ASSERT_TRUE(c_iter_->Valid());
-
-  // the first 2 keys are hot, which should has
-  // `output_to_penultimate_level()==true` and seq num not zeroed out
-  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
-  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
-  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  // `a` is cold data, which should be output to bottommost
-  ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
-  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_OK(c_iter_->status());
-  ASSERT_FALSE(c_iter_->Valid());
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-}
-
-TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
-  AddSnapshot(5);
-
-  InitIterators(
-      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
-       test::KeyStr("b", 5, kTypeValue)},
-      {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
-      nullptr, nullptr, true);
-  c_iter_->SeekToFirst();
-  ASSERT_TRUE(c_iter_->Valid());
-
-  // The first key and the tombstone are within snapshot, which should output
-  // to the penultimate level (and seq num cannot be zeroed out).
-  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
-  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
-  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  // `a` is not protected by the snapshot, the sequence number is zero out and
-  // should output bottommost
-  ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
-  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
-  c_iter_->Next();
-  ASSERT_OK(c_iter_->status());
-  ASSERT_FALSE(c_iter_->Valid());
-}
-
-TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
-  std::atomic_uint64_t latest_cold_seq = 0;
-
-  SyncPoint::GetInstance()->SetCallBack(
-      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
-        auto context = static_cast<PerKeyPlacementContext*>(arg);
-        context->output_to_penultimate_level =
-            context->seq_num > latest_cold_seq;
-      });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  latest_cold_seq = 6;
-
-  AddSnapshot(5);
-
-  InitIterators({test::KeyStr("a", 7, kTypeValue),
-                 test::KeyStr("unsafe_pb", 6, kTypeValue),
-                 test::KeyStr("c", 5, kTypeValue)},
-                {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
-                kMaxSequenceNumber, nullptr, nullptr, true);
-  c_iter_->SeekToFirst();
-  ASSERT_TRUE(c_iter_->Valid());
-
-  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
-  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
-  // the 2nd key is unsafe to output_to_penultimate_level, but it's within
-  // snapshot so for per_key_placement feature it has to be outputted to the
-  // penultimate level. which is a corruption. We should never see
-  // such case as the data with seq num (within snapshot) should always come
-  // from higher compaction input level, which makes it safe to
-  // output_to_penultimate_level.
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->status().IsCorruption());
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-}
-
-INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
-                        PerKeyPlacementCompIteratorTest,
-                        testing::Values(true, false));
-
 // Tests how CompactionIterator work together with SnapshotChecker.
 class CompactionIteratorWithSnapshotCheckerTest
     : public CompactionIteratorTest {
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 3a7a19edb3f..887688ad90f 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -298,6 +298,8 @@ void CompactionJob::Prepare(
 
   // collect all seqno->time information from the input files which will be used
   // to encode seqno->time to the output files.
+  SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber;
+  SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber;
   uint64_t preserve_time_duration =
       std::max(c->mutable_cf_options()->preserve_internal_time_seconds,
                c->mutable_cf_options()->preclude_last_level_data_seconds);
@@ -330,8 +332,8 @@ void CompactionJob::Prepare(
           "Failed to get current time in compaction: Status: %s",
          s.ToString().c_str());
       // preserve all time information
-      preserve_time_min_seqno_ = 0;
-      preclude_last_level_min_seqno_ = 0;
+      preserve_time_min_seqno = 0;
+      preclude_last_level_min_seqno = 0;
       seqno_to_time_mapping_.Enforce();
     } else {
       seqno_to_time_mapping_.Enforce(_current_time);
@@ -339,7 +341,7 @@ void CompactionJob::Prepare(
           static_cast<uint64_t>(_current_time),
           c->mutable_cf_options()->preserve_internal_time_seconds,
          c->mutable_cf_options()->preclude_last_level_data_seconds,
-          &preserve_time_min_seqno_, &preclude_last_level_min_seqno_);
+          &preserve_time_min_seqno, &preclude_last_level_min_seqno);
     }
     // For accuracy of the GetProximalSeqnoBeforeTime queries above, we only
     // limit the capacity after them.
@@ -353,15 +355,42 @@ void CompactionJob::Prepare(
     seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries);
   }
 #ifndef NDEBUG
-  assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+  assert(preserve_time_min_seqno <= preclude_last_level_min_seqno);
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::PrepareTimes():preclude_last_level_min_seqno",
-      static_cast<void*>(&preclude_last_level_min_seqno_));
+      static_cast<void*>(&preclude_last_level_min_seqno));
   // Restore the invariant asserted above, in case it was broken under the
   // callback
-  preserve_time_min_seqno_ =
-      std::min(preclude_last_level_min_seqno_, preserve_time_min_seqno_);
+  preserve_time_min_seqno =
+      std::min(preclude_last_level_min_seqno, preserve_time_min_seqno);
 #endif
+
+  // Preserve sequence numbers for preserved write times and snapshots, though
+  // the specific sequence number of the earliest snapshot can be zeroed.
+  preserve_seqno_after_ =
+      std::max(preserve_time_min_seqno, SequenceNumber{1}) - 1;
+  preserve_seqno_after_ = std::min(preserve_seqno_after_, earliest_snapshot_);
+  // If using preclude feature, also preclude snapshots from last level, just
+  // because they are heuristically more likely to be accessed than non-snapshot
+  // data.
+  if (preclude_last_level_min_seqno < kMaxSequenceNumber &&
+      earliest_snapshot_ < preclude_last_level_min_seqno) {
+    preclude_last_level_min_seqno = earliest_snapshot_;
+  }
+  // Now combine what we would like to preclude from last level with what we
+  // can safely support without dangerously moving data back up the LSM tree,
+  // to get the final seqno threshold for penultimate vs. last. In particular,
+  // when the reserved output key range for the penultimate level does not
+  // include the entire last level input key range, we need to keep entries
+  // already in the last level there. (Even allowing within-range entries to
+  // move back up could cause problems with range tombstones. Perhaps it
+  // would be better in some rare cases to keep entries in the last level
+  // one-by-one rather than based on sequence number, but that would add extra
+  // tracking and complexity to CompactionIterator that is probably not
+  // worthwhile overall. Correctness is also more clear when splitting by
+  // seqno threshold.)
+  penultimate_after_seqno_ = std::max(preclude_last_level_min_seqno,
+                                      c->GetKeepInLastLevelThroughSeqno());
 }
 
 uint64_t CompactionJob::GetSubcompactionsLimit() {
@@ -1325,8 +1354,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       sub_compact->compaction
          ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
       sub_compact->compaction, compaction_filter, shutting_down_,
-      db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
-      preclude_last_level_min_seqno_);
+      db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
 
   c_iter->SeekToFirst();
 
   const auto& c_iter_stats = c_iter->iter_stats();
@@ -1372,14 +1400,41 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       last_cpu_micros = cur_cpu_micros;
     }
 
+    const auto& ikey = c_iter->ikey();
+    bool use_penultimate_output = ikey.sequence > penultimate_after_seqno_;
+#ifndef NDEBUG
+    if (sub_compact->compaction->SupportsPerKeyPlacement()) {
+      // Could be overridden by unittest
+      PerKeyPlacementContext context(sub_compact->compaction->output_level(),
+                                     ikey.user_key, c_iter->value(),
+                                     ikey.sequence, use_penultimate_output);
+      TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+                               &context);
+      if (use_penultimate_output) {
+        // Verify that entries sent to the penultimate level are within the
+        // allowed range (because the input key range of the last level could
+        // be larger than the allowed output key range of the penultimate
+        // level). This check uses user keys (ignores sequence numbers) because
+        // compaction boundaries are a "clean cut" between user keys (see
+        // CompactionPicker::ExpandInputsToCleanCut()), which is especially
+        // important when preferred sequence numbers have been swapped in for
+        // kTypeValuePreferredSeqno / TimedPut.
+        sub_compact->compaction->TEST_AssertWithinPenultimateLevelOutputRange(
+            c_iter->user_key());
+      }
+    } else {
+      assert(penultimate_after_seqno_ == kMaxSequenceNumber);
+      assert(!use_penultimate_output);
+    }
+#endif  // NDEBUG
+
     // Add current compaction_iterator key to target compaction output, if the
     // output file needs to be close or open, it will call the `open_file_func`
     // and `close_file_func`.
     // TODO: it would be better to have the compaction file open/close moved
     // into `CompactionOutputs` which has the output file information.
-    status =
-        sub_compact->AddToOutput(*c_iter, c_iter->output_to_penultimate_level(),
-                                 open_file_func, close_file_func);
+    status = sub_compact->AddToOutput(*c_iter, use_penultimate_output,
+                                      open_file_func, close_file_func);
     if (!status.ok()) {
       break;
     }
@@ -1586,19 +1641,32 @@ Status CompactionJob::FinishCompactionOutputFile(
     earliest_snapshot = existing_snapshots_[0];
   }
   if (s.ok()) {
+    // Inclusive lower bound, exclusive upper bound
+    std::pair<SequenceNumber, SequenceNumber> keep_seqno_range{
+        0, kMaxSequenceNumber};
+    if (sub_compact->compaction->SupportsPerKeyPlacement()) {
+      if (outputs.IsPenultimateLevel()) {
+        keep_seqno_range.first = penultimate_after_seqno_;
+      } else {
+        keep_seqno_range.second = penultimate_after_seqno_;
+      }
+    }
     CompactionIterationStats range_del_out_stats;
-    // if the compaction supports per_key_placement, only output range dels to
-    // the penultimate level.
-    // Note: Use `bottommost_level_ = true` for both bottommost and
+    // NOTE1: Use `bottommost_level_ = true` for both bottommost and
     // output_to_penultimate_level compaction here, as it's only used to decide
-    // if range dels could be dropped.
-    if (sub_compact->HasRangeDel() &&
-        (!sub_compact->compaction->SupportsPerKeyPlacement() ^
-         outputs.IsPenultimateLevel())) {
-      s = outputs.AddRangeDels(
-          *sub_compact->RangeDelAgg(), comp_start_user_key, comp_end_user_key,
-          range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
-          earliest_snapshot, next_table_min_key, full_history_ts_low_);
+    // if range dels could be dropped. (Logically, we are taking a single sorted
+    // run returned from CompactionIterator and physically splitting it between
+    // two output levels.)
+    // NOTE2: with per-key placement, range tombstones will be filtered on
+    // each output level based on sequence number (traversed twice). This is
+    // CPU-inefficient for a large number of range tombstones, but that would
+    // be an unusual workload.
+    if (sub_compact->HasRangeDel()) {
+      s = outputs.AddRangeDels(*sub_compact->RangeDelAgg(), comp_start_user_key,
+                               comp_end_user_key, range_del_out_stats,
+                               bottommost_level_, cfd->internal_comparator(),
+                               earliest_snapshot, keep_seqno_range,
+                               next_table_min_key, full_history_ts_low_);
     }
     RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
     TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
@@ -1998,9 +2066,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
       bottommost_level_, TableFileCreationReason::kCompaction,
       0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
       sub_compact->compaction->max_output_file_size(), file_number,
-      preclude_last_level_min_seqno_ == kMaxSequenceNumber
-          ? preclude_last_level_min_seqno_
-          : std::min(earliest_snapshot_, preclude_last_level_min_seqno_));
+      penultimate_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/);
 
   outputs.NewBuilder(tboptions);
 
@@ -2172,8 +2238,8 @@ void CompactionJob::LogCompaction() {
                    ? int64_t{-1}  // Use -1 for "none"
                    : static_cast<int64_t>(existing_snapshots_[0]));
     if (compaction->SupportsPerKeyPlacement()) {
-      stream << "preclude_last_level_min_seqno"
-             << preclude_last_level_min_seqno_;
+      stream << "penultimate_after_seqno" << penultimate_after_seqno_;
+      stream << "preserve_seqno_after" << preserve_seqno_after_;
       stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
       stream << "penultimate_output_range"
             << GetCompactionPenultimateOutputRangeTypeString(
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index de80b7f30ee..2108eec5e30 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -359,15 +359,14 @@ class CompactionJob {
   // it also collects the smallest_seqno -> oldest_ancester_time from the SST.
   SeqnoToTimeMapping seqno_to_time_mapping_;
 
-  // Minimal sequence number for preserving the time information. The time info
-  // older than this sequence number won't be preserved after the compaction and
-  // if it's bottommost compaction, the seq num will be zeroed out.
-  SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+  // Max seqno that can be zeroed out in last level, including for preserving
+  // write times.
+  SequenceNumber preserve_seqno_after_ = kMaxSequenceNumber;
 
   // Minimal sequence number to preclude the data from the last level. If the
   // key has bigger (newer) sequence number than this, it will be precluded from
   // the last level (output to penultimate level).
-  SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+  SequenceNumber penultimate_after_seqno_ = kMaxSequenceNumber;
 
   // Get table file name in where it's outputting to, which should also be in
   // `output_directory_`.
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 2865b355983..f8a6b006c9a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -1535,14 +1535,11 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
       /*verify_func=*/[&](Compaction& comp) {
         for (char c = 'a'; c <= 'z'; c++) {
           if (c == 'a') {
-            ParsedInternalKey pik("a", 0U, kTypeValue);
-            ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(pik));
+            comp.TEST_AssertWithinPenultimateLevelOutputRange(
+                "a", true /*expect_failure*/);
           } else {
             std::string c_str{c};
-            // WithinPenultimateLevelOutputRange checks internal key range.
-            // 'z' is the last key, so set seqno properly.
-            ParsedInternalKey pik(c_str, c == 'z' ? 12U : 0U, kTypeValue);
-            ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(pik));
+            comp.TEST_AssertWithinPenultimateLevelOutputRange(c_str);
           }
         }
       });
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 7d9c6811eeb..b724dd39357 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -459,6 +459,7 @@ Status CompactionOutputs::AddRangeDels(
     const Slice* comp_start_user_key, const Slice* comp_end_user_key,
     CompactionIterationStats& range_del_out_stats, bool bottommost_level,
     const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+    std::pair<SequenceNumber, SequenceNumber> keep_seqno_range,
     const Slice& next_table_min_key, const std::string& full_history_ts_low) {
   // The following example does not happen since
   // CompactionOutput::ShouldStopBefore() always return false for the first
@@ -587,6 +588,12 @@ Status CompactionOutputs::AddRangeDels(
   for (it->SeekToFirst(); it->Valid(); it->Next()) {
     auto tombstone = it->Tombstone();
     auto kv = tombstone.Serialize();
+    // Filter out by seqno for per-key placement
+    if (tombstone.seq_ < keep_seqno_range.first ||
+        tombstone.seq_ >= keep_seqno_range.second) {
+      continue;
+    }
+
     InternalKey tombstone_end = tombstone.SerializeEndKey();
     // TODO: the underlying iterator should support clamping the bounds.
     // tombstone_end.Encode is of form user_key@kMaxSeqno
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 46db1f82264..33259be4670 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -184,14 +184,13 @@ class CompactionOutputs {
   // @param next_table_min_key internal key lower bound for the next compaction
   // output.
   // @param full_history_ts_low used for range tombstone garbage collection.
-  Status AddRangeDels(CompactionRangeDelAggregator& range_del_agg,
-                      const Slice* comp_start_user_key,
-                      const Slice* comp_end_user_key,
-                      CompactionIterationStats& range_del_out_stats,
-                      bool bottommost_level, const InternalKeyComparator& icmp,
-                      SequenceNumber earliest_snapshot,
-                      const Slice& next_table_min_key,
-                      const std::string& full_history_ts_low);
+  Status AddRangeDels(
+      CompactionRangeDelAggregator& range_del_agg,
+      const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+      CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+      const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+      std::pair<SequenceNumber, SequenceNumber> keep_seqno_range,
+      const Slice& next_table_min_key, const std::string& full_history_ts_low);
 
  private:
   friend class SubcompactionState;
@@ -261,7 +260,9 @@ class CompactionOutputs {
       const CompactionFileOpenFunc& open_file_func,
       const CompactionFileCloseFunc& close_file_func) {
     Status status = curr_status;
-    // handle subcompaction containing only range deletions
+    // Handle subcompaction containing only range deletions. They could
+    // be dropped or sent to another output level, so this is only an
+    // over-approximate check for whether opening is needed.
     if (status.ok() && !HasBuilder() && !HasOutput() && range_del_agg &&
         !range_del_agg->IsEmpty()) {
       status = open_file_func(*this);
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 1d45939ad7d..5b823cb3bac 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -15,8 +15,19 @@ namespace ROCKSDB_NAMESPACE {
 
 void SubcompactionState::AggregateCompactionOutputStats(
     InternalStats::CompactionStatsFull& compaction_stats) const {
+  // Outputs should be closed. By extension, any files created just for
+  // range deletes have already been written also.
+  assert(compaction_outputs_.HasBuilder() == false);
+  assert(penultimate_level_outputs_.HasBuilder() == false);
+
+  // FIXME: These stats currently include abandoned output files
+  // assert(compaction_outputs_.stats_.num_output_files ==
+  //        compaction_outputs_.outputs_.size());
+  // assert(penultimate_level_outputs_.stats_.num_output_files ==
+  //        penultimate_level_outputs_.outputs_.size());
+
   compaction_stats.stats.Add(compaction_outputs_.stats_);
-  if (penultimate_level_outputs_.HasOutput() || HasRangeDel()) {
+  if (penultimate_level_outputs_.HasOutput()) {
     compaction_stats.has_penultimate_level_output = true;
     compaction_stats.penultimate_level_stats.Add(
         penultimate_level_outputs_.stats_);
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 0927dd9f559..6a28f74d908 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -81,14 +81,8 @@ class SubcompactionState {
   // it returns both the last level outputs and penultimate level outputs.
   OutputIterator GetOutputs() const;
 
-  // Assign range dels aggregator, for each range_del, it can only be assigned
-  // to one output level, for per_key_placement, it's going to be the
-  // penultimate level.
-  // TODO: This does not work for per_key_placement + user-defined timestamp +
-  // DeleteRange() combo. If user-defined timestamp is enabled,
-  // it is possible for a range tombstone to belong to bottommost level (
-  // seqno < earliest snapshot) without being dropped (garbage collection
-  // for user-defined timestamp).
+  // Assign range dels aggregator. The various tombstones will potentially
+  // be filtered to different outputs.
   void AssignRangeDelAggregator(
      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
     assert(range_del_agg_ == nullptr);
@@ -199,12 +193,16 @@ class SubcompactionState {
     // Call FinishCompactionOutputFile() even if status is not ok: it needs to
     // close the output file.
     // CloseOutput() may open new compaction output files.
-    Status s = penultimate_level_outputs_.CloseOutput(
-        curr_status, per_key ? range_del_agg_.get() : nullptr, open_file_func,
-        close_file_func);
-    s = compaction_outputs_.CloseOutput(
-        s, per_key ? nullptr : range_del_agg_.get(), open_file_func,
-        close_file_func);
+    Status s = curr_status;
+    if (per_key) {
+      s = penultimate_level_outputs_.CloseOutput(
+          s, range_del_agg_.get(), open_file_func, close_file_func);
+    } else {
+      assert(penultimate_level_outputs_.HasBuilder() == false);
+      assert(penultimate_level_outputs_.HasOutput() == false);
+    }
+    s = compaction_outputs_.CloseOutput(s, range_del_agg_.get(), open_file_func,
+                                        close_file_func);
     return s;
   }
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index 5e691ef83d1..eed5cb936f0 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -338,7 +338,10 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 }
 
-TEST_F(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+// This test was essentially for a hacked-up version of future functionality.
+// It can be resurrected if/when a form of range-based tiering is properly
+// implemented.
+TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kNumKeys = 100;
@@ -493,6 +496,7 @@ TEST_F(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
   // verify data
   std::string value;
   for (int i = 0; i < kNumKeys; i++) {
+    SCOPED_TRACE(Key(i));
     if (i < 70) {
       ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
     } else {
@@ -1104,7 +1108,13 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 }
 
-TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+// This test was essentially for a hacked-up version of future functionality.
+// It can be resurrected if/when a form of range-based tiering is properly
+// implemented.
+// FIXME: aside from that, this test reproduces a near-endless compaction
+// cycle that needs to be reproduced independently and fixed before
+// leveled compaction can be used with the preclude feature in production.
+TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kNumKeys = 100;
@@ -1213,6 +1223,9 @@ TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) {
 
   // Tests that we only compact keys up to penultimate level
   // that are within penultimate level input's internal key range.
+  // UPDATE: this functionality has changed. With penultimate-enabled
+  // compaction, the expanded potential output range in the penultimate
+  // level is reserved, so it should be safe to use.
   {
     MutexLock l(&mutex);
     hot_start = Key(0);
@@ -1221,7 +1234,6 @@ TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) {
     const Snapshot* temp_snap = db_->GetSnapshot();
     // Key(0) and Key(1) here are inserted with higher sequence number
     // than Key(0) and Key(1) inserted above.
-    // Only Key(0) in last level will be compacted up, not Key(1).
     ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
     ASSERT_OK(Put(Key(1), "value" + std::to_string(100)));
     ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
@@ -1231,14 +1243,17 @@ TEST_F(TieredCompactionTest, RangeBasedTieredStorageLevel) {
    db_->GetLiveFilesMetaData(&metas);
    for (const auto& f : metas) {
      if (f.temperature == Temperature::kUnknown) {
-        // Expect Key(0), Key(0), Key(1)
-        ASSERT_EQ(f.num_entries, 3);
+        // UPDATED (was 3 entries, Key0..Key1)
+        ASSERT_EQ(f.num_entries, 52);
         ASSERT_EQ(f.smallestkey, Key(0));
-        ASSERT_EQ(f.largestkey, Key(1));
+        ASSERT_EQ(f.largestkey, Key(49));
       } else {
         ASSERT_EQ(f.temperature, Temperature::kCold);
-        // Key(2)-Key(49) and Key(100).
-        ASSERT_EQ(f.num_entries, 50);
+        // UPDATED (was 50 entries, Key0..Key49)
+        // Key(100) is outside the hot range
+        ASSERT_EQ(f.num_entries, 1);
+        ASSERT_EQ(f.smallestkey, Key(100));
+        ASSERT_EQ(f.largestkey, Key(100));
       }
     }
   }
@@ -1279,8 +1294,6 @@ class PrecludeLastLevelTestBase : public DBTestBase {
       const std::unordered_map<std::string, std::string>& db_config_change) {
     if (dynamic) {
       if (config_change.size() > 0) {
-        // FIXME: temporary while preserve/preclude options are not user mutable
-        SaveAndRestore<bool> m(&TEST_allowSetOptionsImmutableInMutable, true);
         ASSERT_OK(db_->SetOptions(config_change));
       }
       if (db_config_change.size() > 0) {
@@ -1754,13 +1767,12 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
   VerifyLogicalState(__LINE__);
 
   // Try a limited range compaction
-  // FIXME: these currently hit the "Unsafe to store Seq later than snapshot"
-  // error. Needs to work safely for preclude option to be user mutable.
+  // (These would previously hit "Unsafe to store Seq later than snapshot")
   if (universal) {
-    // uint64_t middle_l5 = GetLevelFileMetadatas(5)[1]->fd.GetNumber();
-    // ASSERT_OK(db_->CompactFiles({}, {MakeTableFileName(middle_l5)}, 6));
+    uint64_t middle_l5 = GetLevelFileMetadatas(5)[1]->fd.GetNumber();
+    ASSERT_OK(db_->CompactFiles({}, {MakeTableFileName(middle_l5)}, 6));
   } else {
-    // ASSERT_OK(CompactRange({}, Key(3), Key(4)));
+    ASSERT_OK(CompactRange({}, Key(3), Key(4)));
   }
   EXPECT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
@@ -1772,8 +1784,11 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
   EXPECT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
   // Key1 should have been migrated out of the last level
-  auto& meta = *GetLevelFileMetadatas(6)[0];
-  ASSERT_LT(Key(1), meta.smallest.user_key().ToString());
+  // FIXME: doesn't yet work with leveled compaction
+  if (universal) {
+    auto& meta = *GetLevelFileMetadatas(6)[0];
+    ASSERT_LT(Key(1), meta.smallest.user_key().ToString());
+  }
 
   // Make data eligible for last level
   db_->ReleaseSnapshot(snapshot);
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index f5ff66cff20..3e4e719a762 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -423,6 +423,15 @@ bool StressTest::BuildOptionsTable() {
                              "{{temperature=kCold;age=100}}", "{}"});
   }
 
+  // NOTE: allow -1 to mean starting disabled but dynamically changing.
+  // But 0 means tiering is disabled for the entire run.
+  if (FLAGS_preclude_last_level_data_seconds != 0) {
+    options_tbl.emplace("preclude_last_level_data_seconds",
+                        std::vector<std::string>{"0", "5", "30", "5000"});
+  }
+  options_tbl.emplace("preserve_internal_time_seconds",
+                      std::vector<std::string>{"0", "5", "30", "5000"});
+
   options_table_ = std::move(options_tbl);
 
   for (const auto& iter : options_table_) {
@@ -4200,8 +4209,9 @@ void InitializeOptionsFromFlags(
       exit(1);
     }
   }
+  // NOTE: allow -1 to mean starting disabled but dynamically changing
   options.preclude_last_level_data_seconds =
-      FLAGS_preclude_last_level_data_seconds;
+      std::max(FLAGS_preclude_last_level_data_seconds, int64_t{0});
   options.preserve_internal_time_seconds =
       FLAGS_preserve_internal_time_seconds;
   switch (FLAGS_rep_factory) {
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 88ca0325f5b..d50eade9320 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -555,11 +555,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"preclude_last_level_data_seconds",
          {offsetof(struct MutableCFOptions, preclude_last_level_data_seconds),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionTypeFlags::kMutable}},
         {"preserve_internal_time_seconds",
          {offsetof(struct MutableCFOptions, preserve_internal_time_seconds),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionTypeFlags::kMutable}},
         {"bottommost_temperature",
          {0, OptionType::kTemperature, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kMutable}},
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index a43f5261a79..ef40dc302bc 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -600,7 +600,10 @@ def is_direct_io_supported(dbname):
 tiered_params = {
     # For Leveled/Universal compaction (ignored for FIFO)
     # Bias toward times that can elapse during a crash test run series
-    "preclude_last_level_data_seconds": lambda: random.choice([10, 60, 1200, 86400]),
+    # NOTE: -1 means starting disabled but dynamically changing
+    "preclude_last_level_data_seconds": lambda: random.choice(
+        [-1, -1, 10, 60, 1200, 86400]
+    ),
     "last_level_temperature": "kCold",
     # For FIFO compaction (ignored otherwise)
     "file_temperature_age_thresholds": lambda: random.choice(
@@ -999,7 +1002,10 @@ def finalize_and_sanitize(src_params):
         or dest_params.get("delrangepercent") == 0
     ):
         dest_params["test_ingest_standalone_range_deletion_one_in"] = 0
-    if dest_params.get("use_txn", 0) == 1 and dest_params.get("commit_bypass_memtable_one_in", 0) > 0:
+    if (
+        dest_params.get("use_txn", 0) == 1
+        and dest_params.get("commit_bypass_memtable_one_in", 0) > 0
+    ):
         dest_params["enable_blob_files"] = 0
         dest_params["allow_setting_blob_options_dynamically"] = 0
         dest_params["atomic_flush"] = 0
diff --git a/unreleased_history/behavior_changes/mutable_tiering_options.md b/unreleased_history/behavior_changes/mutable_tiering_options.md
new file mode 100644
index 00000000000..3584362983c
--- /dev/null
+++ b/unreleased_history/behavior_changes/mutable_tiering_options.md
@@ -0,0 +1 @@
+* Experimental tiering options `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` are now mutable with `SetOptions()`. Changes to how these features are handled in the presence of long-lived snapshots and range deletes made this possible.
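A minimal usage sketch (not part of the patch) of what the release note above enables: changing the tiering options at runtime through the public DB::SetOptions() API instead of reopening the DB. The database path and the chosen values here are arbitrary assumptions for illustration; the option names are the ones made mutable by this change.

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Tiering can start disabled (0 = feature off)...
  options.preclude_last_level_data_seconds = 0;
  options.last_level_temperature = rocksdb::Temperature::kCold;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/tiering_demo", &db);
  assert(s.ok());

  // ...and, with this change, be enabled or adjusted later without reopening
  // the DB. Both options take values in seconds, passed as strings.
  s = db->SetOptions({{"preclude_last_level_data_seconds", "600"},
                      {"preserve_internal_time_seconds", "600"}});
  assert(s.ok());

  delete db;
  return 0;
}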