Skip to content

Commit

Permalink
Merge pull request #33177 from vespa-engine/toregge/use-weighted-array-instead-of-bitvector-when-imported-filter-attribute-has-few-hits
Browse files Browse the repository at this point in the history

Use weighted array instead of bitvector when imported filter attribute has few hits
  • Loading branch information
geirst authored Jan 24, 2025
2 parents 2c39cba + c9c7d6d commit 99826c0
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -521,15 +521,17 @@ TEST(ImportedSearchContextTest, queryTerm_returns_term_context_was_created_with)
}

struct SearchCacheFixture : Fixture {
SearchCacheFixture() : Fixture(true) {
reset_with_single_value_reference_mappings<IntegerAttribute, int32_t>(
SearchCacheFixture(FilterConfig filter_config = FilterConfig::ExplicitlyEnabled)
: Fixture(true)
{
reset_with_wset_value_reference_mappings<IntegerAttribute, WeightedInt>(
BasicType::INT32,
{{DocId(3), dummy_gid(5), DocId(5), 5678},
{DocId(4), dummy_gid(6), DocId(6), 1234},
{DocId(5), dummy_gid(8), DocId(8), 5678},
{DocId(7), dummy_gid(9), DocId(9), 4321}},
{{DocId(3), dummy_gid(5), DocId(5), {{5678, 5}}},
{DocId(4), dummy_gid(6), DocId(6), {{1234, 7}}},
{DocId(5), dummy_gid(8), DocId(8), {{5678, 9}}},
{DocId(7), dummy_gid(9), DocId(9), {{4321, 11}}}},
FastSearchConfig::ExplicitlyEnabled,
FilterConfig::ExplicitlyEnabled);
filter_config);
}
~SearchCacheFixture() override;
};
Expand Down Expand Up @@ -567,25 +569,58 @@ get_bitvector_hits(const BitVector &bitVector)
return actDocsIds;
}

TEST(ImportedSearchContextTest, entry_is_inserted_into_search_cache_if_bit_vector_posting_list_is_used)
// Exercises posting-list merging in ImportedSearchContext and its interaction with
// the imported attribute's search cache. Depending on the filter config and the
// size of the child (reference) lid space, the merged postings end up as either a
// bitvector (which is inserted into the search cache) or a weighted array (which
// is not).
void test_search_cache(bool increase_child_lidspace, FilterConfig filter_config)
{
SearchCacheFixture f(filter_config);
if (increase_child_lidspace) {
// Grow the child lid space so that docid_limit / bitvector_limit_divisor
// exceeds the estimated hit count, steering the merger to a weighted array.
f.reference_attr->addDocs(2 * ImportedSearchContext::bitvector_limit_divisor);
f.reference_attr->commit();
}
// Cache starts empty; remember memory usage to compare after a cache insert.
EXPECT_EQ(0u, f.imported_attr->getSearchCache()->size());
auto old_mem_usage = f.imported_attr->get_memory_usage();
auto ctx = f.create_context(word_term("5678"));
// Strict fetch triggers merged postings creation and a potential cache insert.
ctx->fetchPostings(queryeval::ExecuteInfo::FULL, true);
TermFieldMatchData match;
auto iter = f.create_strict_iterator(*ctx, match);
// Docs 3 and 5 are mapped to target docs holding the value 5678 (see fixture).
EXPECT_EQ(SimpleResult({3, 5}), f.search(*iter));
iter->initFullRange();
EXPECT_FALSE(iter->seek(1));
EXPECT_TRUE(iter->seek(3));
iter->unpack(3);
EXPECT_EQ(3, match.getDocId());
if (filter_config == FilterConfig::ExplicitlyEnabled) {
// Filter path: the unpacked weight is reported as 1.
EXPECT_EQ(1, match.getWeight());
} else {
// Non-filter path: the real weight from the fixture mapping ({5678, 5}).
EXPECT_EQ(5, match.getWeight());
}

if (increase_child_lidspace || filter_config == FilterConfig::Default) {
// weighted array: no search cache entry is added
EXPECT_EQ(0u, f.imported_attr->getSearchCache()->size());
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
} else {
// bitvector: a cache entry is added and memory usage grows accordingly
EXPECT_EQ(1u, f.imported_attr->getSearchCache()->size());
auto new_mem_usage = f.imported_attr->get_memory_usage();
EXPECT_LT(old_mem_usage.usedBytes(), new_mem_usage.usedBytes());
EXPECT_LT(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes());
auto cacheEntry = f.imported_attr->getSearchCache()->find("5678");
EXPECT_EQ(cacheEntry->docIdLimit, f.get_imported_attr()->getNumDocs());
EXPECT_EQ((std::vector<uint32_t>{3, 5}), get_bitvector_hits(*cacheEntry->bitVector));
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
}
}

// Filter attribute with an unchanged (small) child lid space: the bitvector
// representation is chosen and the result is inserted into the search cache.
TEST(ImportedSearchContextTest, entry_is_inserted_into_search_cache_if_bit_vector_posting_list_is_used) {
test_search_cache(false, FilterConfig::ExplicitlyEnabled);
}

// Filter attribute but with an enlarged child lid space: the estimated hit count
// falls under the bitvector limit, so a weighted array is used and nothing is cached.
TEST(ImportedSearchContextTest, entry_is_not_inserted_into_search_cache_if_weighted_array_posting_list_is_used) {
test_search_cache(true, FilterConfig::ExplicitlyEnabled);
}

EXPECT_EQ(1u, f.imported_attr->getSearchCache()->size());
auto new_mem_usage = f.imported_attr->get_memory_usage();
EXPECT_LT(old_mem_usage.usedBytes(), new_mem_usage.usedBytes());
EXPECT_LT(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes());
auto cacheEntry = f.imported_attr->getSearchCache()->find("5678");
EXPECT_EQ(cacheEntry->docIdLimit, f.get_imported_attr()->getNumDocs());
EXPECT_EQ((std::vector<uint32_t>{3, 5}), get_bitvector_hits(*cacheEntry->bitVector));
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
// Non-filter attribute: the weighted array representation is always used,
// so no search cache entry is added.
TEST(ImportedSearchContextTest, entry_is_not_inserted_into_search_cache_if_not_filter) {
test_search_cache(false, FilterConfig::Default);
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,11 @@ ImportedSearchContext::createIterator(fef::TermFieldMatchData* matchData, bool s
DocIt postings;
auto array = _merger.getArray();
postings.set(&array[0], &array[array.size()]);
return std::make_unique<AttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
if (_target_attribute.getIsFilter()) {
return std::make_unique<FilterAttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
} else {
return std::make_unique<AttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
}
}
} else if (_merger.hasBitVector()) {
return BitVectorIterator::create(_merger.getBitVector(), _merger.getDocIdLimit(), *matchData, strict);
Expand Down Expand Up @@ -268,12 +272,35 @@ class ReverseMappingPostingList

}

void ImportedSearchContext::makeMergedPostings(bool isFilter)
/**
 * Picks the representation used for the merged postings of this search context.
 * Non-filter attributes always get a weighted array; filter attributes get a
 * weighted array only when the estimated hit count is small, otherwise a bitvector.
 */
ImportedSearchContext::MergedPostingsType
ImportedSearchContext::select_merged_postings_type(bool is_filter)
{
    if (!is_filter) {
        return MergedPostingsType::WEIGHTED_ARRAY;
    }
    /*
     * For filter attributes, prefer a weighted array when the estimated number of
     * hits is low, to minimize memory usage: with an 80M lid space and ~100
     * estimated hits, a bitvector would use 10MB while a weighted array would use
     * ~800 bytes. Always allocating bitvectors is problematic for queries with
     * many terms (e.g. a weightedset operator with 1000 or more terms).
     */
    const auto estimated_hits = calc_hit_estimate().est_hits();
    const uint32_t docid_limit = _targetLids.size();
    const uint32_t bitvector_limit = 1 + docid_limit / bitvector_limit_divisor;
    return (estimated_hits < bitvector_limit) ? MergedPostingsType::WEIGHTED_ARRAY
                                              : MergedPostingsType::BITVECTOR;
}

void
ImportedSearchContext::makeMergedPostings(MergedPostingsType merged_postings_type)
{
uint32_t committedTargetDocIdLimit = _target_attribute.getCommittedDocIdLimit();
std::atomic_thread_fence(std::memory_order_acquire);
const auto &reverseMapping = _reference_attribute.getReverseMapping();
if (isFilter) {
if (merged_postings_type == MergedPostingsType::BITVECTOR) {
_merger.allocBitVector();
TargetResult::getResult(_reference_attribute.getReverseMappingRefs(),
_reference_attribute.getReverseMapping(),
Expand Down Expand Up @@ -309,7 +336,7 @@ ImportedSearchContext::fetchPostings(const queryeval::ExecuteInfo &execInfo, boo
if (!_searchCacheLookup) {
_target_search_context->fetchPostings(execInfo, strict);
if (!_merger.merge_done() && (strict || (_target_attribute.getIsFastSearch() && execInfo.hit_rate() > 0.01))) {
makeMergedPostings(_target_attribute.getIsFilter());
makeMergedPostings(select_merged_postings_type(_target_attribute.getIsFilter()));
considerAddSearchCacheEntry();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class SearchContextParams;
class ImportedSearchContext : public ISearchContext {
using AtomicTargetLid = vespalib::datastore::AtomicValueWrapper<uint32_t>;
using TargetLids = std::span<const AtomicTargetLid>;
enum class MergedPostingsType : uint8_t {
WEIGHTED_ARRAY,
BITVECTOR
};
const ImportedAttributeVector& _imported_attribute;
std::string _queryTerm;
bool _useSearchCache;
Expand All @@ -52,7 +56,8 @@ class ImportedSearchContext : public ISearchContext {
return target_lid < _target_docid_limit ? target_lid : 0u;
}

void makeMergedPostings(bool isFilter);
MergedPostingsType select_merged_postings_type(bool is_filter);
void makeMergedPostings(MergedPostingsType merged_postings_type);
void considerAddSearchCacheEntry();
uint32_t calc_approx_hits(uint32_t target_approx_hits) const;
uint32_t calc_exact_hits() const;
Expand All @@ -63,6 +68,13 @@ class ImportedSearchContext : public ISearchContext {
const attribute::IAttributeVector &target_attribute);
~ImportedSearchContext() override;

/*
* Each array element in weighted array uses 8 bytes, thus a weighted array with docid_limit / 64 elements
* would use the same amount of memory as the bitvector. The divisor is adjusted to account for extra memory usage
* by an addition array (_startPos in posting list merger) and for cpu time spent doing the sorting.
*/
static constexpr uint32_t bitvector_limit_divisor = 150;

std::unique_ptr<queryeval::SearchIterator>
createIterator(fef::TermFieldMatchData* matchData, bool strict) override;
HitEstimate calc_hit_estimate() const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ std::shared_ptr<AttrVecType> create_array_attribute(BasicType type, std::string_
template<typename AttrVecType>
std::shared_ptr<AttrVecType> create_wset_attribute(BasicType type,
FastSearchConfig fast_search = FastSearchConfig::Default,
FilterConfig filter = FilterConfig::Default,
std::string_view name = "parent") {
return create_typed_attribute<AttrVecType>(type, CollectionType::WSET, fast_search, FilterConfig::Default, name);
return create_typed_attribute<AttrVecType>(type, CollectionType::WSET, fast_search, filter, name);
}

template<typename AttrVecType>
Expand Down Expand Up @@ -212,8 +213,9 @@ struct ImportedAttributeFixture {
void reset_with_wset_value_reference_mappings(
BasicType type,
const std::vector<LidToLidMapping<std::vector<WeightedValueType>>> &mappings,
FastSearchConfig fast_search = FastSearchConfig::Default) {
reset_with_new_target_attr(create_wset_attribute<AttrVecType>(type, fast_search));
FastSearchConfig fast_search = FastSearchConfig::Default,
FilterConfig filter = FilterConfig::Default) {
reset_with_new_target_attr(create_wset_attribute<AttrVecType>(type, fast_search, filter));
set_up_and_map<AttrVecType>(mappings, [](auto &target_vec, auto &mapping) {
for (const auto &v : mapping._value_in_target_attr) {
ASSERT_TRUE(target_vec.append(mapping._to_lid, v.value(), v.weight()));
Expand Down

0 comments on commit 99826c0

Please sign in to comment.