Skip to content

Commit

Permalink
Merge pull request #33177 from vespa-engine/toregge/use-weighted-array-instead-of-bitvector-when-imported-filter-attribute-has-few-hits
Browse files Browse the repository at this point in the history

Use weighted array instead of bitvector when imported filter attribute has few hits
  • Loading branch information
geirst authored Jan 24, 2025
2 parents 2c39cba + c9c7d6d commit 99826c0
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -521,15 +521,17 @@ TEST(ImportedSearchContextTest, queryTerm_returns_term_context_was_created_with)
}

struct SearchCacheFixture : Fixture {
SearchCacheFixture() : Fixture(true) {
reset_with_single_value_reference_mappings<IntegerAttribute, int32_t>(
SearchCacheFixture(FilterConfig filter_config = FilterConfig::ExplicitlyEnabled)
: Fixture(true)
{
reset_with_wset_value_reference_mappings<IntegerAttribute, WeightedInt>(
BasicType::INT32,
{{DocId(3), dummy_gid(5), DocId(5), 5678},
{DocId(4), dummy_gid(6), DocId(6), 1234},
{DocId(5), dummy_gid(8), DocId(8), 5678},
{DocId(7), dummy_gid(9), DocId(9), 4321}},
{{DocId(3), dummy_gid(5), DocId(5), {{5678, 5}}},
{DocId(4), dummy_gid(6), DocId(6), {{1234, 7}}},
{DocId(5), dummy_gid(8), DocId(8), {{5678, 9}}},
{DocId(7), dummy_gid(9), DocId(9), {{4321, 11}}}},
FastSearchConfig::ExplicitlyEnabled,
FilterConfig::ExplicitlyEnabled);
filter_config);
}
~SearchCacheFixture() override;
};
Expand Down Expand Up @@ -567,25 +569,58 @@ get_bitvector_hits(const BitVector &bitVector)
return actDocsIds;
}

TEST(ImportedSearchContextTest, entry_is_inserted_into_search_cache_if_bit_vector_posting_list_is_used)
// Exercises posting-list merging in ImportedSearchContext and its interaction with
// the imported attribute's search cache. Depending on the filter config and the
// size of the child (reference) lid space, the merged postings end up as either a
// bitvector (which is inserted into the search cache) or a weighted array (which
// is not).
void test_search_cache(bool increase_child_lidspace, FilterConfig filter_config)
{
SearchCacheFixture f(filter_config);
if (increase_child_lidspace) {
// Grow the child lid space so that docid_limit / bitvector_limit_divisor
// exceeds the estimated hit count, steering the merger to a weighted array.
f.reference_attr->addDocs(2 * ImportedSearchContext::bitvector_limit_divisor);
f.reference_attr->commit();
}
// Cache starts empty; remember memory usage to compare after a cache insert.
EXPECT_EQ(0u, f.imported_attr->getSearchCache()->size());
auto old_mem_usage = f.imported_attr->get_memory_usage();
auto ctx = f.create_context(word_term("5678"));
// Strict fetch triggers merged postings creation and a potential cache insert.
ctx->fetchPostings(queryeval::ExecuteInfo::FULL, true);
TermFieldMatchData match;
auto iter = f.create_strict_iterator(*ctx, match);
// Docs 3 and 5 are mapped to target docs holding the value 5678 (see fixture).
EXPECT_EQ(SimpleResult({3, 5}), f.search(*iter));
iter->initFullRange();
EXPECT_FALSE(iter->seek(1));
EXPECT_TRUE(iter->seek(3));
iter->unpack(3);
EXPECT_EQ(3, match.getDocId());
if (filter_config == FilterConfig::ExplicitlyEnabled) {
// Filter path: the unpacked weight is reported as 1.
EXPECT_EQ(1, match.getWeight());
} else {
// Non-filter path: the real weight from the fixture mapping ({5678, 5}).
EXPECT_EQ(5, match.getWeight());
}

if (increase_child_lidspace || filter_config == FilterConfig::Default) {
// weighted array: no search cache entry is added
EXPECT_EQ(0u, f.imported_attr->getSearchCache()->size());
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
} else {
// bitvector: a cache entry is added and memory usage grows accordingly
EXPECT_EQ(1u, f.imported_attr->getSearchCache()->size());
auto new_mem_usage = f.imported_attr->get_memory_usage();
EXPECT_LT(old_mem_usage.usedBytes(), new_mem_usage.usedBytes());
EXPECT_LT(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes());
auto cacheEntry = f.imported_attr->getSearchCache()->find("5678");
EXPECT_EQ(cacheEntry->docIdLimit, f.get_imported_attr()->getNumDocs());
EXPECT_EQ((std::vector<uint32_t>{3, 5}), get_bitvector_hits(*cacheEntry->bitVector));
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
}
}

// Filter attribute with an unchanged (small) child lid space: the bitvector
// representation is chosen and the result is inserted into the search cache.
TEST(ImportedSearchContextTest, entry_is_inserted_into_search_cache_if_bit_vector_posting_list_is_used) {
test_search_cache(false, FilterConfig::ExplicitlyEnabled);
}

// Filter attribute but with an enlarged child lid space: the estimated hit count
// falls under the bitvector limit, so a weighted array is used and nothing is cached.
TEST(ImportedSearchContextTest, entry_is_not_inserted_into_search_cache_if_weighted_array_posting_list_is_used) {
test_search_cache(true, FilterConfig::ExplicitlyEnabled);
}

EXPECT_EQ(1u, f.imported_attr->getSearchCache()->size());
auto new_mem_usage = f.imported_attr->get_memory_usage();
EXPECT_LT(old_mem_usage.usedBytes(), new_mem_usage.usedBytes());
EXPECT_LT(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes());
auto cacheEntry = f.imported_attr->getSearchCache()->find("5678");
EXPECT_EQ(cacheEntry->docIdLimit, f.get_imported_attr()->getNumDocs());
EXPECT_EQ((std::vector<uint32_t>{3, 5}), get_bitvector_hits(*cacheEntry->bitVector));
EXPECT_EQ(1u, f.document_meta_store->get_read_guard_cnt);
// Non-filter attribute: the weighted array representation is always used,
// so no search cache entry is added.
TEST(ImportedSearchContextTest, entry_is_not_inserted_into_search_cache_if_not_filter) {
test_search_cache(false, FilterConfig::Default);
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,11 @@ ImportedSearchContext::createIterator(fef::TermFieldMatchData* matchData, bool s
DocIt postings;
auto array = _merger.getArray();
postings.set(&array[0], &array[array.size()]);
return std::make_unique<AttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
if (_target_attribute.getIsFilter()) {
return std::make_unique<FilterAttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
} else {
return std::make_unique<AttributePostingListIteratorT<DocIt>>(*this, matchData, postings);
}
}
} else if (_merger.hasBitVector()) {
return BitVectorIterator::create(_merger.getBitVector(), _merger.getDocIdLimit(), *matchData, strict);
Expand Down Expand Up @@ -268,12 +272,35 @@ class ReverseMappingPostingList

}

void ImportedSearchContext::makeMergedPostings(bool isFilter)
/**
 * Picks the representation used for the merged postings of this search context.
 * Non-filter attributes always get a weighted array; filter attributes get a
 * weighted array only when the estimated hit count is small, otherwise a bitvector.
 */
ImportedSearchContext::MergedPostingsType
ImportedSearchContext::select_merged_postings_type(bool is_filter)
{
    if (!is_filter) {
        return MergedPostingsType::WEIGHTED_ARRAY;
    }
    /*
     * For filter attributes, prefer a weighted array when the estimated number of
     * hits is low, to minimize memory usage: with an 80M lid space and ~100
     * estimated hits, a bitvector would use 10MB while a weighted array would use
     * ~800 bytes. Always allocating bitvectors is problematic for queries with
     * many terms (e.g. a weightedset operator with 1000 or more terms).
     */
    const auto estimated_hits = calc_hit_estimate().est_hits();
    const uint32_t docid_limit = _targetLids.size();
    const uint32_t bitvector_limit = 1 + docid_limit / bitvector_limit_divisor;
    return (estimated_hits < bitvector_limit) ? MergedPostingsType::WEIGHTED_ARRAY
                                              : MergedPostingsType::BITVECTOR;
}

void
ImportedSearchContext::makeMergedPostings(MergedPostingsType merged_postings_type)
{
uint32_t committedTargetDocIdLimit = _target_attribute.getCommittedDocIdLimit();
std::atomic_thread_fence(std::memory_order_acquire);
const auto &reverseMapping = _reference_attribute.getReverseMapping();
if (isFilter) {
if (merged_postings_type == MergedPostingsType::BITVECTOR) {
_merger.allocBitVector();
TargetResult::getResult(_reference_attribute.getReverseMappingRefs(),
_reference_attribute.getReverseMapping(),
Expand Down Expand Up @@ -309,7 +336,7 @@ ImportedSearchContext::fetchPostings(const queryeval::ExecuteInfo &execInfo, boo
if (!_searchCacheLookup) {
_target_search_context->fetchPostings(execInfo, strict);
if (!_merger.merge_done() && (strict || (_target_attribute.getIsFastSearch() && execInfo.hit_rate() > 0.01))) {
makeMergedPostings(_target_attribute.getIsFilter());
makeMergedPostings(select_merged_postings_type(_target_attribute.getIsFilter()));
considerAddSearchCacheEntry();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class SearchContextParams;
class ImportedSearchContext : public ISearchContext {
using AtomicTargetLid = vespalib::datastore::AtomicValueWrapper<uint32_t>;
using TargetLids = std::span<const AtomicTargetLid>;
enum class MergedPostingsType : uint8_t {
WEIGHTED_ARRAY,
BITVECTOR
};
const ImportedAttributeVector& _imported_attribute;
std::string _queryTerm;
bool _useSearchCache;
Expand All @@ -52,7 +56,8 @@ class ImportedSearchContext : public ISearchContext {
return target_lid < _target_docid_limit ? target_lid : 0u;
}

void makeMergedPostings(bool isFilter);
MergedPostingsType select_merged_postings_type(bool is_filter);
void makeMergedPostings(MergedPostingsType merged_postings_type);
void considerAddSearchCacheEntry();
uint32_t calc_approx_hits(uint32_t target_approx_hits) const;
uint32_t calc_exact_hits() const;
Expand All @@ -63,6 +68,13 @@ class ImportedSearchContext : public ISearchContext {
const attribute::IAttributeVector &target_attribute);
~ImportedSearchContext() override;

/*
* Each array element in weighted array uses 8 bytes, thus a weighted array with docid_limit / 64 elements
* would use the same amount of memory as the bitvector. The divisor is adjusted to account for extra memory usage
* by an addition array (_startPos in posting list merger) and for cpu time spent doing the sorting.
*/
static constexpr uint32_t bitvector_limit_divisor = 150;

std::unique_ptr<queryeval::SearchIterator>
createIterator(fef::TermFieldMatchData* matchData, bool strict) override;
HitEstimate calc_hit_estimate() const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ std::shared_ptr<AttrVecType> create_array_attribute(BasicType type, std::string_
template<typename AttrVecType>
std::shared_ptr<AttrVecType> create_wset_attribute(BasicType type,
FastSearchConfig fast_search = FastSearchConfig::Default,
FilterConfig filter = FilterConfig::Default,
std::string_view name = "parent") {
return create_typed_attribute<AttrVecType>(type, CollectionType::WSET, fast_search, FilterConfig::Default, name);
return create_typed_attribute<AttrVecType>(type, CollectionType::WSET, fast_search, filter, name);
}

template<typename AttrVecType>
Expand Down Expand Up @@ -212,8 +213,9 @@ struct ImportedAttributeFixture {
void reset_with_wset_value_reference_mappings(
BasicType type,
const std::vector<LidToLidMapping<std::vector<WeightedValueType>>> &mappings,
FastSearchConfig fast_search = FastSearchConfig::Default) {
reset_with_new_target_attr(create_wset_attribute<AttrVecType>(type, fast_search));
FastSearchConfig fast_search = FastSearchConfig::Default,
FilterConfig filter = FilterConfig::Default) {
reset_with_new_target_attr(create_wset_attribute<AttrVecType>(type, fast_search, filter));
set_up_and_map<AttrVecType>(mappings, [](auto &target_vec, auto &mapping) {
for (const auto &v : mapping._value_in_target_attr) {
ASSERT_TRUE(target_vec.append(mapping._to_lid, v.value(), v.weight()));
Expand Down

0 comments on commit 99826c0

Please sign in to comment.