Skip to content

Commit a6fe3b4

Browse files
authored
feat: support pre-filter & limit in full text search (alibaba#97)
1 parent 31f0033 commit a6fe3b4

File tree

13 files changed

+430
-69
lines changed

13 files changed

+430
-69
lines changed

include/paimon/global_config.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace paimon {
2929
/// not necessarily the exact number of threads at a given point in time.
3030
///
3131
/// You can change this number using SetArrowCpuThreadPoolCapacity().
32-
PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity();
32+
PAIMON_EXPORT int32_t GetArrowCpuThreadPoolCapacity();
3333

3434
/// Set the capacity of the arrow's global thread pool
3535
/// This is a simple wrapper of arrow::SetCpuThreadPoolCapacity()
@@ -40,6 +40,6 @@ PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity();
4040
/// The current number is returned by GetArrowCpuThreadPoolCapacity().
4141
/// Currently, this capacity will significantly affect the performance
4242
/// of parquet file batch read.
43-
PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int threads);
43+
PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int32_t threads);
4444

4545
} // namespace paimon

include/paimon/predicate/full_text_search.h

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
#include <vector>
2424

2525
#include "paimon/predicate/predicate.h"
26+
#include "paimon/utils/roaring_bitmap64.h"
2627
#include "paimon/visibility.h"
27-
2828
namespace paimon {
2929
/// A configuration structure for full-text search operations.
3030
struct PAIMON_EXPORT FullTextSearch {
@@ -44,14 +44,20 @@ struct PAIMON_EXPORT FullTextSearch {
4444
UNKNOWN = 128
4545
};
4646

47-
FullTextSearch(const std::string& _field_name, int32_t _limit, const std::string& _query,
48-
const SearchType& _search_type)
49-
: field_name(_field_name), limit(_limit), query(_query), search_type(_search_type) {}
47+
FullTextSearch(const std::string& _field_name, std::optional<int32_t> _limit,
48+
const std::string& _query, const SearchType& _search_type,
49+
const std::optional<RoaringBitmap64>& _pre_filter)
50+
: field_name(_field_name),
51+
limit(_limit),
52+
query(_query),
53+
search_type(_search_type),
54+
pre_filter(_pre_filter) {}
5055

5156
/// Name of the field to search within (must be a full-text indexed field).
5257
std::string field_name;
53-
/// Maximum number of documents to return. Ordered by scores.
54-
int32_t limit;
58+
/// Maximum number of documents to return. If set, only the top-scoring documents are returned,
59+
/// up to this limit. Otherwise, no scores are returned.
60+
std::optional<int32_t> limit;
5561
/// The query string to search for. The interpretation depends on search_type:
5662
///
5763
/// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as
@@ -70,5 +76,9 @@ struct PAIMON_EXPORT FullTextSearch {
7076
std::string query;
7177
/// Type of search to perform.
7278
SearchType search_type;
79+
/// A pre-filter based on **local row IDs**, implemented by leveraging another global index.
80+
/// Only rows whose local row ID is present in `pre_filter` will be included during search.
81+
/// If not set, all rows will be included.
82+
std::optional<RoaringBitmap64> pre_filter;
7383
};
7484
} // namespace paimon

include/paimon/utils/roaring_bitmap64.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ class PAIMON_EXPORT RoaringBitmap64 {
6262
Iterator& operator++();
6363
bool operator==(const Iterator& other) const;
6464
bool operator!=(const Iterator& other) const;
65+
/// Move the iterator to the first value that is equal to or larger than the input value.
66+
void EqualOrLarger(int64_t value);
6567

6668
private:
6769
void* iterator_ = nullptr;

src/paimon/common/utils/roaring_bitmap64.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ bool RoaringBitmap64::Iterator::operator!=(const Iterator& other) const {
9797
return !(*this == other);
9898
}
9999

100+
void RoaringBitmap64::Iterator::EqualOrLarger(int64_t value) {
101+
[[maybe_unused]] bool _ = GetIterator(iterator_).move(value);
102+
}
103+
100104
RoaringBitmap64::RoaringBitmap64() {
101105
roaring_bitmap_ = new roaring::Roaring64Map();
102106
}

src/paimon/common/utils/roaring_bitmap64_test.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,4 +403,15 @@ TEST(RoaringBitmap64Test, TestFromRoaringBitmap32) {
403403
}
404404
}
405405

406+
// Checks where Iterator::EqualOrLarger() leaves the iterator for a target that is present,
// a target that falls between two stored values, and a target above every stored value.
TEST(RoaringBitmap64Test, TestIteratorEqualOrLarger) {
    const RoaringBitmap64 bitmap = RoaringBitmap64::From({1L, 3L, 5L, 100L});
    auto it = bitmap.Begin();
    ASSERT_EQ(*it, 1L);

    // Target is stored in the bitmap.
    it.EqualOrLarger(5L);
    ASSERT_EQ(*it, 5L);

    // Target is not stored; the next larger stored value is expected.
    it.EqualOrLarger(10L);
    ASSERT_EQ(*it, 100L);

    // Target exceeds every stored value; the iterator is expected to equal End().
    it.EqualOrLarger(200L);
    ASSERT_EQ(it, bitmap.End());
}
406417
} // namespace paimon::test

src/paimon/global_index/lucene/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ if(PAIMON_ENABLE_LUCENE)
4141
lucene_api_test.cpp
4242
lucene_directory_test.cpp
4343
lucene_global_index_test.cpp
44+
lucene_filter_test.cpp
4445
EXTRA_INCLUDES
4546
${LUCENE_INCLUDE_DIR}
4647
STATIC_LINK_LIBS

src/paimon/global_index/lucene/lucene_api_test.cpp

Lines changed: 90 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,67 @@
2222
#include "paimon/testing/utils/testharness.h"
2323

2424
namespace paimon::lucene::test {
25-
TEST(LuceneInterfaceTest, TestSimple) {
25+
class LuceneInterfaceTest : public ::testing::Test {
26+
public:
27+
void SetUp() override {}
28+
void TearDown() override {}
29+
30+
class TestDocIdSetIterator : public Lucene::DocIdSetIterator {
31+
public:
32+
explicit TestDocIdSetIterator(const std::vector<int32_t>& ids)
33+
: Lucene::DocIdSetIterator(), ids_(ids) {}
34+
35+
int32_t advance(int32_t target) override {
36+
int32_t doc_id = nextDoc();
37+
while (doc_id < target) {
38+
doc_id = nextDoc();
39+
}
40+
return doc_id;
41+
}
42+
int32_t docID() override {
43+
return ids_[cursor_];
44+
}
45+
int32_t nextDoc() override {
46+
if (cursor_ == ids_.size()) {
47+
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
48+
}
49+
return ids_[cursor_++];
50+
}
51+
52+
private:
53+
size_t cursor_ = 0;
54+
std::vector<int32_t> ids_;
55+
};
56+
57+
class TestDocIdSet : public Lucene::DocIdSet {
58+
public:
59+
explicit TestDocIdSet(const std::vector<int32_t>& ids) : DocIdSet(), ids_(ids) {}
60+
61+
Lucene::DocIdSetIteratorPtr iterator() override {
62+
return Lucene::newLucene<TestDocIdSetIterator>(ids_);
63+
}
64+
bool isCacheable() override {
65+
return true;
66+
}
67+
68+
private:
69+
std::vector<int32_t> ids_;
70+
};
71+
72+
class TestFilter : public Lucene::Filter {
73+
public:
74+
explicit TestFilter(const std::vector<int32_t>& ids) : ids_(ids) {}
75+
76+
Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override {
77+
return Lucene::newLucene<TestDocIdSet>(ids_);
78+
}
79+
80+
private:
81+
std::vector<int32_t> ids_;
82+
};
83+
};
84+
85+
TEST_F(LuceneInterfaceTest, TestSimple) {
2686
auto dir = paimon::test::UniqueTestDirectory::Create("local");
2787
std::string index_path = dir->Str() + "/lucene_test";
2888
auto lucene_dir = Lucene::FSDirectory::open(LuceneUtils::StringToWstring(index_path),
@@ -68,10 +128,17 @@ TEST(LuceneInterfaceTest, TestSimple) {
68128
parser->setAllowLeadingWildcard(true);
69129

70130
auto search = [&](const std::wstring& query_str, int32_t limit,
131+
const std::optional<std::vector<int32_t>> selected_id,
71132
const std::vector<int32_t>& expected_doc_id_vec,
72133
const std::vector<std::wstring>& expected_doc_id_content_vec) {
73134
Lucene::QueryPtr query = parser->parse(query_str);
74-
Lucene::TopDocsPtr results = searcher->search(query, limit);
135+
Lucene::TopDocsPtr results;
136+
if (selected_id) {
137+
Lucene::FilterPtr lucene_filter = Lucene::newLucene<TestFilter>(selected_id.value());
138+
results = searcher->search(query, lucene_filter, limit);
139+
} else {
140+
results = searcher->search(query, limit);
141+
}
75142
ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size());
76143

77144
std::vector<int32_t> resule_doc_id_vec;
@@ -86,18 +153,29 @@ TEST(LuceneInterfaceTest, TestSimple) {
86153
};
87154

88155
// result is sorted by tf-idf score
89-
search(L"document", /*limit=*/10, std::vector<int32_t>({2, 1, 0}),
156+
search(L"document", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({2, 1, 0}),
90157
std::vector<std::wstring>({L"2", L"1", L"0"}));
91-
search(L"document", /*limit=*/1, std::vector<int32_t>({2}), std::vector<std::wstring>({L"2"}));
92-
search(L"test AND document", /*limit=*/10, std::vector<int32_t>({2, 0}),
93-
std::vector<std::wstring>({L"2", L"0"}));
94-
search(L"test OR new", /*limit=*/10, std::vector<int32_t>({1, 0, 2}),
95-
std::vector<std::wstring>({L"1", L"0", L"2"}));
96-
search(L"\"test document\"", /*limit=*/10, std::vector<int32_t>({0}),
97-
std::vector<std::wstring>({L"0"}));
98-
search(L"unordered", /*limit=*/10, std::vector<int32_t>({3}),
158+
search(L"document", /*limit=*/1, /*selected_id=*/std::nullopt, std::vector<int32_t>({2}),
159+
std::vector<std::wstring>({L"2"}));
160+
search(L"test AND document", /*limit=*/10, /*selected_id=*/std::nullopt,
161+
std::vector<int32_t>({2, 0}), std::vector<std::wstring>({L"2", L"0"}));
162+
search(L"test OR new", /*limit=*/10, /*selected_id=*/std::nullopt,
163+
std::vector<int32_t>({1, 0, 2}), std::vector<std::wstring>({L"1", L"0", L"2"}));
164+
search(L"\"test document\"", /*limit=*/10, /*selected_id=*/std::nullopt,
165+
std::vector<int32_t>({0}), std::vector<std::wstring>({L"0"}));
166+
search(L"unordered", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({3}),
99167
std::vector<std::wstring>({L"5"}));
100-
search(L"*orDer*", /*limit=*/10, std::vector<int32_t>({3}), std::vector<std::wstring>({L"5"}));
168+
search(L"*orDer*", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({3}),
169+
std::vector<std::wstring>({L"5"}));
170+
171+
// test filter
172+
search(L"document", /*limit=*/10, /*selected_id=*/std::vector<int32_t>({0, 1}),
173+
std::vector<int32_t>({1, 0}), std::vector<std::wstring>({L"1", L"0"}));
174+
search(L"document OR unordered", /*limit=*/10,
175+
/*selected_id=*/std::vector<int32_t>({0, 1, 3}), std::vector<int32_t>({3, 1, 0}),
176+
std::vector<std::wstring>({L"5", L"1", L"0"}));
177+
search(L"unordered", /*limit=*/10, /*selected_id=*/std::vector<int32_t>({0}),
178+
std::vector<int32_t>(), std::vector<std::wstring>());
101179

102180
reader->close();
103181
lucene_dir->close();
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
#include "lucene++/LuceneHeaders.h"
18+
#include "paimon/utils/roaring_bitmap64.h"
19+
20+
namespace paimon::lucene {
21+
class LuceneCollector : public Lucene::Collector {
22+
public:
23+
LuceneCollector() : Lucene::Collector() {}
24+
void setScorer(const Lucene::ScorerPtr& scorer) override {
25+
// ignore scorer
26+
}
27+
void collect(int32_t doc) override {
28+
bitmap_.Add(doc_base_ + doc);
29+
}
30+
void setNextReader(const Lucene::IndexReaderPtr& reader, int32_t doc_base) override {
31+
doc_base_ = doc_base;
32+
}
33+
bool acceptsDocsOutOfOrder() override {
34+
return true;
35+
}
36+
const RoaringBitmap64& GetBitmap() const {
37+
return bitmap_;
38+
}
39+
40+
private:
41+
RoaringBitmap64 bitmap_;
42+
int64_t doc_base_ = 0;
43+
};
44+
} // namespace paimon::lucene
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
#include "lucene++/LuceneHeaders.h"
18+
#include "paimon/utils/roaring_bitmap64.h"
19+
20+
namespace paimon::lucene {
21+
class BitmapDocIdSetIterator : public Lucene::DocIdSetIterator {
22+
public:
23+
explicit BitmapDocIdSetIterator(const RoaringBitmap64* ids)
24+
: Lucene::DocIdSetIterator(), ids_(ids), iter_(ids->Begin()) {}
25+
26+
int32_t advance(int32_t target) override {
27+
iter_.EqualOrLarger(static_cast<int64_t>(target));
28+
if (iter_ == ids_->End()) {
29+
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
30+
}
31+
return static_cast<int32_t>(*iter_);
32+
}
33+
34+
int32_t docID() override {
35+
if (iter_ == ids_->End()) {
36+
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
37+
}
38+
return static_cast<int32_t>(*iter_);
39+
}
40+
41+
int32_t nextDoc() override {
42+
if (iter_ == ids_->End()) {
43+
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
44+
}
45+
auto id = static_cast<int32_t>(*iter_);
46+
++iter_;
47+
return id;
48+
}
49+
50+
private:
51+
const RoaringBitmap64* ids_;
52+
RoaringBitmap64::Iterator iter_;
53+
};
54+
55+
class BitmapDocIdSet : public Lucene::DocIdSet {
56+
public:
57+
explicit BitmapDocIdSet(const RoaringBitmap64* ids) : DocIdSet(), ids_(ids) {}
58+
59+
Lucene::DocIdSetIteratorPtr iterator() override {
60+
return Lucene::newLucene<BitmapDocIdSetIterator>(ids_);
61+
}
62+
63+
bool isCacheable() override {
64+
return true;
65+
}
66+
67+
private:
68+
const RoaringBitmap64* ids_;
69+
};
70+
71+
class LuceneFilter : public Lucene::Filter {
72+
public:
73+
explicit LuceneFilter(const RoaringBitmap64* ids) : ids_(ids) {}
74+
75+
Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override {
76+
return Lucene::newLucene<BitmapDocIdSet>(ids_);
77+
}
78+
79+
private:
80+
const RoaringBitmap64* ids_;
81+
};
82+
83+
} // namespace paimon::lucene

0 commit comments

Comments
 (0)