Skip to content

Commit 66616ff

Browse files
authored
Merge branch 'main' into feat/zstd-level-config
2 parents 5c97b11 + 2cb98dc commit 66616ff

File tree

7 files changed

+23
-26
lines changed

7 files changed

+23
-26
lines changed

src/paimon/global_index/lucene/jieba_analyzer.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ void JiebaTokenizer::CutWithMode(const std::string& tokenize_mode, const cppjieb
8282
}
8383
}
8484

85+
bool JiebaTokenizer::IsWhitespaceOnly(const std::string& term) {  // Returns true when `term` is empty or every byte of it is classified as whitespace by std::isspace.
86+
return term.empty() ||  // Explicit empty check states the intent; std::all_of over an empty range would also yield true.
87+
std::all_of(term.begin(), term.end(), [](unsigned char c) { return std::isspace(c); });  // Taking each byte as unsigned char keeps std::isspace well-defined for bytes >= 0x80. NOTE(review): the check is byte-wise, so multi-byte whitespace such as U+3000 is presumably not matched - confirm whether that matters for tokenizer output.
88+
}
89+
8590
void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words,
8691
std::vector<std::string>* input_ptr,
8792
std::vector<std::string_view>* output_ptr) {
@@ -90,11 +95,13 @@ void JiebaTokenizer::Normalize(const std::unordered_set<std::string>& stop_words
9095
output.clear();
9196
output.reserve(input.size());
9297
for (auto& term : input) {
98+
if (IsWhitespaceOnly(term)) {
99+
continue;
100+
}
93101
// remove stop words
94102
if (stop_words.find(term) != stop_words.end()) {
95103
continue;
96104
}
97-
98105
// to lower case
99106
bool is_alphanumeric = true;
100107
for (const auto& c : term) {

src/paimon/global_index/lucene/jieba_analyzer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ class JiebaTokenizer : public Lucene::Tokenizer {
6060
private:
6161
void InnerReset();
6262

63+
static bool IsWhitespaceOnly(const std::string& term);
64+
6365
private:
6466
JiebaTokenizerContext context_;
6567
size_t term_index_ = 0;

src/paimon/global_index/lucene/jieba_analyzer_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ TEST_P(JiebaAnalyzerTest, TestWithPosition) {
9090
TEST_P(JiebaAnalyzerTest, TestNormalize) {
9191
auto tokenizer = CreateJiebaTokenizer(
9292
/*with_position=*/false,
93-
L"由于购买了Iphone14,我越来越热爱网上学习了!Happy work, happy day!");
93+
L"由于购买了Iphone14,我越来越热爱网上学习了!Happy work, happy day! \n\t");
9494

9595
auto term_att = tokenizer->addAttribute<Lucene::TermAttribute>();
9696

src/paimon/global_index/lucene/lucene_defs.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ namespace paimon::lucene {
2323
static inline const int32_t kVersion = 1;
2424
static inline const char kIdentifier[] = "lucene-fts";
2525
static inline const wchar_t kEmptyWstring[] = L"";
26-
static inline const wchar_t kRowIdFieldWstring[] = L"_ROW_ID";
2726

2827
static inline const char kOptionKeyPrefix[] = "lucene-fts.";
2928

src/paimon/global_index/lucene/lucene_global_index_reader.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -179,13 +179,7 @@ Result<std::shared_ptr<GlobalIndexResult>> LuceneGlobalIndexReader::SearchWithLi
179179
// prepare BitmapScoredGlobalIndexResult
180180
std::map<int64_t, float> id_to_score;
181181
for (auto score_doc : results->scoreDocs) {
182-
Lucene::DocumentPtr result_doc = searcher_->doc(score_doc->doc);
183-
std::string row_id_str = LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring));
184-
std::optional<int32_t> row_id = StringUtils::StringToValue<int32_t>(row_id_str);
185-
if (!row_id) {
186-
return Status::Invalid(fmt::format("parse row id str {} to int failed", row_id_str));
187-
}
188-
id_to_score[static_cast<int64_t>(row_id.value())] = static_cast<float>(score_doc->score);
182+
id_to_score[static_cast<int64_t>(score_doc->doc)] = static_cast<float>(score_doc->score);
189183
}
190184
RoaringBitmap64 bitmap;
191185
std::vector<float> scores;

src/paimon/global_index/lucene/lucene_global_index_writer.cpp

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,12 @@ namespace paimon::lucene {
4040
LuceneGlobalIndexWriter::LuceneWriteContext::LuceneWriteContext(
4141
const std::string& _tmp_index_path, const Lucene::FSDirectoryPtr& _lucene_dir,
4242
const Lucene::IndexWriterPtr& _index_writer, const Lucene::DocumentPtr& _doc,
43-
const Lucene::FieldPtr& _field, const Lucene::FieldPtr& _row_id_field)
43+
const Lucene::FieldPtr& _field)
4444
: tmp_index_path(_tmp_index_path),
4545
lucene_dir(_lucene_dir),
4646
index_writer(_index_writer),
4747
doc(_doc),
48-
field(_field),
49-
row_id_field(_row_id_field) {}
48+
field(_field) {}
5049

5150
Result<std::shared_ptr<LuceneGlobalIndexWriter>> LuceneGlobalIndexWriter::Create(
5251
const std::string& field_name, const std::shared_ptr<arrow::DataType>& arrow_type,
@@ -89,17 +88,11 @@ Result<std::shared_ptr<LuceneGlobalIndexWriter>> LuceneGlobalIndexWriter::Create
8988
auto field = Lucene::newLucene<Lucene::Field>(LuceneUtils::StringToWstring(field_name),
9089
kEmptyWstring, Lucene::Field::STORE_NO,
9190
Lucene::Field::INDEX_ANALYZED_NO_NORMS);
92-
auto row_id_field = Lucene::newLucene<Lucene::Field>(
93-
kRowIdFieldWstring, kEmptyWstring, Lucene::Field::STORE_YES,
94-
Lucene::Field::INDEX_NOT_ANALYZED_NO_NORMS);
9591
field->setOmitTermFreqAndPositions(omit_term_freq_and_positions);
96-
row_id_field->setOmitTermFreqAndPositions(true);
9792
doc->add(field);
98-
doc->add(row_id_field);
9993
return std::shared_ptr<LuceneGlobalIndexWriter>(new LuceneGlobalIndexWriter(
100-
field_name, arrow_type,
101-
LuceneWriteContext(tmp_path, lucene_dir, writer, doc, field, row_id_field), file_writer,
102-
options, pool));
94+
field_name, arrow_type, LuceneWriteContext(tmp_path, lucene_dir, writer, doc, field),
95+
file_writer, options, pool));
10396
} catch (const std::exception& e) {
10497
return Status::Invalid(
10598
fmt::format("create lucene global index writer failed, with {} error.", e.what()));
@@ -153,8 +146,7 @@ Status LuceneGlobalIndexWriter::AddBatch(::ArrowArray* arrow_array) {
153146
auto view = string_array->Value(i);
154147
write_context_.field->setValue(LuceneUtils::StringToWstring(view));
155148
}
156-
write_context_.row_id_field->setValue(
157-
LuceneUtils::StringToWstring(std::to_string(row_id_++)));
149+
row_id_++;
158150
write_context_.index_writer->addDocument(write_context_.doc);
159151
}
160152
} catch (const std::exception& e) {
@@ -170,6 +162,11 @@ Status LuceneGlobalIndexWriter::AddBatch(::ArrowArray* arrow_array) {
170162
Result<std::string> LuceneGlobalIndexWriter::FlushIndexToFinal() {
171163
try {
172164
// flush index to tmp dir
165+
if (write_context_.index_writer->numDocs() != row_id_) {
166+
return Status::Invalid(
167+
fmt::format("lucene writer row count {} mismatch paimon inner row count {}",
168+
write_context_.index_writer->numDocs(), row_id_));
169+
}
173170
write_context_.index_writer->optimize();
174171
write_context_.index_writer->close();
175172

src/paimon/global_index/lucene/lucene_global_index_writer.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ class LuceneGlobalIndexWriter : public GlobalIndexWriter {
2929
LuceneWriteContext(const std::string& _tmp_index_path,
3030
const Lucene::FSDirectoryPtr& _lucene_dir,
3131
const Lucene::IndexWriterPtr& _index_writer,
32-
const Lucene::DocumentPtr& _doc, const Lucene::FieldPtr& _field,
33-
const Lucene::FieldPtr& _row_id_field);
32+
const Lucene::DocumentPtr& _doc, const Lucene::FieldPtr& _field);
3433

3534
LuceneWriteContext(LuceneWriteContext&&) = default;
3635
LuceneWriteContext& operator=(LuceneWriteContext&&) = default;
@@ -40,7 +39,6 @@ class LuceneGlobalIndexWriter : public GlobalIndexWriter {
4039
Lucene::IndexWriterPtr index_writer;
4140
Lucene::DocumentPtr doc;
4241
Lucene::FieldPtr field;
43-
Lucene::FieldPtr row_id_field;
4442
};
4543

4644
static Result<std::shared_ptr<LuceneGlobalIndexWriter>> Create(

0 commit comments

Comments
 (0)