Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/paimon/common/data/generic_row.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ class GenericRow : public InternalRow {

// Returns the string stored in field `pos` as a BinaryString.
//
// If the field currently holds a std::string_view, the viewed bytes are copied into a
// newly created Bytes buffer (constructed with the default pool) and wrapped via
// BinaryString::FromBytes. Otherwise the field is read as a BinaryString directly.
//
// `pos` must be a valid index into fields_; this is only checked by assert.
BinaryString GetString(int32_t pos) const override {
    assert(static_cast<size_t>(pos) < fields_.size());
    auto* value_ptr = DataDefine::GetVariantPtr<std::string_view>(fields_[pos]);
    if (value_ptr) {
        auto bytes = std::make_shared<Bytes>(value_ptr->size(), GetDefaultPool().get());
        // An empty string_view may have data() == nullptr, and memcpy with a null
        // pointer is undefined behavior even when the length is zero, so skip the
        // copy entirely when there is nothing to copy.
        if (!value_ptr->empty()) {
            memcpy(bytes->data(), value_ptr->data(), value_ptr->size());
        }
        return BinaryString::FromBytes(bytes);
    }
    return DataDefine::GetVariantValue<BinaryString>(fields_[pos]);
}

Expand Down
4 changes: 2 additions & 2 deletions src/paimon/common/data/serializer/binary_serializer_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,11 @@ Status BinarySerializerUtils::WriteBinaryData(const std::shared_ptr<arrow::DataT
break;
}
case arrow::Type::type::STRING: {
writer->WriteString(pos, getter->GetString(pos));
writer->WriteStringView(pos, getter->GetStringView(pos));
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For arrow::Type::BINARY, this switches from getter->GetBinary(pos) to getter->GetStringView(pos) and from WriteBinary to WriteStringView. If GetStringView is implemented with STRING semantics (e.g., expecting UTF-8, or only supported for STRING arrays), this can mis-serialize BINARY data or fail at runtime. Prefer a binary-specific view API (e.g., GetBinaryView / WriteBinaryView) or keep using GetBinary + WriteBinary for the BINARY case while still avoiding allocations.

Copilot uses AI. Check for mistakes.
break;
}
case arrow::Type::type::BINARY: {
writer->WriteBinary(pos, *(getter->GetBinary(pos)));
writer->WriteStringView(pos, getter->GetStringView(pos));
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For arrow::Type::BINARY, this switches from getter->GetBinary(pos) to getter->GetStringView(pos) and from WriteBinary to WriteStringView. If GetStringView is implemented with STRING semantics (e.g., expecting UTF-8, or only supported for STRING arrays), this can mis-serialize BINARY data or fail at runtime. Prefer a binary-specific view API (e.g., GetBinaryView / WriteBinaryView) or keep using GetBinary + WriteBinary for the BINARY case while still avoiding allocations.

Suggested change
writer->WriteStringView(pos, getter->GetStringView(pos));
writer->WriteBinary(pos, getter->GetBinary(pos));

Copilot uses AI. Check for mistakes.
break;
}
case arrow::Type::type::TIMESTAMP: {
Expand Down
11 changes: 3 additions & 8 deletions src/paimon/common/data/serializer/row_compacted_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ Result<std::unique_ptr<RowCompactedSerializer>> RowCompactedSerializer::Create(
std::vector<RowCompactedSerializer::FieldReader> readers(schema->num_fields());
for (int32_t i = 0; i < schema->num_fields(); i++) {
auto field_type = schema->field(i)->type();
// TODO(xinyu.lxy): check if we can enable use view
PAIMON_ASSIGN_OR_RAISE(getters[i],
InternalRow::CreateFieldGetter(i, field_type, /*use_view=*/false));
InternalRow::CreateFieldGetter(i, field_type, /*use_view=*/true));
PAIMON_ASSIGN_OR_RAISE(writers[i], CreateFieldWriter(field_type, pool));
PAIMON_ASSIGN_OR_RAISE(readers[i], CreateFieldReader(field_type, pool));
}
Expand Down Expand Up @@ -306,9 +305,7 @@ Result<RowCompactedSerializer::FieldWriter> RowCompactedSerializer::CreateFieldW
field_writer = [](int32_t pos, const VariantType& field, RowWriter* writer) -> Status {
const auto* view = DataDefine::GetVariantPtr<std::string_view>(field);
if (view) {
// TODO(xinyu.lxy): remove copy from view
return writer->WriteString(
BinaryString::FromString(std::string(*view), GetDefaultPool().get()));
return writer->WriteStringView(*view);
}
return writer->WriteString(DataDefine::GetVariantValue<BinaryString>(field));
};
Expand All @@ -318,9 +315,7 @@ Result<RowCompactedSerializer::FieldWriter> RowCompactedSerializer::CreateFieldW
field_writer = [](int32_t pos, const VariantType& field, RowWriter* writer) -> Status {
const auto* view = DataDefine::GetVariantPtr<std::string_view>(field);
if (view) {
auto bytes =
std::make_shared<Bytes>(std::string(*view), GetDefaultPool().get());
return writer->WriteBinary(bytes.get());
return writer->WriteStringView(*view);
}
return writer->WriteBinary(
DataDefine::GetVariantValue<std::shared_ptr<Bytes>>(field).get());
Expand Down
4 changes: 4 additions & 0 deletions src/paimon/common/data/serializer/row_compacted_serializer.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ class RowCompactedSerializer {
return WriteSegments(value.GetSegments(), value.GetOffset(), value.GetSizeInBytes());
}

// Writes the bytes referenced by `view` by forwarding a pointer to it to WriteBinary,
// and returns whatever Status WriteBinary produces.
Status WriteStringView(const std::string_view& view) {
    const std::string_view* bytes = &view;
    return WriteBinary(bytes);
}

template <typename T>
Status WriteBinary(const T* bytes) {
PAIMON_RETURN_NOT_OK(WriteUnsignedInt(bytes->size()));
Expand Down
12 changes: 8 additions & 4 deletions src/paimon/core/mergetree/compact/compact_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include "paimon/core/compact/compact_unit.h"
#include "paimon/core/deletionvectors/bucketed_dv_maintainer.h"
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Including bucketed_dv_maintainer.h in this header increases coupling and compile-time impact, especially since PickFullCompaction is defined inline. If feasible, move the PickFullCompaction implementation to a .cpp file so the header can forward-declare BucketedDvMaintainer and avoid pulling in the heavier dependency transitively.

Copilot uses AI. Check for mistakes.
#include "paimon/core/io/data_file_meta.h"
#include "paimon/core/mergetree/level_sorted_run.h"
namespace paimon {
Expand All @@ -33,9 +34,9 @@ class CompactStrategy {
const std::vector<LevelSortedRun>& runs) = 0;
/// Pick a compaction unit consisting of all existing files.
// TODO(xinyu.lxy): support RecordLevelExpire and BucketedDvMaintainer
static std::optional<CompactUnit> PickFullCompaction(int32_t num_levels,
const std::vector<LevelSortedRun>& runs,
bool force_rewrite_all_files) {
static std::optional<CompactUnit> PickFullCompaction(
int32_t num_levels, const std::vector<LevelSortedRun>& runs,
const std::shared_ptr<BucketedDvMaintainer>& dv_maintainer, bool force_rewrite_all_files) {
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the public PickFullCompaction signature by adding dv_maintainer, which can ripple through many callers. Consider adding an overload retaining the previous signature (calling the new one with nullptr) to keep the API backwards-compatible and reduce churn for call sites that don’t care about deletion vectors.

Suggested change
const std::shared_ptr<BucketedDvMaintainer>& dv_maintainer, bool force_rewrite_all_files) {
bool force_rewrite_all_files) {
return PickFullCompaction(num_levels, runs, nullptr, force_rewrite_all_files);
}
static std::optional<CompactUnit> PickFullCompaction(
int32_t num_levels, const std::vector<LevelSortedRun>& runs,
const std::shared_ptr<BucketedDvMaintainer>& dv_maintainer,
bool force_rewrite_all_files) {

Copilot uses AI. Check for mistakes.
int32_t max_level = num_levels - 1;
if (runs.empty()) {
// no sorted run, no need to compact
Expand All @@ -49,8 +50,11 @@ class CompactStrategy {
if (force_rewrite_all_files) {
// add all files when force compacted
files_to_be_compacted.push_back(file);
} else if (dv_maintainer && dv_maintainer->DeletionVectorOf(file->file_name)) {
// check deletion vector for large files
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment says ‘check deletion vector for large files’, but the condition does not check file size—only whether a deletion vector exists. To avoid misleading future readers, update the comment to reflect the actual logic (e.g., ‘rewrite files with deletion vectors’) or add an explicit size-based condition if that was intended.

Suggested change
// check deletion vector for large files
// rewrite files that have deletion vectors

Copilot uses AI. Check for mistakes.
files_to_be_compacted.push_back(file);
}
// TODO(xinyu.lxy): support RecordLevelExpire and BucketedDvMaintainer
// TODO(xinyu.lxy): support RecordLevelExpire
}
if (files_to_be_compacted.empty()) {
return std::nullopt;
Expand Down
37 changes: 29 additions & 8 deletions src/paimon/core/mergetree/compact/compact_strategy_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,25 @@ TEST_F(CompactStrategyTest, TestPickFullCompaction) {
{
// no sorted run, no need to compact
auto runs = CreateRunsWithLevelAndSize({}, {});
auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/3, runs,
/*force_rewrite_all_files=*/false);
auto unit =
CompactStrategy::PickFullCompaction(/*num_levels=*/3, runs, /*dv_maintainer=*/nullptr,
/*force_rewrite_all_files=*/false);
ASSERT_FALSE(unit);
}
{
// only max level files, not rewrite
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10});
auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs,
/*force_rewrite_all_files=*/false);
auto unit =
CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr,
/*force_rewrite_all_files=*/false);
ASSERT_FALSE(unit);
}
{
// only max level files, force rewrite
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10});
auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs,
/*force_rewrite_all_files=*/true);
auto unit =
CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr,
/*force_rewrite_all_files=*/true);
ASSERT_TRUE(unit);
ASSERT_EQ(unit.value().output_level, 3);
ASSERT_EQ(unit.value().files.size(), 1);
Expand All @@ -79,12 +82,30 @@ TEST_F(CompactStrategyTest, TestPickFullCompaction) {
{
// full compaction
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{0, 3}, /*sizes*/ {1, 10});
auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs,
/*force_rewrite_all_files=*/false);
auto unit =
CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr,
/*force_rewrite_all_files=*/false);
ASSERT_TRUE(unit);
ASSERT_EQ(unit.value().output_level, 3);
ASSERT_EQ(unit.value().files.size(), 2);
ASSERT_FALSE(unit.value().file_rewrite);
}
{
// test with dv maintainer
std::map<std::string, std::shared_ptr<DeletionVector>> deletion_vectors = {
{"fake.data", std::make_shared<BitmapDeletionVector>(RoaringBitmap32())}};
Comment on lines +95 to +96
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test assumes the file(s) produced by CreateRunsWithLevelAndSize will have file_name == "fake.data". If the helper generates different names (common in such factories), DeletionVectorOf(file->file_name) won’t find a DV, and the compaction selection may return nullopt, making the test flaky/incorrect. Make the DV map key match the actual file_name produced by runs (e.g., derive the name from runs/files created in the test) so the test asserts the intended behavior reliably.

Copilot uses AI. Check for mistakes.

auto dv_maintainer = std::make_shared<BucketedDvMaintainer>(
std::make_shared<DeletionVectorsIndexFile>(nullptr, nullptr, /*bitmap64=*/false,
GetDefaultPool()),
deletion_vectors);
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10});
Comment on lines +95 to +102
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test assumes the file(s) produced by CreateRunsWithLevelAndSize will have file_name == "fake.data". If the helper generates different names (common in such factories), DeletionVectorOf(file->file_name) won’t find a DV, and the compaction selection may return nullopt, making the test flaky/incorrect. Make the DV map key match the actual file_name produced by runs (e.g., derive the name from runs/files created in the test) so the test asserts the intended behavior reliably.

Suggested change
std::map<std::string, std::shared_ptr<DeletionVector>> deletion_vectors = {
{"fake.data", std::make_shared<BitmapDeletionVector>(RoaringBitmap32())}};
auto dv_maintainer = std::make_shared<BucketedDvMaintainer>(
std::make_shared<DeletionVectorsIndexFile>(nullptr, nullptr, /*bitmap64=*/false,
GetDefaultPool()),
deletion_vectors);
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10});
auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10});
// Derive the file name from the created runs to ensure the DV map key matches.
ASSERT_FALSE(runs.empty());
ASSERT_TRUE(runs[0].run);
ASSERT_FALSE(runs[0].run->files.empty());
const std::string& file_name = runs[0].run->files[0]->file_name;
std::map<std::string, std::shared_ptr<DeletionVector>> deletion_vectors = {
{file_name, std::make_shared<BitmapDeletionVector>(RoaringBitmap32())}};
auto dv_maintainer = std::make_shared<BucketedDvMaintainer>(
std::make_shared<DeletionVectorsIndexFile>(nullptr, nullptr, /*bitmap64=*/false,
GetDefaultPool()),
deletion_vectors);

Copilot uses AI. Check for mistakes.
auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, dv_maintainer,
/*force_rewrite_all_files=*/false);
ASSERT_TRUE(unit);
ASSERT_EQ(unit.value().output_level, 3);
ASSERT_EQ(unit.value().files.size(), 1);
ASSERT_TRUE(unit.value().file_rewrite);
}
}
} // namespace paimon::test
30 changes: 24 additions & 6 deletions src/paimon/core/mergetree/lookup/persist_processor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,39 @@

#include "paimon/core/mergetree/lookup/persist_processor.h"

#include "arrow/ipc/json_simple.h"
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test now depends on Arrow IPC ‘internal’ JSON utilities (arrow::ipc::internal::json and json_simple.h). These internal APIs are more likely to change across Arrow upgrades and ValueOrDie() will hard-abort rather than producing a clean gtest failure. Prefer a public Arrow test utility (if available in this repo’s Arrow version) and/or propagate failures via ASSERT-style checks so failures are reported as test assertions rather than process aborts.

Copilot uses AI. Check for mistakes.
#include "gtest/gtest.h"
#include "paimon/common/data/columnar/columnar_row.h"
#include "paimon/core/mergetree/lookup/default_lookup_serializer_factory.h"
#include "paimon/core/mergetree/lookup/persist_empty_processor.h"
#include "paimon/core/mergetree/lookup/persist_position_processor.h"
#include "paimon/core/mergetree/lookup/persist_value_and_pos_processor.h"
#include "paimon/core/mergetree/lookup/persist_value_processor.h"
#include "paimon/testing/utils/binary_row_generator.h"
#include "paimon/testing/utils/testharness.h"
namespace paimon::test {
class PersistProcessorTest : public testing::Test {
public:
void SetUp() override {
auto key_type = arrow::struct_({arrow::field("f1", arrow::int32())});
auto key_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(key_type, R"([[10]])").ValueOrDie());
Comment on lines +33 to +34
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test now depends on Arrow IPC ‘internal’ JSON utilities (arrow::ipc::internal::json and json_simple.h). These internal APIs are more likely to change across Arrow upgrades and ValueOrDie() will hard-abort rather than producing a clean gtest failure. Prefer a public Arrow test utility (if available in this repo’s Arrow version) and/or propagate failures via ASSERT-style checks so failures are reported as test assertions rather than process aborts.

Copilot uses AI. Check for mistakes.

auto value_type = arrow::struct_(
{arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()),
arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())});
auto value_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(value_type, R"([["Alice", 10, null, 10.1]])")
.ValueOrDie());
Comment on lines +39 to +41
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test now depends on Arrow IPC ‘internal’ JSON utilities (arrow::ipc::internal::json and json_simple.h). These internal APIs are more likely to change across Arrow upgrades and ValueOrDie() will hard-abort rather than producing a clean gtest failure. Prefer a public Arrow test utility (if available in this repo’s Arrow version) and/or propagate failures via ASSERT-style checks so failures are reported as test assertions rather than process aborts.

Copilot uses AI. Check for mistakes.

auto key_row = std::make_shared<ColumnarRow>(
/*struct_array=*/key_array, key_array->fields(), pool_, /*row_id=*/0);
auto value_row = std::make_unique<ColumnarRow>(
/*struct_array=*/value_array, value_array->fields(), pool_, /*row_id=*/0);

kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, std::move(key_row),
std::move(value_row));
}

void CheckResult(const KeyValue& kv) {
ASSERT_EQ(kv_.key, kv.key);

Expand All @@ -44,11 +66,7 @@ class PersistProcessorTest : public testing::Test {

private:
std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
KeyValue kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, /*key=*/
BinaryRowGenerator::GenerateRowPtr({10}, pool_.get()),
/*value=*/
BinaryRowGenerator::GenerateRowPtr(
{std::string("Alice"), 10, NullType(), 10.1}, pool_.get()));
KeyValue kv_;
std::shared_ptr<arrow::Schema> file_schema_ =
arrow::schema({arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()),
arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())});
Expand Down
Loading