From 0758bd9a06149b2638620294548347d9e5231325 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Mon, 23 Mar 2026 22:07:26 +0800 Subject: [PATCH 01/11] refactor: improve RowCompactedSerializer by using string_view to avoid data copies --- src/paimon/common/data/generic_row.h | 6 +++ .../serializer/binary_serializer_utils.cpp | 4 +- .../serializer/row_compacted_serializer.cpp | 9 ++--- .../serializer/row_compacted_serializer.h | 4 ++ .../core/mergetree/compact/compact_strategy.h | 12 ++++-- .../compact/compact_strategy_test.cpp | 37 +++++++++++++++---- .../lookup/persist_processor_test.cpp | 30 ++++++++++++--- 7 files changed, 76 insertions(+), 26 deletions(-) diff --git a/src/paimon/common/data/generic_row.h b/src/paimon/common/data/generic_row.h index 30409005..b97779e2 100644 --- a/src/paimon/common/data/generic_row.h +++ b/src/paimon/common/data/generic_row.h @@ -148,6 +148,12 @@ class GenericRow : public InternalRow { BinaryString GetString(int32_t pos) const override { assert(static_cast(pos) < fields_.size()); + auto* value_ptr = DataDefine::GetVariantPtr(fields_[pos]); + if (value_ptr) { + auto bytes = std::make_shared(value_ptr->size(), GetDefaultPool().get()); + memcpy(bytes->data(), value_ptr->data(), value_ptr->size()); + return BinaryString::FromBytes(bytes); + } return DataDefine::GetVariantValue(fields_[pos]); } diff --git a/src/paimon/common/data/serializer/binary_serializer_utils.cpp b/src/paimon/common/data/serializer/binary_serializer_utils.cpp index c8aae74f..602efafa 100644 --- a/src/paimon/common/data/serializer/binary_serializer_utils.cpp +++ b/src/paimon/common/data/serializer/binary_serializer_utils.cpp @@ -144,11 +144,11 @@ Status BinarySerializerUtils::WriteBinaryData(const std::shared_ptrWriteString(pos, getter->GetString(pos)); + writer->WriteStringView(pos, getter->GetStringView(pos)); break; } case arrow::Type::type::BINARY: { - writer->WriteBinary(pos, *(getter->GetBinary(pos))); + writer->WriteStringView(pos, getter->GetStringView(pos)); break; } case arrow::Type::type::TIMESTAMP: { diff --git a/src/paimon/common/data/serializer/row_compacted_serializer.cpp b/src/paimon/common/data/serializer/row_compacted_serializer.cpp index 0e944d86..952af6e0 100644 --- a/src/paimon/common/data/serializer/row_compacted_serializer.cpp +++ b/src/paimon/common/data/serializer/row_compacted_serializer.cpp @@ -35,7 +35,7 @@ Result> RowCompactedSerializer::Create( auto field_type = schema->field(i)->type(); // TODO(xinyu.lxy): check if we can enable use view PAIMON_ASSIGN_OR_RAISE(getters[i], - InternalRow::CreateFieldGetter(i, field_type, /*use_view=*/false)); + InternalRow::CreateFieldGetter(i, field_type, /*use_view=*/true)); PAIMON_ASSIGN_OR_RAISE(writers[i], CreateFieldWriter(field_type, pool)); PAIMON_ASSIGN_OR_RAISE(readers[i], CreateFieldReader(field_type, pool)); } @@ -307,8 +307,7 @@ Result RowCompactedSerializer::CreateFieldW const auto* view = DataDefine::GetVariantPtr(field); if (view) { // TODO(xinyu.lxy): remove copy from view - return writer->WriteString( - BinaryString::FromString(std::string(*view), GetDefaultPool().get())); + return writer->WriteStringView(*view); } return writer->WriteString(DataDefine::GetVariantValue(field)); }; @@ -318,9 +317,7 @@ Result RowCompactedSerializer::CreateFieldW field_writer = [](int32_t pos, const VariantType& field, RowWriter* writer) -> Status { const auto* view = DataDefine::GetVariantPtr(field); if (view) { - auto bytes = - std::make_shared(std::string(*view), GetDefaultPool().get()); - return writer->WriteBinary(bytes.get()); + return writer->WriteStringView(*view); } return writer->WriteBinary( DataDefine::GetVariantValue>(field).get()); diff --git a/src/paimon/common/data/serializer/row_compacted_serializer.h b/src/paimon/common/data/serializer/row_compacted_serializer.h index 13be8ef6..c9cb6a2d 100644 --- a/src/paimon/common/data/serializer/row_compacted_serializer.h +++ b/src/paimon/common/data/serializer/row_compacted_serializer.h @@ -70,6 +70,10 @@ class RowCompactedSerializer { return WriteSegments(value.GetSegments(), value.GetOffset(), value.GetSizeInBytes()); } + Status WriteStringView(const std::string_view& view) { + return WriteBinary(&view); + } + template Status WriteBinary(const T* bytes) { PAIMON_RETURN_NOT_OK(WriteUnsignedInt(bytes->size())); diff --git a/src/paimon/core/mergetree/compact/compact_strategy.h b/src/paimon/core/mergetree/compact/compact_strategy.h index e5633388..062b1d84 100644 --- a/src/paimon/core/mergetree/compact/compact_strategy.h +++ b/src/paimon/core/mergetree/compact/compact_strategy.h @@ -17,6 +17,7 @@ #pragma once #include "paimon/core/compact/compact_unit.h" +#include "paimon/core/deletionvectors/bucketed_dv_maintainer.h" #include "paimon/core/io/data_file_meta.h" #include "paimon/core/mergetree/level_sorted_run.h" namespace paimon { @@ -33,9 +34,9 @@ class CompactStrategy { const std::vector& runs) = 0; /// Pick a compaction unit consisting of all existing files. // TODO(xinyu.lxy): support RecordLevelExpire and BucketedDvMaintainer - static std::optional PickFullCompaction(int32_t num_levels, - const std::vector& runs, - bool force_rewrite_all_files) { + static std::optional PickFullCompaction( + int32_t num_levels, const std::vector& runs, + const std::shared_ptr& dv_maintainer, bool force_rewrite_all_files) { int32_t max_level = num_levels - 1; if (runs.empty()) { // no sorted run, no need to compact @@ -49,8 +50,11 @@ class CompactStrategy { if (force_rewrite_all_files) { // add all files when force compacted files_to_be_compacted.push_back(file); + } else if (dv_maintainer && dv_maintainer->DeletionVectorOf(file->file_name)) { + // check deletion vector for large files + files_to_be_compacted.push_back(file); } - // TODO(xinyu.lxy): support RecordLevelExpire and BucketedDvMaintainer + // TODO(xinyu.lxy): support RecordLevelExpire } if (files_to_be_compacted.empty()) { return std::nullopt; diff --git a/src/paimon/core/mergetree/compact/compact_strategy_test.cpp b/src/paimon/core/mergetree/compact/compact_strategy_test.cpp index 81734147..4e96a2a4 100644 --- a/src/paimon/core/mergetree/compact/compact_strategy_test.cpp +++ b/src/paimon/core/mergetree/compact/compact_strategy_test.cpp @@ -55,22 +55,25 @@ TEST_F(CompactStrategyTest, TestPickFullCompaction) { { // no sorted run, no need to compact auto runs = CreateRunsWithLevelAndSize({}, {}); - auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/3, runs, - /*force_rewrite_all_files=*/false); + auto unit = + CompactStrategy::PickFullCompaction(/*num_levels=*/3, runs, /*dv_maintainer=*/nullptr, + /*force_rewrite_all_files=*/false); ASSERT_FALSE(unit); } { // only max level files, not rewrite auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10}); - auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, - /*force_rewrite_all_files=*/false); + auto unit = + CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr, + /*force_rewrite_all_files=*/false); ASSERT_FALSE(unit); } { // only max level files, force rewrite auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10}); - auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, - /*force_rewrite_all_files=*/true); + auto unit = + CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr, + /*force_rewrite_all_files=*/true); ASSERT_TRUE(unit); ASSERT_EQ(unit.value().output_level, 3); ASSERT_EQ(unit.value().files.size(), 1); @@ -79,12 +82,30 @@ TEST_F(CompactStrategyTest, TestPickFullCompaction) { { // full compaction auto runs = CreateRunsWithLevelAndSize(/*levels=*/{0, 3}, /*sizes*/ {1, 10}); - auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, - /*force_rewrite_all_files=*/false); + auto unit = + CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, /*dv_maintainer=*/nullptr, + /*force_rewrite_all_files=*/false); ASSERT_TRUE(unit); ASSERT_EQ(unit.value().output_level, 3); ASSERT_EQ(unit.value().files.size(), 2); ASSERT_FALSE(unit.value().file_rewrite); } + { + // test with dv maintainer + std::map> deletion_vectors = { + {"fake.data", std::make_shared(RoaringBitmap32())}}; + + auto dv_maintainer = std::make_shared( + std::make_shared(nullptr, nullptr, /*bitmap64=*/false, + GetDefaultPool()), + deletion_vectors); + auto runs = CreateRunsWithLevelAndSize(/*levels=*/{3}, /*sizes*/ {10}); + auto unit = CompactStrategy::PickFullCompaction(/*num_levels=*/4, runs, dv_maintainer, + /*force_rewrite_all_files=*/false); + ASSERT_TRUE(unit); + ASSERT_EQ(unit.value().output_level, 3); + ASSERT_EQ(unit.value().files.size(), 1); + ASSERT_TRUE(unit.value().file_rewrite); + } } } // namespace paimon::test diff --git a/src/paimon/core/mergetree/lookup/persist_processor_test.cpp b/src/paimon/core/mergetree/lookup/persist_processor_test.cpp index aa204ba0..fa45e064 100644 --- a/src/paimon/core/mergetree/lookup/persist_processor_test.cpp +++ b/src/paimon/core/mergetree/lookup/persist_processor_test.cpp @@ -16,17 +16,39 @@ #include "paimon/core/mergetree/lookup/persist_processor.h" +#include "arrow/ipc/json_simple.h" #include "gtest/gtest.h" +#include "paimon/common/data/columnar/columnar_row.h" #include "paimon/core/mergetree/lookup/default_lookup_serializer_factory.h" #include "paimon/core/mergetree/lookup/persist_empty_processor.h" #include "paimon/core/mergetree/lookup/persist_position_processor.h" #include "paimon/core/mergetree/lookup/persist_value_and_pos_processor.h" #include "paimon/core/mergetree/lookup/persist_value_processor.h" -#include "paimon/testing/utils/binary_row_generator.h" #include "paimon/testing/utils/testharness.h" namespace paimon::test { class PersistProcessorTest : public testing::Test { public: + void SetUp() override { + auto key_type = arrow::struct_({arrow::field("f1", arrow::int32())}); + auto key_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(key_type, R"([[10]])").ValueOrDie()); + + auto value_type = arrow::struct_( + {arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}); + auto value_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(value_type, R"([["Alice", 10, null, 10.1]])") + .ValueOrDie()); + + auto key_row = std::make_shared( + /*struct_array=*/key_array, key_array->fields(), pool_, /*row_id=*/0); + auto value_row = std::make_unique( + /*struct_array=*/value_array, value_array->fields(), pool_, /*row_id=*/0); + + kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, std::move(key_row), + std::move(value_row)); + } + void CheckResult(const KeyValue& kv) { ASSERT_EQ(kv_.key, kv.key); @@ -44,11 +66,7 @@ class PersistProcessorTest : public testing::Test { private: std::shared_ptr pool_ = GetDefaultPool(); - KeyValue kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, /*key=*/ - BinaryRowGenerator::GenerateRowPtr({10}, pool_.get()), - /*value=*/ - BinaryRowGenerator::GenerateRowPtr( - {std::string("Alice"), 10, NullType(), 10.1}, pool_.get())); + KeyValue kv_; std::shared_ptr file_schema_ = arrow::schema({arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}); From 000c7638b407b80928f2ede5ed219c2e6a648d24 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Tue, 24 Mar 2026 09:28:28 +0800 Subject: [PATCH 02/11] fix --- src/paimon/common/data/serializer/row_compacted_serializer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/paimon/common/data/serializer/row_compacted_serializer.cpp b/src/paimon/common/data/serializer/row_compacted_serializer.cpp index 952af6e0..1138acc3 100644 --- a/src/paimon/common/data/serializer/row_compacted_serializer.cpp +++ b/src/paimon/common/data/serializer/row_compacted_serializer.cpp @@ -33,7 +33,6 @@ Result> RowCompactedSerializer::Create( std::vector readers(schema->num_fields()); for (int32_t i = 0; i < schema->num_fields(); i++) { auto field_type = schema->field(i)->type(); - // TODO(xinyu.lxy): check if we can enable use view PAIMON_ASSIGN_OR_RAISE(getters[i], InternalRow::CreateFieldGetter(i, field_type, /*use_view=*/true)); PAIMON_ASSIGN_OR_RAISE(writers[i], CreateFieldWriter(field_type, pool)); @@ -306,7 +305,6 @@ Result RowCompactedSerializer::CreateFieldW field_writer = [](int32_t pos, const VariantType& field, RowWriter* writer) -> Status { const auto* view = DataDefine::GetVariantPtr(field); if (view) { - // TODO(xinyu.lxy): remove copy from view return writer->WriteStringView(*view); } return writer->WriteString(DataDefine::GetVariantValue(field)); From 70a8e9115a9aaaac0fe8aed003214b2972c2680e Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Wed, 25 Mar 2026 18:55:46 +0800 Subject: [PATCH 03/11] binary section from multiple segment to single --- .../common/data/abstract_binary_writer.cpp | 10 +- .../common/data/abstract_binary_writer.h | 2 +- src/paimon/common/data/binary_array.cpp | 80 +++-- src/paimon/common/data/binary_array.h | 20 +- .../common/data/binary_data_read_utils.h | 32 +- src/paimon/common/data/binary_map.h | 28 +- src/paimon/common/data/binary_row.cpp | 88 +++--- src/paimon/common/data/binary_row.h | 33 +-- src/paimon/common/data/binary_row_test.cpp | 100 ++----- src/paimon/common/data/binary_row_writer.cpp | 7 +- .../common/data/binary_row_writer_test.cpp | 9 +- src/paimon/common/data/binary_section.cpp | 14 +- src/paimon/common/data/binary_section.h | 27 +- .../common/data/binary_section_test.cpp | 22 +- src/paimon/common/data/binary_string.cpp | 273 +++--------------- src/paimon/common/data/binary_string.h | 82 ++---- src/paimon/common/data/binary_string_test.cpp | 212 +++++--------- src/paimon/common/data/data_define.h | 12 + src/paimon/common/data/data_define_test.cpp | 51 ++++ src/paimon/common/data/generic_row.h | 8 +- .../data/serializer/binary_row_serializer.cpp | 34 +-- .../data/serializer/binary_row_serializer.h | 2 - .../serializer/row_compacted_serializer.cpp | 25 +- .../serializer/row_compacted_serializer.h | 5 +- src/paimon/common/utils/projected_array.h | 1 - src/paimon/common/utils/serialization_utils.h | 4 +- .../core/io/meta_to_arrow_array_converter.cpp | 2 +- .../compact/interval_partition_test.cpp | 2 +- ...ookup_merge_tree_compact_rewriter_test.cpp | 75 ++++- .../merge_tree_compact_rewriter_test.cpp | 3 +- src/paimon/core/mergetree/levels_test.cpp | 2 +- .../lookup/persist_processor_test.cpp | 30 +- src/paimon/core/mergetree/lookup_levels.cpp | 2 +- .../core/mergetree/lookup_levels_test.cpp | 2 +- .../core/mergetree/merge_tree_writer_test.cpp | 2 +- src/paimon/core/mergetree/sorted_run_test.cpp | 4 +- .../core/operation/merge_file_split_read.cpp | 17 +- .../core/operation/merge_file_split_read.h | 3 - .../operation/merge_file_split_read_test.cpp | 10 +- .../stats/simple_stats_evolution_test.cpp | 2 +- .../table/source/split_generator_test.cpp | 2 +- src/paimon/core/table/source/table_scan.cpp | 2 +- src/paimon/core/utils/fields_comparator.cpp | 47 +-- src/paimon/core/utils/fields_comparator.h | 2 +- .../core/utils/fields_comparator_test.cpp | 6 +- .../core/utils/file_store_path_factory.cpp | 2 +- src/paimon/testing/utils/key_value_checker.h | 4 +- 47 files changed, 528 insertions(+), 874 deletions(-) diff --git a/src/paimon/common/data/abstract_binary_writer.cpp b/src/paimon/common/data/abstract_binary_writer.cpp index 8ac2a2ca..6cf98bc8 100644 --- a/src/paimon/common/data/abstract_binary_writer.cpp +++ b/src/paimon/common/data/abstract_binary_writer.cpp @@ -48,11 +48,11 @@ void AbstractBinaryWriter::WriteString(int32_t pos, const BinaryString& input) { int32_t len = input.GetSizeInBytes(); if (len <= BinarySection::MAX_FIX_PART_DATA_SIZE) { auto bytes = Bytes::AllocateBytes(len, pool_); - MemorySegmentUtils::CopyToBytes(input.GetSegments(), input.GetOffset(), bytes.get(), 0, + MemorySegmentUtils::CopyToBytes({input.GetSegment()}, input.GetOffset(), bytes.get(), 0, len); WriteBytesToFixLenPart(&segment_, GetFieldOffset(pos), *bytes, len); } else { - WriteSegmentsToVarLenPart(pos, input.GetSegments(), input.GetOffset(), len); + WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), len); } } @@ -75,17 +75,17 @@ void AbstractBinaryWriter::WriteStringView(int32_t pos, const std::string_view& } void AbstractBinaryWriter::WriteRow(int32_t pos, const BinaryRow& input) { - return WriteSegmentsToVarLenPart(pos, input.GetSegments(), input.GetOffset(), + return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), input.GetSizeInBytes()); } void AbstractBinaryWriter::WriteArray(int32_t pos, const BinaryArray& input) { - return WriteSegmentsToVarLenPart(pos, input.GetSegments(), input.GetOffset(), + return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), input.GetSizeInBytes()); } void AbstractBinaryWriter::WriteMap(int32_t pos, const BinaryMap& input) { - return WriteSegmentsToVarLenPart(pos, input.GetSegments(), input.GetOffset(), + return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), input.GetSizeInBytes()); } diff --git a/src/paimon/common/data/abstract_binary_writer.h b/src/paimon/common/data/abstract_binary_writer.h index d264d0fe..7f142cf2 100644 --- a/src/paimon/common/data/abstract_binary_writer.h +++ b/src/paimon/common/data/abstract_binary_writer.h @@ -56,7 +56,7 @@ class AbstractBinaryWriter : public BinaryWriter { void WriteStringView(int32_t pos, const std::string_view& view) override; - const MemorySegment& GetSegments() const { + const MemorySegment& GetSegment() const { return segment_; } diff --git a/src/paimon/common/data/binary_array.cpp b/src/paimon/common/data/binary_array.cpp index 29ccbd5e..95798b49 100644 --- a/src/paimon/common/data/binary_array.cpp +++ b/src/paimon/common/data/binary_array.cpp @@ -40,18 +40,10 @@ int32_t BinaryArray::GetElementOffset(int32_t ordinal, int32_t element_size) con } void BinaryArray::PointTo(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) { - std::vector segments = {segment}; - PointTo(segments, offset, size_in_bytes); -} - -void BinaryArray::PointTo(const std::vector& segments, int32_t offset, - int32_t size_in_bytes) { - // Read the number of elements from the first 4 bytes. - auto size = MemorySegmentUtils::GetValue(segments, offset); + auto size = MemorySegmentUtils::GetValue({segment}, offset); assert(size >= 0); - size_ = size; - segments_ = segments; + segment_ = segment; offset_ = offset; size_in_bytes_ = size_in_bytes; element_offset_ = offset_ + CalculateHeaderInBytes(size_); @@ -59,17 +51,17 @@ void BinaryArray::PointTo(const std::vector& segments, int32_t of bool BinaryArray::IsNullAt(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::BitGet(segments_, offset_ + 4, pos); + return MemorySegmentUtils::BitGet({segment_}, offset_ + 4, pos); } int64_t BinaryArray::GetLong(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 8)); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 8)); } int32_t BinaryArray::GetInt(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 4)); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 4)); } int32_t BinaryArray::GetDate(int32_t pos) const { return GetInt(pos); @@ -78,21 +70,26 @@ int32_t BinaryArray::GetDate(int32_t pos) const { BinaryString BinaryArray::GetString(int32_t pos) const { AssertIndexIsValid(pos); int32_t field_offset = GetElementOffset(pos, 8); - const auto offset_and_size = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadBinaryString(segments_, offset_, field_offset, offset_and_size); + const auto offset_and_size = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadBinaryString(segment_, offset_, field_offset, offset_and_size); +} + +std::string_view BinaryArray::GetStringView(int32_t pos) const { + BinaryString binary_string = GetString(pos); + return binary_string.GetStringView(); } Decimal BinaryArray::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const { AssertIndexIsValid(pos); if (Decimal::IsCompact(precision)) { return Decimal::FromUnscaledLong( - MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 8)), precision, + MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 8)), precision, scale); } int32_t field_offset = GetElementOffset(pos, 8); - const auto offset_and_size = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadDecimal(segments_, offset_, offset_and_size, precision, scale); + const auto offset_and_size = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadDecimal(segment_, offset_, offset_and_size, precision, scale); } Timestamp BinaryArray::GetTimestamp(int32_t pos, int32_t precision) const { @@ -100,68 +97,69 @@ Timestamp BinaryArray::GetTimestamp(int32_t pos, int32_t precision) const { if (Timestamp::IsCompact(precision)) { return Timestamp::FromEpochMillis( - MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 8))); + MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 8))); } int32_t field_offset = GetElementOffset(pos, 8); const auto offset_and_nano_of_milli = - MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadTimestampData(segments_, offset_, offset_and_nano_of_milli); + MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadTimestampData(segment_, offset_, offset_and_nano_of_milli); } std::shared_ptr BinaryArray::GetBinary(int32_t pos) const { AssertIndexIsValid(pos); int32_t field_offset = GetElementOffset(pos, 8); - const auto offset_and_size = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinarySection::ReadBinary(segments_, offset_, field_offset, offset_and_size, + const auto offset_and_size = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinarySection::ReadBinary(segment_, offset_, field_offset, offset_and_size, GetDefaultPool().get()); } std::shared_ptr BinaryArray::GetArray(int32_t pos) const { AssertIndexIsValid(pos); - return BinaryDataReadUtils::ReadArrayData(segments_, offset_, GetLong(pos)); + return BinaryDataReadUtils::ReadArrayData(segment_, offset_, GetLong(pos)); } std::shared_ptr BinaryArray::GetMap(int32_t pos) const { AssertIndexIsValid(pos); - return BinaryDataReadUtils::ReadMapData(segments_, offset_, GetLong(pos)); + return BinaryDataReadUtils::ReadMapData(segment_, offset_, GetLong(pos)); } std::shared_ptr BinaryArray::GetRow(int32_t pos, int32_t num_fields) const { AssertIndexIsValid(pos); int32_t field_offset = GetElementOffset(pos, 8); - const auto offset_and_size = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadRowData(segments_, num_fields, offset_, offset_and_size); + const auto offset_and_size = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadRowData(segment_, num_fields, offset_, offset_and_size); } bool BinaryArray::GetBoolean(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 1)); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 1)); } char BinaryArray::GetByte(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, 1)); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, 1)); } int16_t BinaryArray::GetShort(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, sizeof(int16_t))); + return MemorySegmentUtils::GetValue({segment_}, + GetElementOffset(pos, sizeof(int16_t))); } float BinaryArray::GetFloat(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, sizeof(float))); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, sizeof(float))); } double BinaryArray::GetDouble(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetElementOffset(pos, sizeof(double))); + return MemorySegmentUtils::GetValue({segment_}, GetElementOffset(pos, sizeof(double))); } bool BinaryArray::AnyNull() const { for (int32_t i = offset_ + 4; i < element_offset_; i += 4) { - if (MemorySegmentUtils::GetValue(segments_, i) != 0) { + if (MemorySegmentUtils::GetValue({segment_}, i) != 0) { return true; } } @@ -172,7 +170,7 @@ Result> BinaryArray::ToBooleanArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_); return values; @@ -182,7 +180,7 @@ Result> BinaryArray::ToByteArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_); return values; @@ -192,7 +190,7 @@ Result> BinaryArray::ToShortArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_ * sizeof(int16_t)); return values; @@ -202,7 +200,7 @@ Result> BinaryArray::ToIntArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_ * sizeof(int32_t)); return values; @@ -212,7 +210,7 @@ Result> BinaryArray::ToLongArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_ * sizeof(int64_t)); return values; @@ -222,7 +220,7 @@ Result> BinaryArray::ToFloatArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_ * sizeof(float)); return values; @@ -232,7 +230,7 @@ Result> BinaryArray::ToDoubleArray() const { PAIMON_RETURN_NOT_OK(CheckNoNull()); std::vector values; values.resize(size_); - MemorySegmentUtils::CopyToUnsafe(segments_, element_offset_, + MemorySegmentUtils::CopyToUnsafe({segment_}, element_offset_, const_cast(static_cast(values.data())), size_ * sizeof(double)); return values; @@ -246,7 +244,7 @@ BinaryArray BinaryArray::Copy(MemoryPool* pool) const { void BinaryArray::Copy(BinaryArray* reuse, MemoryPool* pool) const { std::shared_ptr bytes = - MemorySegmentUtils::CopyToBytes(segments_, offset_, size_in_bytes_, pool); + MemorySegmentUtils::CopyToBytes({segment_}, offset_, size_in_bytes_, pool); reuse->PointTo(MemorySegment::Wrap(bytes), 0, size_in_bytes_); } diff --git a/src/paimon/common/data/binary_array.h b/src/paimon/common/data/binary_array.h index f819ae59..936e2936 100644 --- a/src/paimon/common/data/binary_array.h +++ b/src/paimon/common/data/binary_array.h @@ -38,9 +38,12 @@ namespace paimon { class MemorySegment; -/// A binary implementation of `InternalArray` which is backed by `MemorySegment`s. +/// A binary implementation of `InternalArray` which is backed by a single `MemorySegment`. /// For fields that hold fixed-length primitive types, such as long, double or int, they are -/// stored compacted in bytes, just like the original c array. +/// stored compacted in bytes, just like the original C array. +/// +/// @note: Unlike the Java implementation where data may span multiple MemorySegments, +/// in this C++ implementation all data resides within a single MemorySegment. /// /// The binary layout of BinaryArray: /// [size(int)] + [null bits(4-byte word boundaries)] + [values or offset&length] + [variable length @@ -54,9 +57,7 @@ class BinaryArray final : public BinarySection, public InternalArray { int32_t Size() const override { return size_; } - void PointTo(const MemorySegment& segments, int32_t offset, int32_t size_in_bytes) override; - void PointTo(const std::vector& segments, int32_t offset, - int32_t size_in_bytes) override; + void PointTo(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) override; bool IsNullAt(int32_t pos) const override; bool GetBoolean(int32_t pos) const override; @@ -68,13 +69,8 @@ class BinaryArray final : public BinarySection, public InternalArray { float GetFloat(int32_t pos) const override; double GetDouble(int32_t pos) const override; BinaryString GetString(int32_t pos) const override; + std::string_view GetStringView(int32_t pos) const override; - /// In binary array, string data may in multiple segments, we cannot construct a - /// std::string_view - std::string_view GetStringView(int32_t pos) const override { - assert(false); - return std::string_view(); - } Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override; Timestamp GetTimestamp(int32_t pos, int32_t precision) const override; std::shared_ptr GetBinary(int32_t pos) const override; @@ -96,7 +92,7 @@ class BinaryArray final : public BinarySection, public InternalArray { void Copy(BinaryArray* reuse, MemoryPool* pool) const; int32_t HashCode() const override { - return MemorySegmentUtils::HashByWords(segments_, offset_, size_in_bytes_, + return MemorySegmentUtils::HashByWords({segment_}, offset_, size_in_bytes_, GetDefaultPool().get()); } diff --git a/src/paimon/common/data/binary_data_read_utils.h b/src/paimon/common/data/binary_data_read_utils.h index 1347cd74..cda01fac 100644 --- a/src/paimon/common/data/binary_data_read_utils.h +++ b/src/paimon/common/data/binary_data_read_utils.h @@ -41,24 +41,24 @@ class BinaryDataReadUtils { /// @param offset_and_nanos the offset of milli-seconds part and nanoseconds /// @return an instance of `Timestamp` - static Timestamp ReadTimestampData(const std::vector& segments, - int32_t base_offset, int64_t offset_and_nanos) { + static Timestamp ReadTimestampData(const MemorySegment& segment, int32_t base_offset, + int64_t offset_and_nanos) { auto nano_of_millisecond = static_cast(offset_and_nanos & LOW_BYTES_MASK); auto sub_offset = static_cast(offset_and_nanos >> 32); auto millisecond = - MemorySegmentUtils::GetValue(segments, base_offset + sub_offset); + MemorySegmentUtils::GetValue({segment}, base_offset + sub_offset); return Timestamp::FromEpochMillis(millisecond, nano_of_millisecond); } /// Gets an instance of `Decimal` from underlying `MemorySegment`. - static Decimal ReadDecimal(const std::vector& segments, int32_t base_offset, + static Decimal ReadDecimal(const MemorySegment& segment, int32_t base_offset, int64_t offset_and_size, int32_t precision, int32_t scale) { auto size = static_cast(offset_and_size & LOW_BYTES_MASK); auto sub_offset = static_cast(offset_and_size >> 32); auto bytes = Bytes::AllocateBytes(size, GetDefaultPool().get()); std::memset(bytes->data(), 0, bytes->size()); - MemorySegmentUtils::CopyToBytes(segments, base_offset + sub_offset, bytes.get(), 0, + MemorySegmentUtils::CopyToBytes({segment}, base_offset + sub_offset, bytes.get(), 0, size); return Decimal::FromUnscaledBytes(precision, scale, bytes.get()); } @@ -69,55 +69,55 @@ class BinaryDataReadUtils { /// @param field_offset absolute start offset of variable_part_offset_and_len. /// @param variable_part_offset_and_len a long value, real data or offset and len. - static BinaryString ReadBinaryString(const std::vector& segments, - int32_t base_offset, int64_t field_offset, + static BinaryString ReadBinaryString(const MemorySegment& segment, int32_t base_offset, + int64_t field_offset, int64_t variable_part_offset_and_len) { int64_t mark = variable_part_offset_and_len & BinaryString::HIGHEST_FIRST_BIT; if (mark == 0) { const auto sub_offset = static_cast(variable_part_offset_and_len >> 32); const auto len = static_cast(variable_part_offset_and_len & LOW_BYTES_MASK); - return BinaryString::FromAddress(segments, base_offset + sub_offset, len); + return BinaryString::FromAddress(segment, base_offset + sub_offset, len); } else { auto len = static_cast( (static_cast(variable_part_offset_and_len & BinaryString::HIGHEST_SECOND_TO_EIGHTH_BIT)) >> 56); if (SystemByteOrder() == ByteOrder::PAIMON_LITTLE_ENDIAN) { - return BinaryString::FromAddress(segments, field_offset, len); + return BinaryString::FromAddress(segment, field_offset, len); } else { // field_offset + 1 to skip header. - return BinaryString::FromAddress(segments, field_offset + 1, len); + return BinaryString::FromAddress(segment, field_offset + 1, len); } } } - static std::shared_ptr ReadArrayData(const std::vector& segments, + static std::shared_ptr ReadArrayData(const MemorySegment& segment, int32_t base_offset, int64_t offset_and_size) { auto size = static_cast(offset_and_size & LOW_BYTES_MASK); auto offset = static_cast(offset_and_size >> 32); auto binary_array = std::make_shared(); - binary_array->PointTo(segments, offset + base_offset, size); + binary_array->PointTo(segment, offset + base_offset, size); return binary_array; } /// Gets an instance of `InternalRow` from underlying `MemorySegment`. - static std::shared_ptr ReadRowData(const std::vector& segments, + static std::shared_ptr ReadRowData(const MemorySegment& segment, int32_t num_fields, int32_t base_offset, int64_t offset_and_size) { auto size = static_cast(offset_and_size & LOW_BYTES_MASK); auto offset = static_cast(offset_and_size >> 32); auto row = std::make_shared(num_fields); - row->PointTo(segments, offset + base_offset, size); + row->PointTo(segment, offset + base_offset, size); return row; } - static std::shared_ptr ReadMapData(const std::vector& segments, + static std::shared_ptr ReadMapData(const MemorySegment& segment, int32_t base_offset, int64_t offset_and_size) { auto size = static_cast(offset_and_size & LOW_BYTES_MASK); auto offset = static_cast(offset_and_size >> 32); auto binary_map = std::make_shared(); - binary_map->PointTo(segments, offset + base_offset, size); + binary_map->PointTo(segment, offset + base_offset, size); return binary_map; } diff --git a/src/paimon/common/data/binary_map.h b/src/paimon/common/data/binary_map.h index d471286a..0200f621 100644 --- a/src/paimon/common/data/binary_map.h +++ b/src/paimon/common/data/binary_map.h @@ -20,8 +20,12 @@ #include "paimon/common/memory/memory_segment.h" #include "paimon/common/memory/memory_segment_utils.h" namespace paimon { -/// [4 byte(keyArray size in bytes)] + [Key BinaryArray] + [Value BinaryArray]. -/// `BinaryMap` are influenced by Apache Spark UnsafeMapData +/// A binary implementation of `InternalMap` which is backed by a single `MemorySegment`. +/// Binary layout: [4 byte(keyArray size in bytes)] + [Key BinaryArray] + [Value BinaryArray]. +/// `BinaryMap` is influenced by Apache Spark UnsafeMapData. +/// +/// @note: Unlike the Java implementation where data may span multiple MemorySegments, +/// in this C++ implementation all data resides within a single MemorySegment. class BinaryMap : public BinarySection, public InternalMap { public: BinaryMap() = default; @@ -37,44 +41,40 @@ class BinaryMap : public BinarySection, public InternalMap { return values_; } - void PointTo(const std::vector& segments, int32_t offset, - int32_t size_in_bytes) override { + void PointTo(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) override { // Read the numBytes of key array from the first 4 bytes. - auto key_array_bytes = MemorySegmentUtils::GetValue(segments, offset); + auto key_array_bytes = MemorySegmentUtils::GetValue({segment}, offset); assert(key_array_bytes >= 0); int32_t value_array_bytes = size_in_bytes - key_array_bytes - kHeaderSize; assert(value_array_bytes >= 0); assert(keys_); - keys_->PointTo(segments, offset + kHeaderSize, key_array_bytes); + keys_->PointTo(segment, offset + kHeaderSize, key_array_bytes); assert(values_); - values_->PointTo(segments, offset + kHeaderSize + key_array_bytes, value_array_bytes); + values_->PointTo(segment, offset + kHeaderSize + key_array_bytes, value_array_bytes); assert(keys_->Size() == values_->Size()); - segments_ = segments; + segment_ = segment; offset_ = offset; size_in_bytes_ = size_in_bytes; } static Result> ValueOf(const BinaryArray& key, const BinaryArray& value, MemoryPool* pool) { - if (key.GetSegments().size() != 1 || value.GetSegments().size() != 1) { - return Status::Invalid("In BinaryMap ValueOf, key and value must have single segment"); - } auto bytes = std::make_shared( kHeaderSize + key.GetSizeInBytes() + value.GetSizeInBytes(), pool); MemorySegment segment = MemorySegment::Wrap(bytes); segment.PutValue(0, key.GetSizeInBytes()); - const auto& key_segment = key.GetSegments()[0]; + const auto& key_segment = key.GetSegment(); key_segment.CopyTo(key.GetOffset(), &segment, /*target_offset=*/kHeaderSize, key.GetSizeInBytes()); - const auto& value_segment = value.GetSegments()[0]; + const auto& value_segment = value.GetSegment(); value_segment.CopyTo(value.GetOffset(), &segment, /*target_offset=*/kHeaderSize + key.GetSizeInBytes(), value.GetSizeInBytes()); auto binary_map = std::make_shared(); - binary_map->PointTo({segment}, /*offset=*/0, bytes->size()); + binary_map->PointTo(segment, /*offset=*/0, bytes->size()); return binary_map; } diff --git a/src/paimon/common/data/binary_row.cpp b/src/paimon/common/data/binary_row.cpp index 863b8095..9337bcc3 100644 --- a/src/paimon/common/data/binary_row.cpp +++ b/src/paimon/common/data/binary_row.cpp @@ -69,12 +69,12 @@ BinaryRow BinaryRow::GetEmptyRow() { } Result BinaryRow::GetRowKind() const { - char kind_value = MemorySegmentUtils::GetValue(segments_, offset_); + char kind_value = MemorySegmentUtils::GetValue({segment_}, offset_); return RowKind::FromByteValue(kind_value); } void BinaryRow::SetRowKind(const RowKind* kind) { - MemorySegmentUtils::SetValue(&segments_, offset_, kind->ToByteValue()); + segment_.PutValue(offset_, kind->ToByteValue()); } void BinaryRow::SetTotalSize(int32_t size_in_bytes) { @@ -83,83 +83,83 @@ void BinaryRow::SetTotalSize(int32_t size_in_bytes) { bool BinaryRow::IsNullAt(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::BitGet(segments_, offset_, pos + HEADER_SIZE_IN_BITS); + return MemorySegmentUtils::BitGet({segment_}, offset_, pos + HEADER_SIZE_IN_BITS); } void BinaryRow::SetNotNullAt(int32_t i) { AssertIndexIsValid(i); - MemorySegmentUtils::BitUnSet(&segments_, offset_, i + HEADER_SIZE_IN_BITS); + MemorySegmentUtils::BitUnSet(&segment_, offset_, i + HEADER_SIZE_IN_BITS); } void BinaryRow::SetNullAt(int32_t i) { AssertIndexIsValid(i); - MemorySegmentUtils::BitSet(&segments_, offset_, i + HEADER_SIZE_IN_BITS); + MemorySegmentUtils::BitSet(&segment_, offset_, i + HEADER_SIZE_IN_BITS); // We must set the fixed length part zero. // 1.Only int/long/boolean...(Fix length type) will invoke this SetNullAt. // 2.Set to zero in order to equals and hash operation bytes calculation. - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(i), 0); + segment_.PutValue(GetFieldOffset(i), 0); } void BinaryRow::SetInt(int32_t pos, int32_t value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetLong(int32_t pos, int64_t value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetDouble(int32_t pos, double value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetBoolean(int32_t pos, bool value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetShort(int32_t pos, int16_t value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetByte(int32_t pos, char value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } void BinaryRow::SetFloat(int32_t pos, float value) { AssertIndexIsValid(pos); SetNotNullAt(pos); - MemorySegmentUtils::SetValue(&segments_, GetFieldOffset(pos), value); + segment_.PutValue(GetFieldOffset(pos), value); } bool BinaryRow::GetBoolean(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } char BinaryRow::GetByte(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } int16_t BinaryRow::GetShort(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } int32_t BinaryRow::GetInt(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } int32_t BinaryRow::GetDate(int32_t pos) const { @@ -168,24 +168,29 @@ int32_t BinaryRow::GetDate(int32_t pos) const { int64_t BinaryRow::GetLong(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } float BinaryRow::GetFloat(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } double BinaryRow::GetDouble(int32_t pos) const { AssertIndexIsValid(pos); - return MemorySegmentUtils::GetValue(segments_, GetFieldOffset(pos)); + return MemorySegmentUtils::GetValue({segment_}, GetFieldOffset(pos)); } BinaryString BinaryRow::GetString(int32_t pos) const { AssertIndexIsValid(pos); int32_t field_offset = GetFieldOffset(pos); - const auto offset_and_len = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadBinaryString(segments_, offset_, field_offset, offset_and_len); + const auto offset_and_len = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadBinaryString(segment_, offset_, field_offset, offset_and_len); +} + +std::string_view BinaryRow::GetStringView(int32_t pos) const { + BinaryString str = GetString(pos); + return str.GetStringView(); } Decimal BinaryRow::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const { @@ -193,10 +198,10 @@ Decimal BinaryRow::GetDecimal(int32_t pos, int32_t precision, int32_t scale) con int32_t field_offset = GetFieldOffset(pos); if (Decimal::IsCompact(precision)) { return Decimal::FromUnscaledLong( - MemorySegmentUtils::GetValue(segments_, field_offset), precision, scale); + MemorySegmentUtils::GetValue({segment_}, field_offset), precision, scale); } - const auto offset_and_size = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadDecimal(segments_, offset_, offset_and_size, precision, scale); + const auto offset_and_size = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadDecimal(segment_, offset_, offset_and_size, precision, scale); } Timestamp BinaryRow::GetTimestamp(int32_t pos, int32_t precision) const { @@ -204,44 +209,44 @@ Timestamp BinaryRow::GetTimestamp(int32_t pos, int32_t precision) const { int32_t field_offset = GetFieldOffset(pos); if (Timestamp::IsCompact(precision)) { return Timestamp::FromEpochMillis( - MemorySegmentUtils::GetValue(segments_, field_offset)); + MemorySegmentUtils::GetValue({segment_}, field_offset)); } const auto offset_and_nano_of_milli = - MemorySegmentUtils::GetValue(segments_, field_offset); - return BinaryDataReadUtils::ReadTimestampData(segments_, offset_, offset_and_nano_of_milli); + MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinaryDataReadUtils::ReadTimestampData(segment_, offset_, offset_and_nano_of_milli); } std::shared_ptr BinaryRow::GetBinary(int32_t pos) const { AssertIndexIsValid(pos); int32_t field_offset = GetFieldOffset(pos); - const auto offset_and_len = MemorySegmentUtils::GetValue(segments_, field_offset); - return BinarySection::ReadBinary(segments_, offset_, field_offset, offset_and_len, + const auto offset_and_len = MemorySegmentUtils::GetValue({segment_}, field_offset); + return BinarySection::ReadBinary(segment_, offset_, field_offset, offset_and_len, GetDefaultPool().get()); } std::shared_ptr BinaryRow::GetArray(int32_t pos) const { AssertIndexIsValid(pos); - return BinaryDataReadUtils::ReadArrayData(segments_, offset_, GetLong(pos)); + return BinaryDataReadUtils::ReadArrayData(segment_, offset_, GetLong(pos)); } std::shared_ptr BinaryRow::GetMap(int32_t pos) const { AssertIndexIsValid(pos); - return BinaryDataReadUtils::ReadMapData(segments_, offset_, GetLong(pos)); + return BinaryDataReadUtils::ReadMapData(segment_, offset_, GetLong(pos)); } std::shared_ptr BinaryRow::GetRow(int32_t pos, int32_t num_fields) const { AssertIndexIsValid(pos); - return BinaryDataReadUtils::ReadRowData(segments_, num_fields, offset_, GetLong(pos)); + return BinaryDataReadUtils::ReadRowData(segment_, num_fields, offset_, GetLong(pos)); } /// The bit is 1 when the field is null. Default is 0. bool BinaryRow::AnyNull() const { // Skip the header. - if ((MemorySegmentUtils::GetValue(segments_, offset_) & FIRST_BYTE_ZERO) != 0) { + if ((MemorySegmentUtils::GetValue({segment_}, offset_) & FIRST_BYTE_ZERO) != 0) { return true; } for (int32_t i = 8; i < null_bits_size_in_bytes_; i += 8) { - if (MemorySegmentUtils::GetValue(segments_, offset_ + i) != 0) { + if (MemorySegmentUtils::GetValue({segment_}, offset_ + i) != 0) { return true; } } @@ -269,12 +274,12 @@ void BinaryRow::Copy(BinaryRow* reuse, MemoryPool* pool) const { void BinaryRow::CopyInternal(BinaryRow* reuse, MemoryPool* pool) const { std::shared_ptr bytes = - MemorySegmentUtils::CopyToBytes(segments_, offset_, size_in_bytes_, pool); + MemorySegmentUtils::CopyToBytes({segment_}, offset_, size_in_bytes_, pool); reuse->PointTo(MemorySegment::Wrap(bytes), 0, size_in_bytes_); } void BinaryRow::Clear() { - segments_.clear(); + segment_ = MemorySegment(); offset_ = 0; size_in_bytes_ = 0; } @@ -284,17 +289,12 @@ bool BinaryRow::operator==(const BinaryRow& other) const { return true; } return size_in_bytes_ == other.size_in_bytes_ && - MemorySegmentUtils::Equals(segments_, offset_, other.segments_, other.offset_, + MemorySegmentUtils::Equals({segment_}, offset_, {other.segment_}, other.offset_, size_in_bytes_); } int32_t BinaryRow::HashCode() const { - assert(segments_.size() > 0); - if (segments_.size() == 1) { - return MemorySegmentUtils::HashByWords(segments_, offset_, size_in_bytes_, nullptr); - } - return MemorySegmentUtils::HashByWords(segments_, offset_, size_in_bytes_, - GetDefaultPool().get()); + return MemorySegmentUtils::HashByWords({segment_}, offset_, size_in_bytes_, nullptr); } } // namespace paimon diff --git a/src/paimon/common/data/binary_row.h b/src/paimon/common/data/binary_row.h index 8bb8e465..d6f808a9 100644 --- a/src/paimon/common/data/binary_row.h +++ b/src/paimon/common/data/binary_row.h @@ -44,31 +44,18 @@ namespace paimon { class Bytes; class MemoryPool; -/// An implementation of `InternalRow` which is backed by `MemorySegment`. -/// A Row has two part: Fixed-length part and variable-length part. +/// An implementation of `InternalRow` which is backed by a single `MemorySegment`. +/// A Row has two parts: Fixed-length part and variable-length part. /// /// Fixed-length part contains 1 byte header and null bit set and field values. Null bit /// set is used for null tracking and is aligned to 8-byte word boundaries. Field values -/// holds fixed-length primitive types and variable-length values which can be stored in 8 -/// bytes inside. If it do not fit the variable-length field, then store the length and +/// hold fixed-length primitive types and variable-length values which can be stored in 8 +/// bytes inside. If it does not fit the variable-length field, then store the length and /// offset of variable-length part. /// -/// %In BinaryRow, Fixed-length part will certainly fall into a MemorySegment, which will speed up -/// the read and write of field. During the write phase, if the target memory segment has less space -/// than fixed length part size, we will skip the space. So the number of fields in a single Row -/// cannot exceed the capacity of a single MemorySegment, if there are too many fields, we suggest -/// that user set a bigger pageSize of MemorySegment. -/// -/// Variable-length part may fall into multiple MemorySegments. -/// -/// Noted that the BinaryRow class of c++ support both the `NestedRow` and BinaryRow class in -/// java. -/// `NestedRow` memory storage structure is exactly the same with BinaryRow. The only -/// different is that, as `NestedRow` is used to store row value in the variable-length part -/// of BinaryRow, every field (including both fixed-length part and variable-length part) of -/// `NestedRow` has a possibility to cross the boundary of a segment, while the fixed-length part of -/// BinaryRow must fit into its first memory segment. - +/// @note: Unlike the Java implementation where variable-length data may span multiple +/// MemorySegments, in this C++ implementation both the fixed-length part and the +/// variable-length part reside within a single MemorySegment. class BinaryRow final : public BinarySection, public InternalRow, public DataSetters { public: BinaryRow() : BinaryRow(0) {} @@ -107,11 +94,7 @@ class BinaryRow final : public BinarySection, public InternalRow, public DataSet float GetFloat(int32_t pos) const override; double GetDouble(int32_t pos) const override; BinaryString GetString(int32_t pos) const override; - /// In binary row, string data may in multiple segments, we cannot construct a std::string_view - std::string_view GetStringView(int32_t pos) const override { - assert(false); - return std::string_view(); - } + std::string_view GetStringView(int32_t pos) const override; Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override; Timestamp GetTimestamp(int32_t pos, int32_t precision) const override; diff --git a/src/paimon/common/data/binary_row_test.cpp b/src/paimon/common/data/binary_row_test.cpp index 31fa189b..9cbd7bb5 100644 --- a/src/paimon/common/data/binary_row_test.cpp +++ b/src/paimon/common/data/binary_row_test.cpp @@ -32,6 +32,8 @@ #include "paimon/common/utils/date_time_utils.h" #include "paimon/common/utils/decimal_utils.h" #include "paimon/common/utils/serialization_utils.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/io/data_input_stream.h" #include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" #include "paimon/status.h" @@ -138,46 +140,17 @@ TEST_F(BinaryRowTest, TestWriter) { writer.Complete(); AssertTestWriterRow(row); - AssertTestWriterRow(row.Copy(pool.get())); - // test copy from var segments. - int32_t sub_size = row.GetFixedLengthPartSize() + 10; - - auto bytes1 = Bytes::AllocateBytes(sub_size, pool.get()); - auto bytes2 = Bytes::AllocateBytes(sub_size, pool.get()); - MemorySegment sub_ms1 = MemorySegment::Wrap(std::move(bytes1)); - MemorySegment sub_ms2 = MemorySegment::Wrap(std::move(bytes2)); - row.GetSegments()[0].CopyTo(0, &sub_ms1, 0, sub_size); - row.GetSegments()[0].CopyTo(sub_size, &sub_ms2, 0, row.GetSizeInBytes() - sub_size); - - std::vector segs = {sub_ms1, sub_ms2}; - BinaryRow to_copy(arity); - to_copy.PointTo(segs, 0, row.GetSizeInBytes()); - ASSERT_EQ(to_copy.HashCode(), row.HashCode()); - AssertTestWriterRow(to_copy); - BinaryRow new_row(arity); - to_copy.Copy(&new_row, pool.get()); - AssertTestWriterRow(new_row); -} + // test copy + AssertTestWriterRow(row.Copy(pool.get())); -TEST_F(BinaryRowTest, TestWriter2) { - // test write multi segments to var len parts - auto pool = GetDefaultPool(); - int32_t arity = 1; - BinaryRow row(arity); - BinaryRowWriter writer(&row, 100, pool.get()); - - std::string str1 = "Strive not to be a success, "; - std::string str2 = "but rather to be of value."; - auto bytes1 = std::make_shared(str1, pool.get()); - auto bytes2 = std::make_shared(str2, pool.get()); - std::vector mem_segs({MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/2, - /*num_bytes=*/str1.length() + str2.length() - 2); - writer.WriteString(0, binary_string); - writer.Complete(); + // test copy + BinaryRow copied(arity); + row.Copy(&copied, pool.get()); + AssertTestWriterRow(copied); - ASSERT_EQ(row.GetString(0).ToString(), "rive not to be a success, but rather to be of value."); + ASSERT_EQ(copied, row); + AssertTestWriterRow(row); } TEST_F(BinaryRowTest, TestWriteString) { @@ -554,11 +527,9 @@ TEST_F(BinaryRowTest, TestBinaryRowSerializer) { srand(static_cast(time(nullptr))); auto pool = GetDefaultPool(); BinaryRow row(3); - int32_t bytes_size = 1024; - std::shared_ptr bytes = Bytes::AllocateBytes(1024, pool.get()); - for (int32_t i = 0; i < bytes_size; i++) { - (*bytes)[i] = paimon::test::RandomNumber(0, 255); - } + BinaryRowWriter writer(&row, /*initial_size=*/1024, pool.get()); + writer.WriteInt(0, paimon::test::RandomNumber(0, 2000000)); + int32_t str_size = 1024; std::string test_string1, test_string2; test_string1.reserve(str_size); @@ -567,43 +538,22 @@ TEST_F(BinaryRowTest, TestBinaryRowSerializer) { test_string1 += static_cast(paimon::test::RandomNumber(0, 25) + 'a'); test_string2 += static_cast(paimon::test::RandomNumber(0, 25) + 'a'); } - std::shared_ptr bytes1 = Bytes::AllocateBytes(test_string1, pool.get()); - std::shared_ptr bytes2 = Bytes::AllocateBytes(test_string2, pool.get()); - std::vector segs = {MemorySegment::Wrap(bytes), MemorySegment::Wrap(bytes1), - MemorySegment::Wrap(bytes2)}; - row.PointTo(segs, 0, bytes_size + test_string1.size() + test_string2.size()); + writer.WriteStringView(1, std::string_view(test_string1)); + writer.WriteStringView(2, std::string_view(test_string2)); + writer.Complete(); - BinaryRowSerializer row_serializer_(3, pool); + BinaryRowSerializer row_serializer(3, pool); MemorySegmentOutputStream out(MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool); - ASSERT_OK(row_serializer_.Serialize(row, &out)); + ASSERT_OK(row_serializer.Serialize(row, &out)); PAIMON_UNIQUE_PTR bytes_serialize = MemorySegmentUtils::CopyToBytes(out.Segments(), 0, out.CurrentSize(), pool.get()); - std::string str_serialize = std::string(bytes_serialize->data(), bytes_serialize->size()); - ASSERT_GE(str_serialize.size(), bytes_size + test_string1.size() + test_string2.size()); - std::string str_serialize1 = str_serialize.substr( - str_serialize.size() - test_string1.size() - test_string2.size(), test_string1.size()); - std::string str_serialize2 = - str_serialize.substr(str_serialize.size() - test_string2.size(), test_string2.size()); - ASSERT_EQ(test_string1, str_serialize1); - ASSERT_EQ(test_string2, str_serialize2); -} - -TEST_F(BinaryRowTest, TestWriteWithMultiSegments) { - auto pool = GetDefaultPool(); - std::string str1 = "Strive not to be a success, "; - std::string str2 = "but rather to be of value."; - auto bytes1 = std::make_shared(str1, pool.get()); - auto bytes2 = std::make_shared(str2, pool.get()); - std::vector mem_segs({MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/0, - /*num_bytes=*/str1.length() + str2.length()); - ASSERT_EQ(str1 + str2, binary_string.ToString()); - - BinaryRow row(1); - BinaryRowWriter writer(&row, 0, pool.get()); - writer.WriteString(0, binary_string); - writer.Complete(); - ASSERT_EQ(str1 + str2, row.GetString(0).ToString()); + ASSERT_GE(bytes_serialize->size(), sizeof(int32_t) + test_string1.size() + test_string2.size()); + DataInputStream input( + std::make_shared(bytes_serialize->data(), bytes_serialize->size())); + ASSERT_OK_AND_ASSIGN(auto de_row, row_serializer.Deserialize(&input)); + ASSERT_EQ(de_row, row); + ASSERT_EQ(de_row.GetString(1).ToString(), test_string1); + ASSERT_EQ(de_row.GetString(2).ToString(), test_string2); } } // namespace paimon::test diff --git a/src/paimon/common/data/binary_row_writer.cpp b/src/paimon/common/data/binary_row_writer.cpp index 014b05a8..d3fd49dd 100644 --- a/src/paimon/common/data/binary_row_writer.cpp +++ b/src/paimon/common/data/binary_row_writer.cpp @@ -103,9 +103,7 @@ Result BinaryRowWriter::CreateFieldSetter( field_setter = [field_idx](const VariantType& field, BinaryRowWriter* writer) -> void { const auto* view = DataDefine::GetVariantPtr(field); if (view) { - return writer->WriteString( - field_idx, - BinaryString::FromString(std::string(*view), GetDefaultPool().get())); + return writer->WriteStringView(field_idx, *view); } return writer->WriteString(field_idx, DataDefine::GetVariantValue(field)); @@ -116,8 +114,7 @@ Result BinaryRowWriter::CreateFieldSetter( field_setter = [field_idx](const VariantType& field, BinaryRowWriter* writer) -> void { const auto* view = DataDefine::GetVariantPtr(field); if (view) { - return writer->WriteBinary(field_idx, - Bytes(std::string(*view), GetDefaultPool().get())); + return writer->WriteStringView(field_idx, *view); } return writer->WriteBinary( field_idx, *DataDefine::GetVariantValue>(field)); diff --git a/src/paimon/common/data/binary_row_writer_test.cpp b/src/paimon/common/data/binary_row_writer_test.cpp index eb61043f..c7836659 100644 --- a/src/paimon/common/data/binary_row_writer_test.cpp +++ b/src/paimon/common/data/binary_row_writer_test.cpp @@ -151,10 +151,10 @@ TEST(BinaryRowWriterTest, TestFieldSetter) { ASSERT_OK_AND_ASSIGN(InternalRow::FieldGetterFunc getter6, InternalRow::CreateFieldGetter(6, arrow::float64(), /*use_view=*/true)); ASSERT_OK_AND_ASSIGN(InternalRow::FieldGetterFunc getter7, - InternalRow::CreateFieldGetter(7, arrow::utf8(), /*use_view=*/false)); + InternalRow::CreateFieldGetter(7, arrow::utf8(), /*use_view=*/true)); ASSERT_OK_AND_ASSIGN(InternalRow::FieldGetterFunc getter8, InternalRow::CreateFieldGetter(8, arrow::binary(), - /*use_view=*/false)); + /*use_view=*/true)); ASSERT_OK_AND_ASSIGN(InternalRow::FieldGetterFunc getter9, InternalRow::CreateFieldGetter(9, arrow::date32(), /*use_view=*/true)); ASSERT_OK_AND_ASSIGN( @@ -214,9 +214,8 @@ TEST(BinaryRowWriterTest, TestFieldSetter) { ASSERT_EQ(DataDefine::GetVariantValue(getter5(row)), static_cast(5.5)); ASSERT_EQ(DataDefine::GetVariantValue(getter6(row)), static_cast(6.66)); - ASSERT_EQ(DataDefine::GetVariantValue(getter7(row)).ToString(), data); - ASSERT_EQ(*DataDefine::GetVariantValue>(getter8(row)), - Bytes(data, pool.get())); + ASSERT_EQ(DataDefine::GetVariantValue(getter7(row)), std::string_view(data)); + ASSERT_EQ(DataDefine::GetVariantValue(getter8(row)), std::string_view(data)); ASSERT_EQ(DataDefine::GetVariantValue(getter9(row)), 9); ASSERT_EQ(DataDefine::GetVariantValue(getter10(row)), Decimal(5, 2, 123)); ASSERT_EQ(DataDefine::GetVariantValue(getter11(row)), Timestamp(11, 12)); diff --git a/src/paimon/common/data/binary_section.cpp b/src/paimon/common/data/binary_section.cpp index d08846cb..015914be 100644 --- a/src/paimon/common/data/binary_section.cpp +++ b/src/paimon/common/data/binary_section.cpp @@ -25,19 +25,19 @@ bool BinarySection::operator==(const BinarySection& that) const { return true; } return size_in_bytes_ == that.size_in_bytes_ && - MemorySegmentUtils::Equals(segments_, offset_, that.segments_, that.offset_, + MemorySegmentUtils::Equals({segment_}, offset_, {that.segment_}, that.offset_, size_in_bytes_); } std::shared_ptr BinarySection::ToBytes(MemoryPool* pool) const { - return MemorySegmentUtils::GetBytes(segments_, offset_, size_in_bytes_, pool); + return MemorySegmentUtils::GetBytes({segment_}, offset_, size_in_bytes_, pool); } int32_t BinarySection::HashCode() const { - return MemorySegmentUtils::Hash(segments_, offset_, size_in_bytes_, GetDefaultPool().get()); + return MemorySegmentUtils::Hash({segment_}, offset_, size_in_bytes_, GetDefaultPool().get()); } -PAIMON_UNIQUE_PTR BinarySection::ReadBinary(const std::vector& segments, +PAIMON_UNIQUE_PTR BinarySection::ReadBinary(const MemorySegment& segment, int32_t base_offset, int32_t field_offset, int64_t variable_part_offset_and_len, MemoryPool* pool) { @@ -45,15 +45,15 @@ PAIMON_UNIQUE_PTR BinarySection::ReadBinary(const std::vector(variable_part_offset_and_len >> 32); const auto len = static_cast(variable_part_offset_and_len); - return MemorySegmentUtils::CopyToBytes(segments, base_offset + sub_offset, len, pool); + return MemorySegmentUtils::CopyToBytes({segment}, base_offset + sub_offset, len, pool); } else { auto len = static_cast( (variable_part_offset_and_len & BinarySection::HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56); if ((SystemByteOrder() == ByteOrder::PAIMON_LITTLE_ENDIAN)) { - return MemorySegmentUtils::CopyToBytes(segments, field_offset, len, pool); + return MemorySegmentUtils::CopyToBytes({segment}, field_offset, len, pool); } else { // field_offset + 1 to skip header. - return MemorySegmentUtils::CopyToBytes(segments, field_offset + 1, len, pool); + return MemorySegmentUtils::CopyToBytes({segment}, field_offset + 1, len, pool); } } } diff --git a/src/paimon/common/data/binary_section.h b/src/paimon/common/data/binary_section.h index d893a005..8b6a016a 100644 --- a/src/paimon/common/data/binary_section.h +++ b/src/paimon/common/data/binary_section.h @@ -26,12 +26,15 @@ #include "paimon/visibility.h" namespace paimon { -/// Describe a section of memory. +/// Describe a section of a single MemorySegment. +/// +/// @note: Unlike the Java implementation where data may span multiple MemorySegments, +/// in this C++ implementation all data resides within a single MemorySegment. class PAIMON_EXPORT BinarySection { public: BinarySection() = default; - BinarySection(const std::vector& segments, int32_t offset, int32_t size_in_bytes) - : segments_(segments), offset_(offset), size_in_bytes_(size_in_bytes) {} + BinarySection(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) + : segment_(segment), offset_(offset), size_in_bytes_(size_in_bytes) {} virtual ~BinarySection() = default; /// It decides whether to put data in FixLenPart or VarLenPart. See more in `/// BinaryRow`. /// If len is less than 8, its binary format is: 1-bit mark(1) = 1, 7-bits len, and @@ -58,27 +61,21 @@ class PAIMON_EXPORT BinarySection { /// @param base_offset base offset of composite binary format. /// @param field_offset absolute start offset of variable_part_offset_and_len. /// @param variable_part_offset_and_len a long value, real data or offset and len. - static PAIMON_UNIQUE_PTR ReadBinary(const std::vector& segments, - int32_t base_offset, int32_t field_offset, + static PAIMON_UNIQUE_PTR ReadBinary(const MemorySegment& segment, int32_t base_offset, + int32_t field_offset, int64_t variable_part_offset_and_len, MemoryPool* pool); bool operator==(const BinarySection& that) const; virtual void PointTo(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) { - std::vector segments = {segment}; - PointTo(segments, offset, size_in_bytes); - } - - virtual void PointTo(const std::vector& segments, int32_t offset, - int32_t size_in_bytes) { - segments_ = segments; + segment_ = segment; offset_ = offset; size_in_bytes_ = size_in_bytes; } - const std::vector& GetSegments() const { - return segments_; + const MemorySegment& GetSegment() const { + return segment_; } int32_t GetOffset() const { @@ -94,7 +91,7 @@ class PAIMON_EXPORT BinarySection { virtual int32_t HashCode() const; protected: - std::vector segments_; + MemorySegment segment_; int32_t offset_ = 0; int32_t size_in_bytes_ = 0; }; diff --git a/src/paimon/common/data/binary_section_test.cpp b/src/paimon/common/data/binary_section_test.cpp index 1a4c08a8..64cc1e72 100644 --- a/src/paimon/common/data/binary_section_test.cpp +++ b/src/paimon/common/data/binary_section_test.cpp @@ -26,18 +26,16 @@ class BinarySectionTest : public ::testing::Test { void SetUp() override { pool_ = GetDefaultPool(); - segments_ = {MemorySegment::AllocateHeapMemory(1024, pool_.get()), - MemorySegment::AllocateHeapMemory(1024, pool_.get())}; - for (int i = 0; i < 1024; ++i) { - segments_[0].Put(i, static_cast(i)); - segments_[1].Put(i, static_cast(i + 1024)); + segment_ = MemorySegment::AllocateHeapMemory(2048, pool_.get()); + for (int i = 0; i < 2048; ++i) { + segment_.Put(i, static_cast(i % 128)); } offset_ = 0; size_in_bytes_ = 2048; - binary_section_ = BinarySection(segments_, offset_, size_in_bytes_); + binary_section_ = BinarySection(segment_, offset_, size_in_bytes_); } - std::vector segments_; + MemorySegment segment_; int32_t offset_; int32_t size_in_bytes_; BinarySection binary_section_; @@ -45,10 +43,10 @@ class BinarySectionTest : public ::testing::Test { }; TEST_F(BinarySectionTest, EqualityOperator) { - BinarySection other(segments_, offset_, size_in_bytes_); + BinarySection other(segment_, offset_, size_in_bytes_); EXPECT_TRUE(binary_section_ == other); - BinarySection different(segments_, offset_, size_in_bytes_ - 1); + BinarySection different(segment_, offset_, size_in_bytes_ - 1); EXPECT_FALSE(binary_section_ == different); } @@ -56,7 +54,7 @@ TEST_F(BinarySectionTest, ToBytes) { auto bytes = binary_section_.ToBytes(pool_.get()); ASSERT_EQ(bytes->size(), size_in_bytes_); for (int i = 0; i < size_in_bytes_; ++i) { - EXPECT_EQ(static_cast(bytes->data()[i]), static_cast(i % 1024)); + EXPECT_EQ(static_cast(bytes->data()[i]), static_cast(i % 128)); } } @@ -68,10 +66,10 @@ TEST_F(BinarySectionTest, HashCode) { TEST_F(BinarySectionTest, ReadBinary) { int64_t variable_part_offset_and_len = (static_cast(offset_) << 32) | size_in_bytes_; auto bytes = - BinarySection::ReadBinary(segments_, 0, offset_, variable_part_offset_and_len, pool_.get()); + BinarySection::ReadBinary(segment_, 0, offset_, variable_part_offset_and_len, pool_.get()); ASSERT_EQ(bytes->size(), size_in_bytes_); for (int i = 0; i < size_in_bytes_; ++i) { - EXPECT_EQ(static_cast(bytes->data()[i]), static_cast(i % 1024)); + EXPECT_EQ(static_cast(bytes->data()[i]), static_cast(i % 128)); } } diff --git a/src/paimon/common/data/binary_string.cpp b/src/paimon/common/data/binary_string.cpp index aed44e41..685adb21 100644 --- a/src/paimon/common/data/binary_string.cpp +++ b/src/paimon/common/data/binary_string.cpp @@ -30,20 +30,19 @@ const BinaryString& BinaryString::EmptyUtf8() { return empty_utf8; } -BinaryString::BinaryString(const std::vector& segments, int32_t offset, - int32_t size_in_bytes) - : BinarySection(segments, offset, size_in_bytes) {} +BinaryString::BinaryString(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes) + : BinarySection(segment, offset, size_in_bytes) {} BinaryString::BinaryString() { std::shared_ptr bytes = Bytes::AllocateBytes(0, GetDefaultPool().get()); - segments_ = {MemorySegment::Wrap(bytes)}; + segment_ = MemorySegment::Wrap(bytes); offset_ = 0; size_in_bytes_ = bytes->size(); } -BinaryString BinaryString::FromAddress(const std::vector& segments, int32_t offset, +BinaryString BinaryString::FromAddress(const MemorySegment& segment, int32_t offset, int32_t num_bytes) { - return BinaryString(segments, offset, num_bytes); + return BinaryString(segment, offset, num_bytes); } BinaryString BinaryString::FromString(const std::string& str, MemoryPool* pool) { @@ -57,8 +56,7 @@ BinaryString BinaryString::FromBytes(const std::shared_ptr& bytes) { BinaryString BinaryString::FromBytes(const std::shared_ptr& bytes, int32_t offset, int32_t num_bytes) { - std::vector segs = {MemorySegment::Wrap(bytes)}; - return BinaryString(segs, offset, num_bytes); + return BinaryString(MemorySegment::Wrap(bytes), offset, num_bytes); } BinaryString BinaryString::BlankString(int32_t length, MemoryPool* pool) { @@ -69,7 +67,7 @@ BinaryString BinaryString::BlankString(int32_t length, MemoryPool* pool) { std::string BinaryString::ToString() const { std::string ret(size_in_bytes_, '\0'); - MemorySegmentUtils::CopyToBytes(segments_, offset_, &ret, 0, size_in_bytes_); + MemorySegmentUtils::CopyToBytes({segment_}, offset_, &ret, 0, size_in_bytes_); return ret; } @@ -94,204 +92,67 @@ int32_t BinaryString::NumBytesForFirstByte(char b) { } int32_t BinaryString::CompareTo(const BinaryString& other) const { - if (segments_.size() == 1 && other.segments_.size() == 1) { - int32_t len = std::min(size_in_bytes_, other.size_in_bytes_); - const auto& seg1 = segments_[0]; - const auto& seg2 = other.segments_[0]; - for (int32_t i = 0; i < len; i++) { - int32_t res = (seg1.Get(offset_ + i) & 0xFF) - (seg2.Get(other.offset_ + i) & 0xFF); - if (res != 0) { - return res; - } - } - return size_in_bytes_ - other.size_in_bytes_; - } - return CompareMultiSegments(other); -} - -int32_t BinaryString::CompareMultiSegments(const BinaryString& other) const { - if (size_in_bytes_ == 0 || other.size_in_bytes_ == 0) { - return size_in_bytes_ - other.size_in_bytes_; - } - int32_t len = std::min(size_in_bytes_, other.size_in_bytes_); - - MemorySegment seg1 = segments_[0]; - MemorySegment seg2 = other.segments_[0]; - - int32_t segment_size = segments_[0].Size(); - int32_t other_segment_size = other.segments_[0].Size(); - - int32_t size_of_first1 = segment_size - offset_; - int32_t size_of_first2 = other_segment_size - other.offset_; - - int32_t var_seg_index1 = 1; - int32_t var_seg_index2 = 1; - - // find the first segment of this string. - while (size_of_first1 <= 0) { - size_of_first1 += segment_size; - seg1 = segments_[var_seg_index1++]; - } - - while (size_of_first2 <= 0) { - size_of_first2 += other_segment_size; - seg2 = other.segments_[var_seg_index2++]; - } - - int32_t offset1 = segment_size - size_of_first1; - int32_t offset2 = other_segment_size - size_of_first2; - - int32_t need_compare = std::min(std::min(size_of_first1, size_of_first2), len); - while (need_compare > 0) { - // compare in one segment. - for (int32_t i = 0; i < need_compare; i++) { - int32_t res = (seg1.Get(offset1 + i) & 0xFF) - (seg2.Get(offset2 + i) & 0xFF); - if (res != 0) { - return res; - } - } - if (need_compare == len) { - break; + for (int32_t i = 0; i < len; i++) { + int32_t res = + (segment_.Get(offset_ + i) & 0xFF) - (other.segment_.Get(other.offset_ + i) & 0xFF); + if (res != 0) { + return res; } - len -= need_compare; - // next segment - if (size_of_first1 < size_of_first2) { // I am smaller - seg1 = segments_[var_seg_index1++]; - offset1 = 0; - offset2 += need_compare; - size_of_first1 = segment_size; - size_of_first2 -= need_compare; - } else if (size_of_first1 > size_of_first2) { // other is smaller - seg2 = other.segments_[var_seg_index2++]; - offset2 = 0; - offset1 += need_compare; - size_of_first2 = other_segment_size; - size_of_first1 -= need_compare; - } else { // same, should go ahead both. - seg1 = segments_[var_seg_index1++]; - seg2 = other.segments_[var_seg_index2++]; - offset1 = 0; - offset2 = 0; - size_of_first1 = segment_size; - size_of_first2 = other_segment_size; - } - need_compare = std::min(std::min(size_of_first1, size_of_first2), len); } - - assert(need_compare == len); return size_in_bytes_ - other.size_in_bytes_; } int32_t BinaryString::NumChars() const { - if (InFirstSegment()) { - int32_t len = 0; - for (int32_t i = 0; i < size_in_bytes_; i += NumBytesForFirstByte(GetByteOneSegment(i))) { - len++; - } - return len; - } else { - return NumCharsMultiSegs(); - } -} - -int32_t BinaryString::NumCharsMultiSegs() const { int32_t len = 0; - int32_t seg_size = segments_[0].Size(); - BinaryString::SegmentAndOffset index = FirstSegmentAndOffset(seg_size); - int32_t i = 0; - while (i < size_in_bytes_) { - int32_t char_bytes = NumBytesForFirstByte(index.Value()); - i += char_bytes; + for (int32_t i = 0; i < size_in_bytes_; i += NumBytesForFirstByte(GetByteOneSegment(i))) { len++; - index.SkipBytes(char_bytes, seg_size); } return len; } char BinaryString::ByteAt(int32_t index) const { - int32_t global_offset = offset_ + index; - int32_t size = segments_[0].Size(); - if (global_offset < size) { - return segments_[0].Get(global_offset); - } else { - return segments_[global_offset / size].Get(global_offset % size); - } + return segment_.Get(offset_ + index); } BinaryString BinaryString::Copy(MemoryPool* pool) const { std::shared_ptr copy = - MemorySegmentUtils::CopyToBytes(segments_, offset_, size_in_bytes_, pool); + MemorySegmentUtils::CopyToBytes({segment_}, offset_, size_in_bytes_, pool); return FromBytes(copy); } -BinaryString BinaryString::SubStringMultiSegs(const int32_t start, const int32_t until, - MemoryPool* pool) const { - int32_t seg_size = segments_[0].Size(); - SegmentAndOffset index = FirstSegmentAndOffset(seg_size); +BinaryString BinaryString::Substring(int32_t begin_index, int32_t end_index, + MemoryPool* pool) const { + if (end_index <= begin_index || begin_index >= size_in_bytes_) { + return EmptyUtf8(); + } int32_t i = 0; int32_t c = 0; - while (i < size_in_bytes_ && c < start) { - int32_t char_size = NumBytesForFirstByte(index.Value()); - i += char_size; - index.SkipBytes(char_size, seg_size); + while (i < size_in_bytes_ && c < begin_index) { + i += NumBytesForFirstByte(segment_.Get(i + offset_)); c += 1; } int32_t j = i; - while (i < size_in_bytes_ && c < until) { - int32_t char_size = NumBytesForFirstByte(index.Value()); - i += char_size; - index.SkipBytes(char_size, seg_size); + while (i < size_in_bytes_ && c < end_index) { + i += NumBytesForFirstByte(segment_.Get(i + offset_)); c += 1; } if (i > j) { - std::shared_ptr bytes = - MemorySegmentUtils::CopyToBytes(segments_, offset_ + j, i - j, pool); + std::shared_ptr bytes = Bytes::AllocateBytes(i - j, pool); + segment_.Get(offset_ + j, bytes.get(), 0, i - j); return FromBytes(bytes); } else { return EmptyUtf8(); } } -BinaryString BinaryString::Substring(int32_t begin_index, int32_t end_index, - MemoryPool* pool) const { - if (end_index <= begin_index || begin_index >= size_in_bytes_) { - return EmptyUtf8(); - } - if (InFirstSegment()) { - MemorySegment segment = segments_[0]; - int32_t i = 0; - int32_t c = 0; - while (i < size_in_bytes_ && c < begin_index) { - i += NumBytesForFirstByte(segment.Get(i + offset_)); - c += 1; - } - - int32_t j = i; - while (i < size_in_bytes_ && c < end_index) { - i += NumBytesForFirstByte(segment.Get(i + offset_)); - c += 1; - } - - if (i > j) { - std::shared_ptr bytes = Bytes::AllocateBytes(i - j, pool); - segment.Get(offset_ + j, bytes.get(), 0, i - j); - return FromBytes(bytes); - } else { - return EmptyUtf8(); - } - } else { - return SubStringMultiSegs(begin_index, end_index, pool); - } -} - bool BinaryString::Contains(const BinaryString& s) const { if (s.size_in_bytes_ == 0) { return true; } - int32_t find = MemorySegmentUtils::Find(segments_, offset_, size_in_bytes_, s.segments_, + int32_t find = MemorySegmentUtils::Find({segment_}, offset_, size_in_bytes_, {s.segment_}, s.offset_, s.size_in_bytes_); return find != -1; } @@ -308,59 +169,26 @@ int32_t BinaryString::IndexOf(const BinaryString& str, int32_t from_index) const if (str.size_in_bytes_ == 0) { return 0; } - if (InFirstSegment()) { - // position in byte - int32_t byte_idx = 0; - // position is char - int32_t char_idx = 0; - while (byte_idx < size_in_bytes_ && char_idx < from_index) { - byte_idx += NumBytesForFirstByte(GetByteOneSegment(byte_idx)); - char_idx++; - } - do { - if (byte_idx + str.size_in_bytes_ > size_in_bytes_) { - return -1; - } - if (MemorySegmentUtils::Equals(segments_, offset_ + byte_idx, str.segments_, - str.offset_, str.size_in_bytes_)) { - return char_idx; - } - byte_idx += NumBytesForFirstByte(GetByteOneSegment(byte_idx)); - char_idx++; - } while (byte_idx < size_in_bytes_); - - return -1; - } else { - return IndexOfMultiSegs(str, from_index); - } -} - -int32_t BinaryString::IndexOfMultiSegs(const BinaryString& str, int32_t from_index) const { // position in byte int32_t byte_idx = 0; // position is char int32_t char_idx = 0; - int32_t seg_size = segments_[0].Size(); - SegmentAndOffset index = FirstSegmentAndOffset(seg_size); while (byte_idx < size_in_bytes_ && char_idx < from_index) { - int32_t char_bytes = NumBytesForFirstByte(index.Value()); - byte_idx += char_bytes; + byte_idx += NumBytesForFirstByte(GetByteOneSegment(byte_idx)); char_idx++; - index.SkipBytes(char_bytes, seg_size); } do { if (byte_idx + str.size_in_bytes_ > size_in_bytes_) { return -1; } - if (MemorySegmentUtils::Equals(segments_, offset_ + byte_idx, str.segments_, str.offset_, + if (MemorySegmentUtils::Equals({segment_}, offset_ + byte_idx, {str.segment_}, str.offset_, str.size_in_bytes_)) { return char_idx; } - int32_t char_bytes = NumBytesForFirstByte(index.segment_->Get(index.offset_)); - byte_idx += char_bytes; + byte_idx += NumBytesForFirstByte(GetByteOneSegment(byte_idx)); char_idx++; - index.SkipBytes(char_bytes, seg_size); } while (byte_idx < size_in_bytes_); + return -1; } @@ -368,7 +196,7 @@ BinaryString BinaryString::ToUpperCase(MemoryPool* pool) const { if (size_in_bytes_ == 0) { return EmptyUtf8(); } - int32_t size = segments_[0].Size(); + int32_t size = segment_.Size(); SegmentAndOffset segment_and_offset = StartSegmentAndOffset(size); std::shared_ptr bytes = Bytes::AllocateBytes(size_in_bytes_, pool); (*bytes)[0] = tolower(segment_and_offset.Value()); @@ -399,7 +227,7 @@ BinaryString BinaryString::ToLowerCase(MemoryPool* pool) const { if (size_in_bytes_ == 0) { return EmptyUtf8(); } - int32_t size = segments_[0].Size(); + int32_t size = segment_.Size(); SegmentAndOffset segment_and_offset = StartSegmentAndOffset(size); std::shared_ptr bytes = Bytes::AllocateBytes(size_in_bytes_, pool); (*bytes)[0] = tolower(segment_and_offset.Value()); @@ -427,50 +255,31 @@ BinaryString BinaryString::CppToLowerCase(MemoryPool* pool) const { } char BinaryString::GetByteOneSegment(int32_t i) const { - return segments_[0].Get(offset_ + i); -} - -bool BinaryString::InFirstSegment() const { - return size_in_bytes_ + offset_ <= segments_[0].Size(); -} - -BinaryString::SegmentAndOffset BinaryString::FirstSegmentAndOffset(int32_t seg_size) const { - int32_t seg_index = offset_ / seg_size; - return BinaryString::SegmentAndOffset(segments_, seg_index, offset_ % seg_size); + return segment_.Get(offset_ + i); } BinaryString::SegmentAndOffset BinaryString::StartSegmentAndOffset(int32_t seg_size) const { - return InFirstSegment() ? SegmentAndOffset(segments_, 0, offset_) - : FirstSegmentAndOffset(seg_size); + return BinaryString::SegmentAndOffset(segment_, offset_); } BinaryString BinaryString::CopyBinaryString(int32_t start, int32_t end, MemoryPool* pool) const { int32_t len = end - start + 1; std::shared_ptr new_bytes = Bytes::AllocateBytes(len, pool); - MemorySegmentUtils::CopyToBytes(segments_, offset_ + start, new_bytes.get(), 0, len); - return FromBytes(new_bytes); -} - -BinaryString BinaryString::CopyBinaryStringInOneSeg(int32_t start, int32_t len, - MemoryPool* pool) const { - std::shared_ptr new_bytes = Bytes::AllocateBytes(len, pool); - segments_[0].Get(offset_ + start, new_bytes.get(), 0, len); + MemorySegmentUtils::CopyToBytes({segment_}, offset_ + start, new_bytes.get(), 0, len); return FromBytes(new_bytes); } bool BinaryString::MatchAt(const BinaryString& s, int32_t pos) const { - return (InFirstSegment() && s.InFirstSegment()) ? MatchAtOneSeg(s, pos) : MatchAtVarSeg(s, pos); + return MatchAtOneSeg(s, pos); } bool BinaryString::MatchAtOneSeg(const BinaryString& s, int32_t pos) const { return s.size_in_bytes_ + pos <= size_in_bytes_ && pos >= 0 && - segments_[0].EqualTo(s.segments_[0], offset_ + pos, s.offset_, s.size_in_bytes_); + segment_.EqualTo(s.segment_, offset_ + pos, s.offset_, s.size_in_bytes_); } -bool BinaryString::MatchAtVarSeg(const BinaryString& s, int32_t pos) const { - return s.size_in_bytes_ + pos <= size_in_bytes_ && pos >= 0 && - MemorySegmentUtils::Equals(segments_, offset_ + pos, s.segments_, s.offset_, - s.size_in_bytes_); +std::string_view BinaryString::GetStringView() const { + const auto* bytes = segment_.GetArray(); + return std::string_view(bytes->data() + offset_, size_in_bytes_); } - } // namespace paimon diff --git a/src/paimon/common/data/binary_string.h b/src/paimon/common/data/binary_string.h index 2a9be800..42af1fdb 100644 --- a/src/paimon/common/data/binary_string.h +++ b/src/paimon/common/data/binary_string.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "paimon/common/data/binary_section.h" @@ -32,14 +33,17 @@ namespace paimon { class Bytes; class MemoryPool; -/// A string which is backed by `MemorySegment`s. +/// A string which is backed by a single `MemorySegment`. +/// +/// @note: Unlike the Java implementation where a BinaryString may span multiple +/// MemorySegments, in this C++ implementation all data resides within a single MemorySegment. class PAIMON_EXPORT BinaryString : public BinarySection { public: static const BinaryString& EmptyUtf8(); - BinaryString(const std::vector& segments, int32_t offset, int32_t size_in_bytes); + BinaryString(const MemorySegment& segment, int32_t offset, int32_t size_in_bytes); - static BinaryString FromAddress(const std::vector& segments, int32_t offset, + static BinaryString FromAddress(const MemorySegment& segment, int32_t offset, int32_t num_bytes); static BinaryString FromString(const std::string& str, MemoryPool* pool); @@ -66,12 +70,8 @@ class PAIMON_EXPORT BinaryString : public BinarySection { int32_t CompareTo(const BinaryString& other) const; - /// Find the boundaries of segments, and then compare MemorySegment. - int32_t CompareMultiSegments(const BinaryString& other) const; - /// @return the number of UTF-8 code points in the string. int32_t NumChars() const; - int32_t NumCharsMultiSegs() const; /// Returns the byte value at the specified index. An index ranges from `0` to /// `size_in_bytes_ - 1`. @@ -128,90 +128,48 @@ class PAIMON_EXPORT BinaryString : public BinarySection { /// @return the BinaryString, converted to lowercase. BinaryString ToLowerCase(MemoryPool* pool) const; + std::string_view GetStringView() const; + + // @return copied sub string from [start, end]. + BinaryString CopyBinaryString(int32_t start, int32_t end, MemoryPool* pool) const; + /// @return the number of bytes for a code point with the first byte as b. /// @param b The first byte of a code point static int32_t NumBytesForFirstByte(char b); + private: char GetByteOneSegment(int32_t i) const; - bool InFirstSegment() const; bool MatchAt(const BinaryString& s, int32_t pos) const; bool MatchAtOneSeg(const BinaryString& s, int32_t pos) const; - bool MatchAtVarSeg(const BinaryString& s, int32_t pos) const; - BinaryString CopyBinaryStringInOneSeg(int32_t start, int32_t len, MemoryPool* pool) const; - BinaryString CopyBinaryString(int32_t start, int32_t end, MemoryPool* pool) const; /// CurrentSegment and positionInSegment. class SegmentAndOffset { friend class BinaryString; public: - SegmentAndOffset(const std::vector& segments, int32_t seg_index, - int32_t offset) - : seg_index_(seg_index), - offset_(offset), - segments_(segments), - segment_(segments[seg_index]) {} + SegmentAndOffset(const MemorySegment& segment, int32_t offset) + : offset_(offset), segment_(segment) {} void NextByte(int32_t seg_size) { offset_++; - CheckAdvance(seg_size); } void SkipBytes(int32_t n, int32_t seg_size) { - int32_t remaining = seg_size - offset_; - if (remaining > n) { - offset_ += n; - } else { - while (true) { - int32_t to_skip = std::min(remaining, n); - n -= to_skip; - if (n <= 0) { - offset_ += to_skip; - CheckAdvance(seg_size); - return; - } - Advance(); - remaining = seg_size - offset_; - } - } + offset_ += n; } + char Value() const { - assert(segment_ != std::nullopt); - return segment_.value().Get(offset_); + return segment_.Get(offset_); } private: - int32_t seg_index_; int32_t offset_; - std::vector segments_; - std::optional segment_; - - void AssignSegment() { - segment_ = seg_index_ >= 0 && seg_index_ < static_cast(segments_.size()) - ? std::optional(segments_[seg_index_]) - : std::nullopt; - } - - void CheckAdvance(int32_t seg_size) { - if (offset_ == seg_size) { - Advance(); - } - } - - void Advance() { - seg_index_++; - AssignSegment(); - offset_ = 0; - } + MemorySegment segment_; }; - SegmentAndOffset FirstSegmentAndOffset(int32_t seg_size) const; SegmentAndOffset StartSegmentAndOffset(int32_t seg_size) const; - private: BinaryString(); - BinaryString SubStringMultiSegs(const int32_t start, const int32_t until, - MemoryPool* pool) const; - int32_t IndexOfMultiSegs(const BinaryString& str, int32_t fromIndex) const; + BinaryString CppToLowerCase(MemoryPool* pool) const; BinaryString CppToUpperCase(MemoryPool* pool) const; }; diff --git a/src/paimon/common/data/binary_string_test.cpp b/src/paimon/common/data/binary_string_test.cpp index fdaacb9f..bbdbc4fa 100644 --- a/src/paimon/common/data/binary_string_test.cpp +++ b/src/paimon/common/data/binary_string_test.cpp @@ -17,6 +17,7 @@ #include "paimon/common/data/binary_string.h" #include +#include #include #include #include @@ -28,58 +29,24 @@ namespace paimon::test { class BinaryStringTest : public testing::Test { private: - enum class Mode { ONE_SEG = 0, MULTI_SEGS = 1, STRING = 2, RANDOM = 3 }; - BinaryString FromString(const std::string& str) { auto pool = GetDefaultPool(); - BinaryString string = BinaryString::FromString(str, pool.get()); - if (mode_ == Mode::RANDOM) { - int32_t rnd = std::rand() % 3; - if (rnd == 0) { - mode_ = Mode::ONE_SEG; - } else if (rnd == 1) { - mode_ = Mode::MULTI_SEGS; - } else { - mode_ = Mode::STRING; - } - } - if (mode_ == Mode::STRING) { - return string; - } - if (mode_ == Mode::ONE_SEG || string.GetSizeInBytes() < 2) { - return string; - } else { - int32_t num_bytes = string.GetSizeInBytes(); - int32_t pad = std::rand() % 5; - int32_t num_bytes_with_pad = num_bytes + pad; - int32_t seg_size = num_bytes_with_pad / 2 + 1; - std::shared_ptr bytes1 = Bytes::AllocateBytes(seg_size, pool.get()); - std::shared_ptr bytes2 = Bytes::AllocateBytes(seg_size, pool.get()); - if (seg_size - pad > 0 && num_bytes >= seg_size - pad) { - string.GetSegments()[0].Get(0, bytes1.get(), pad, seg_size - pad); - } - string.GetSegments()[0].Get(seg_size - pad, bytes2.get(), 0, - num_bytes - seg_size + pad); - - std::vector segments = {MemorySegment::Wrap(bytes1), - MemorySegment::Wrap(bytes2)}; - return BinaryString::FromAddress(segments, pad, num_bytes); - } + return BinaryString::FromString(str, pool.get()); } template void InnerCheckEqual(T&& expected, U&& actual) { - ASSERT_EQ(std::forward(expected), std::forward(actual)) << static_cast(mode_); + ASSERT_EQ(std::forward(expected), std::forward(actual)); } template void InnerCheck(T&& expected) { - ASSERT_TRUE(std::forward(expected)) << static_cast(mode_); + ASSERT_TRUE(std::forward(expected)); } template void InnerCheckFalse(T&& expected) { - ASSERT_FALSE(std::forward(expected)) << static_cast(mode_); + ASSERT_FALSE(std::forward(expected)); } void CheckBasic(const std::string& str, int32_t len) { @@ -103,8 +70,6 @@ class BinaryStringTest : public testing::Test { InnerCheck(s2.StartsWith(s2)); InnerCheck(s2.EndsWith(s2)); } - - Mode mode_ = Mode::RANDOM; }; TEST_F(BinaryStringTest, TestBasic) { @@ -147,57 +112,54 @@ TEST_F(BinaryStringTest, TestCompareTo) { InnerCheck(FromString("你好123").CompareTo(FromString("你好122")) > 0); } -TEST_F(BinaryStringTest, TestMultiSegments) { +TEST_F(BinaryStringTest, TestSingleSegment) { // prepare auto pool = GetDefaultPool(); - std::vector segments1; - segments1.push_back(MemorySegment::Wrap(Bytes::AllocateBytes(10, pool.get()))); - segments1.push_back(MemorySegment::Wrap(Bytes::AllocateBytes(10, pool.get()))); - auto bytes1 = Bytes::AllocateBytes("abcde", pool.get()); - auto bytes2 = Bytes::AllocateBytes("aaaaa", pool.get()); - segments1[0].Put(5, *bytes1, 0, 5); - segments1[1].Put(0, *bytes2, 0, 5); - - std::vector segments2; - segments2.push_back(MemorySegment::Wrap(Bytes::AllocateBytes(5, pool.get()))); - segments2.push_back(MemorySegment::Wrap(Bytes::AllocateBytes(5, pool.get()))); - auto bytes3 = Bytes::AllocateBytes("abcde", pool.get()); - auto bytes4 = Bytes::AllocateBytes("b", pool.get()); - segments2[0].Put(0, *bytes3, 0, 5); - segments2[1].Put(0, *bytes4, 0, 1); - - // test go ahead both - BinaryString binary_string1 = BinaryString::FromAddress(segments1, 5, 10); - BinaryString binary_string2 = BinaryString::FromAddress(segments2, 0, 6); + auto bytes1 = Bytes::AllocateBytes(10, pool.get()); + auto src1 = Bytes::AllocateBytes("abcde", pool.get()); + auto src2 = Bytes::AllocateBytes("aaaaa", pool.get()); + std::memcpy(bytes1->data() + 5, src1->data(), 5); + std::memcpy(bytes1->data() + 0, src2->data(), 5); + // bytes1 content: "aaaaaabcde" + // Actually we want "abcdeaaaaa" at offset 5, but that requires 15 bytes. + // Let us use a simpler approach: + std::shared_ptr data1 = Bytes::AllocateBytes("abcdeaaaaa", pool.get()); + MemorySegment seg1 = MemorySegment::Wrap(data1); + + std::shared_ptr data2 = Bytes::AllocateBytes("abcdeb", pool.get()); + MemorySegment seg2 = MemorySegment::Wrap(data2); + + // test compare + BinaryString binary_string1 = BinaryString::FromAddress(seg1, 0, 10); + BinaryString binary_string2 = BinaryString::FromAddress(seg2, 0, 6); InnerCheckEqual(binary_string1.ToString(), "abcdeaaaaa"); InnerCheckEqual(binary_string2.ToString(), "abcdeb"); InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); ASSERT_EQ(binary_string1, binary_string1); ASSERT_TRUE(binary_string1 < binary_string2); - // test needCompare == len - binary_string1 = BinaryString::FromAddress(segments1, 5, 5); - binary_string2 = BinaryString::FromAddress(segments2, 0, 5); + // test equal length compare + binary_string1 = BinaryString::FromAddress(seg1, 0, 5); + binary_string2 = BinaryString::FromAddress(seg2, 0, 5); InnerCheckEqual(binary_string1.ToString(), "abcde"); InnerCheckEqual(binary_string2.ToString(), "abcde"); InnerCheckEqual(binary_string1, binary_string2); - // test find the first segment of this string - binary_string1 = BinaryString::FromAddress(segments1, 10, 5); - binary_string2 = BinaryString::FromAddress(segments2, 0, 5); + // test with offset + std::shared_ptr data3 = Bytes::AllocateBytes("aaaaa", pool.get()); + MemorySegment seg3 = MemorySegment::Wrap(data3); + binary_string1 = BinaryString::FromAddress(seg3, 0, 5); + binary_string2 = BinaryString::FromAddress(seg2, 0, 5); InnerCheckEqual(binary_string1.ToString(), "aaaaa"); InnerCheckEqual(binary_string2.ToString(), "abcde"); InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); - // test go ahead single - std::vector segments3; - segments3.push_back(MemorySegment::Wrap(Bytes::AllocateBytes(10, pool.get()))); - segments3[0].Put(4, Bytes("abcdeb", pool.get()), 0, 6); - - binary_string1 = BinaryString::FromAddress(segments1, 5, 10); - binary_string2 = BinaryString::FromAddress(segments3, 4, 6); - InnerCheckEqual(binary_string1.ToString(), "abcdeaaaaa"); + // test with offset in single segment + std::shared_ptr data4 = Bytes::AllocateBytes(10, pool.get()); + MemorySegment seg4 = MemorySegment::Wrap(data4); + seg4.Put(4, Bytes("abcdeb", pool.get()), 0, 6); + binary_string2 = BinaryString::FromAddress(seg4, 4, 6); InnerCheckEqual(binary_string2.ToString(), "abcdeb"); InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); @@ -243,26 +205,21 @@ TEST_F(BinaryStringTest, TestSubstring) { InnerCheckEqual(FromString("ߵ梷").Substring(0, 2, pool.get()), FromString("ߵ梷")); } -TEST_F(BinaryStringTest, TestSubStringMultiSegsAndCopyBinaryString) { +TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) { auto pool = GetDefaultPool(); - std::string str1 = "hello world!"; - std::string str2 = "nice to meet you!"; - std::shared_ptr bytes1 = Bytes::AllocateBytes(str1, pool.get()); - std::shared_ptr bytes2 = Bytes::AllocateBytes(str2, pool.get()); - std::vector segs = {MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}; - BinaryString binary_string = BinaryString(segs, 0, str1.size() + str2.size()); - int32_t left = str1.size() / 2, right = str1.size() + str2.size() / 2; + std::string combined = "hello world!nice to meet you!"; + std::shared_ptr bytes = Bytes::AllocateBytes(combined, pool.get()); + MemorySegment seg = MemorySegment::Wrap(bytes); + BinaryString binary_string = BinaryString(seg, 0, combined.size()); + int32_t left = 6, right = 20; // Substring [left, right), The right is not included - InnerCheckEqual( - binary_string.Substring(left, right, pool.get()), - FromString(str1.substr(left, str1.size() - left) + str2.substr(0, right - str1.size()))); + InnerCheckEqual(binary_string.Substring(left, right, pool.get()), + FromString(combined.substr(left, right - left))); // CopyBinaryString [left, right], The right is included InnerCheckEqual(binary_string.CopyBinaryString(left, right, pool.get()), - FromString(str1.substr(left, str1.size() - left) + - str2.substr(0, right - str1.size() + 1))); - InnerCheckEqual(binary_string.CopyBinaryStringInOneSeg(0, str1.size(), pool.get()), - FromString(str1)); + FromString(combined.substr(left, right - left + 1))); + InnerCheckEqual(binary_string.CopyBinaryString(0, 11, pool.get()), FromString("hello world!")); } TEST_F(BinaryStringTest, TestIndexOf) { @@ -282,17 +239,14 @@ TEST_F(BinaryStringTest, TestIndexOf) { } { auto pool = GetDefaultPool(); - std::string str1 = "Strive not to be a success, "; - std::string str2 = "but rather to be of value."; - auto bytes1 = std::make_shared(str1, pool.get()); - auto bytes2 = std::make_shared(str2, pool.get()); - std::vector mem_segs( - {MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/0, - /*num_bytes=*/str1.length() + str2.length()); - ASSERT_EQ(str1 + str2, binary_string.ToString()); - InnerCheckEqual(binary_string.IndexOf(FromString("value"), 0), str1.length() + 20); - InnerCheckEqual(binary_string.IndexOf(FromString("value"), 5), str1.length() + 20); + std::string combined = "Strive not to be a success, but rather to be of value."; + auto bytes = std::make_shared(combined, pool.get()); + MemorySegment seg = MemorySegment::Wrap(bytes); + auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, + /*num_bytes=*/combined.length()); + ASSERT_EQ(combined, binary_string.ToString()); + InnerCheckEqual(binary_string.IndexOf(FromString("value"), 0), 48); + InnerCheckEqual(binary_string.IndexOf(FromString("value"), 5), 48); InnerCheckEqual(binary_string.IndexOf(FromString("vvalue"), 0), -1); InnerCheckEqual(binary_string.IndexOf(FromString("!"), 0), -1); } @@ -317,16 +271,9 @@ TEST_F(BinaryStringTest, TestEmptyString) { BinaryString str3; auto pool = GetDefaultPool(); { - std::vector segments; - std::shared_ptr bytes0 = Bytes::AllocateBytes(10, pool.get()); - std::shared_ptr bytes1 = Bytes::AllocateBytes(10, pool.get()); - - MemorySegment segments0 = MemorySegment::Wrap(bytes0); - MemorySegment segments1 = MemorySegment::Wrap(bytes1); - segments.push_back(segments0); - segments.push_back(segments1); - str3 = BinaryString::FromAddress(segments, /*offset=*/15, /*num_bytes=*/0); + MemorySegment seg0 = MemorySegment::Wrap(bytes0); + str3 = BinaryString::FromAddress(seg0, /*offset=*/5, /*num_bytes=*/0); } InnerCheck(BinaryString::EmptyUtf8().CompareTo(str2) < 0); @@ -377,31 +324,28 @@ TEST_F(BinaryStringTest, TestCopy) { TEST_F(BinaryStringTest, TestByteAt) { auto pool = GetDefaultPool(); - std::string str1 = "hello"; - std::string str2 = "world!"; - auto bytes1 = std::make_shared(str1, pool.get()); - auto bytes2 = std::make_shared(str2, pool.get()); - std::vector mem_segs({MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/2, - /*num_bytes=*/str1.length() + str2.length() - 2); + std::string combined = "helloworld!"; + auto bytes = std::make_shared(combined, pool.get()); + MemorySegment seg = MemorySegment::Wrap(bytes); + auto binary_string = BinaryString::FromAddress(seg, /*offset=*/2, + /*num_bytes=*/combined.length() - 2); ASSERT_EQ(binary_string.ByteAt(0), 'l'); ASSERT_EQ(binary_string.ByteAt(5), 'r'); } TEST_F(BinaryStringTest, TestNumChars) { auto pool = GetDefaultPool(); - auto bytes1 = std::make_shared("hello", pool.get()); - auto bytes2 = std::make_shared("world", pool.get()); { - std::vector mem_segs({MemorySegment::Wrap(bytes1)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/0, + auto bytes = std::make_shared("hello", pool.get()); + MemorySegment seg = MemorySegment::Wrap(bytes); + auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/5); ASSERT_EQ(5, binary_string.NumChars()); } { - std::vector mem_segs( - {MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string = BinaryString::FromAddress(mem_segs, /*offset=*/0, + auto bytes = std::make_shared("helloworld", pool.get()); + MemorySegment seg = MemorySegment::Wrap(bytes); + auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/10); ASSERT_EQ(10, binary_string.NumChars()); } @@ -412,29 +356,27 @@ TEST_F(BinaryStringTest, TestMatchAt) { { // abc std::shared_ptr bytes1 = Bytes::AllocateBytes("abc", pool.get()); - std::vector mem_segs1({MemorySegment::Wrap(bytes1)}); - auto binary_string1 = BinaryString::FromAddress(mem_segs1, /*offset=*/0, + MemorySegment seg1 = MemorySegment::Wrap(bytes1); + auto binary_string1 = BinaryString::FromAddress(seg1, /*offset=*/0, /*num_bytes=*/3); // bc std::shared_ptr bytes2 = Bytes::AllocateBytes("bc", pool.get()); - std::vector mem_segs2({MemorySegment::Wrap(bytes2)}); - auto binary_string2 = BinaryString::FromAddress(mem_segs2, /*offset=*/0, + MemorySegment seg2 = MemorySegment::Wrap(bytes2); + auto binary_string2 = BinaryString::FromAddress(seg2, /*offset=*/0, /*num_bytes=*/2); ASSERT_TRUE(binary_string1.MatchAt(binary_string2, /*pos=*/1)); ASSERT_FALSE(binary_string1.MatchAt(binary_string2, /*pos=*/0)); } { // abcdef - std::shared_ptr bytes1 = Bytes::AllocateBytes("abc", pool.get()); - std::shared_ptr bytes2 = Bytes::AllocateBytes("def", pool.get()); - std::vector mem_segs1( - {MemorySegment::Wrap(bytes1), MemorySegment::Wrap(bytes2)}); - auto binary_string1 = BinaryString::FromAddress(mem_segs1, /*offset=*/0, + std::shared_ptr bytes1 = Bytes::AllocateBytes("abcdef", pool.get()); + MemorySegment seg1 = MemorySegment::Wrap(bytes1); + auto binary_string1 = BinaryString::FromAddress(seg1, /*offset=*/0, /*num_bytes=*/6); // bc - std::shared_ptr bytes3 = Bytes::AllocateBytes("bc", pool.get()); - std::vector mem_segs2({MemorySegment::Wrap(bytes3)}); - auto binary_string2 = BinaryString::FromAddress(mem_segs2, /*offset=*/0, + std::shared_ptr bytes2 = Bytes::AllocateBytes("bc", pool.get()); + MemorySegment seg2 = MemorySegment::Wrap(bytes2); + auto binary_string2 = BinaryString::FromAddress(seg2, /*offset=*/0, /*num_bytes=*/2); ASSERT_TRUE(binary_string1.MatchAt(binary_string2, /*pos=*/1)); ASSERT_FALSE(binary_string1.MatchAt(binary_string2, /*pos=*/0)); diff --git a/src/paimon/common/data/data_define.h b/src/paimon/common/data/data_define.h index 4637949b..28ae57e2 100644 --- a/src/paimon/common/data/data_define.h +++ b/src/paimon/common/data/data_define.h @@ -69,6 +69,18 @@ class DataDefine { return std::get_if(&value); } + // always make sure value is string_view or std::shared_ptr or BinaryString + inline static std::string_view GetStringView(const VariantType& value) { + if (auto* view_ptr = GetVariantPtr(value)) { + return *view_ptr; + } else if (auto* bytes_ptr = GetVariantPtr>(value)) { + const auto& bytes = *bytes_ptr; + return std::string_view(bytes->data(), bytes->size()); + } + auto binary_string = GetVariantValue(value); + return binary_string.GetStringView(); + } + static std::string VariantValueToString(const VariantType& value) { return std::visit( [](const auto& arg) -> std::string { diff --git a/src/paimon/common/data/data_define_test.cpp b/src/paimon/common/data/data_define_test.cpp index 15b697f4..8b59e850 100644 --- a/src/paimon/common/data/data_define_test.cpp +++ b/src/paimon/common/data/data_define_test.cpp @@ -140,4 +140,55 @@ TEST(DataDefineTest, VariantValueToStringReturnsStringForInternalArray) { ASSERT_EQ(DataDefine::VariantValueToString(array_variant), "array"); } +// Test case: GetStringView should handle all variant types and edge cases +TEST(DataDefineTest, GetStringView) { + auto pool = GetDefaultPool(); + + { + // from string_view + std::string original = "hello world"; + VariantType view_variant = std::string_view(original.data(), original.size()); + auto result = DataDefine::GetStringView(view_variant); + ASSERT_EQ(result, "hello world"); + ASSERT_EQ(result.data(), original.data()); + } + { + // from shared_ptr + std::shared_ptr bytes = Bytes::AllocateBytes("test bytes", pool.get()); + VariantType bytes_variant = bytes; + auto result = DataDefine::GetStringView(bytes_variant); + ASSERT_EQ(std::string(result), "test bytes"); + ASSERT_EQ(result.size(), 10); + } + { + // from BinaryString + auto binary_str = BinaryString::FromString("binary string content", pool.get()); + VariantType binary_variant = binary_str; + auto result = DataDefine::GetStringView(binary_variant); + ASSERT_EQ(std::string(result), "binary string content"); + } + { + // empty string_view + VariantType view_variant = std::string_view(); + auto result = DataDefine::GetStringView(view_variant); + ASSERT_TRUE(result.empty()); + ASSERT_EQ(result.size(), 0); + } + { + // empty Bytes + std::shared_ptr bytes = Bytes::AllocateBytes("", pool.get()); + VariantType bytes_variant = bytes; + auto result = DataDefine::GetStringView(bytes_variant); + ASSERT_TRUE(result.empty()); + ASSERT_EQ(result.size(), 0); + } + { + // empty BinaryString + VariantType binary_variant = BinaryString::EmptyUtf8(); + auto result = DataDefine::GetStringView(binary_variant); + ASSERT_TRUE(result.empty()); + ASSERT_EQ(result.size(), 0); + } +} + } // namespace paimon::test diff --git a/src/paimon/common/data/generic_row.h b/src/paimon/common/data/generic_row.h index b97779e2..b5d96972 100644 --- a/src/paimon/common/data/generic_row.h +++ b/src/paimon/common/data/generic_row.h @@ -148,18 +148,12 @@ class GenericRow : public InternalRow { BinaryString GetString(int32_t pos) const override { assert(static_cast(pos) < fields_.size()); - auto* value_ptr = DataDefine::GetVariantPtr(fields_[pos]); - if (value_ptr) { - auto bytes = std::make_shared(value_ptr->size(), GetDefaultPool().get()); - memcpy(bytes->data(), value_ptr->data(), value_ptr->size()); - return BinaryString::FromBytes(bytes); - } return DataDefine::GetVariantValue(fields_[pos]); } std::string_view GetStringView(int32_t pos) const override { assert(static_cast(pos) < fields_.size()); - return DataDefine::GetVariantValue(fields_[pos]); + return DataDefine::GetStringView(fields_[pos]); } Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override { diff --git a/src/paimon/common/data/serializer/binary_row_serializer.cpp b/src/paimon/common/data/serializer/binary_row_serializer.cpp index 064fdbbe..766bc318 100644 --- a/src/paimon/common/data/serializer/binary_row_serializer.cpp +++ b/src/paimon/common/data/serializer/binary_row_serializer.cpp @@ -37,39 +37,7 @@ Status BinaryRowSerializer::Serialize(const BinaryRow& record, Status BinaryRowSerializer::SerializeWithoutLength(const BinaryRow& record, MemorySegmentOutputStream* target) const { - if (record.GetSegments().size() == 1) { - target->Write(record.GetSegments()[0], record.GetOffset(), record.GetSizeInBytes()); - } else { - return SerializeWithoutLengthSlow(record, target); - } - return Status::OK(); -} - -Status BinaryRowSerializer::SerializeWithoutLengthSlow(const BinaryRow& record, - MemorySegmentOutputStream* target) const { - int32_t remain_size = record.GetSizeInBytes(); - int32_t pos_in_seg_of_record = record.GetOffset(); - int32_t segment_size = record.GetSegments()[0].Size(); - for (const auto& seg_of_record : record.GetSegments()) { - int32_t nwrite = std::min(segment_size - pos_in_seg_of_record, remain_size); - if (nwrite <= 0) { - assert(false); - return Status::Invalid( - fmt::format("nwrite '{}' is less than or equal to zero", nwrite)); - } - assert(nwrite > 0); - target->Write(seg_of_record, pos_in_seg_of_record, nwrite); - - // next new segment. - pos_in_seg_of_record = 0; - remain_size -= nwrite; - if (remain_size == 0) { - break; - } - } - if (remain_size != 0) { - return Status::Invalid(fmt::format("remain size '{}' is not equal to zero", remain_size)); - } + target->Write(record.GetSegment(), record.GetOffset(), record.GetSizeInBytes()); return Status::OK(); } diff --git a/src/paimon/common/data/serializer/binary_row_serializer.h b/src/paimon/common/data/serializer/binary_row_serializer.h index 8efe2cae..2f197667 100644 --- a/src/paimon/common/data/serializer/binary_row_serializer.h +++ b/src/paimon/common/data/serializer/binary_row_serializer.h @@ -58,8 +58,6 @@ class BinaryRowSerializer { private: Status SerializeWithoutLength(const BinaryRow& record, MemorySegmentOutputStream* target) const; - Status SerializeWithoutLengthSlow(const BinaryRow& record, - MemorySegmentOutputStream* target) const; int32_t num_fields_; int32_t fixed_length_part_size_; diff --git a/src/paimon/common/data/serializer/row_compacted_serializer.cpp b/src/paimon/common/data/serializer/row_compacted_serializer.cpp index 1138acc3..eb178843 100644 --- a/src/paimon/common/data/serializer/row_compacted_serializer.cpp +++ b/src/paimon/common/data/serializer/row_compacted_serializer.cpp @@ -52,8 +52,7 @@ Result RowCompactedSerializer::CreateSliceComparat for (int32_t i = 0; i < schema->num_fields(); i++) { auto field_type = schema->field(i)->type(); PAIMON_ASSIGN_OR_RAISE(readers[i], CreateFieldReader(field_type, pool)); - PAIMON_ASSIGN_OR_RAISE(comparators[i], - FieldsComparator::CompareVariant(i, field_type, /*use_view=*/false)); + PAIMON_ASSIGN_OR_RAISE(comparators[i], FieldsComparator::CompareVariant(i, field_type)); } auto comparator = [row_reader1, row_reader2, readers, comparators]( const std::shared_ptr& slice1, @@ -428,16 +427,16 @@ Status RowCompactedSerializer::RowWriter::WriteArray(const std::shared_ptr& type) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr binary_array, BinarySerializerUtils::WriteBinaryArray(value, type, pool_.get())); - return WriteSegments(binary_array->GetSegments(), binary_array->GetOffset(), - binary_array->GetSizeInBytes()); + return WriteSegment(binary_array->GetSegment(), binary_array->GetOffset(), + binary_array->GetSizeInBytes()); } Status RowCompactedSerializer::RowWriter::WriteMap(const std::shared_ptr& value, const std::shared_ptr& type) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr binary_map, BinarySerializerUtils::WriteBinaryMap(value, type, pool_.get())); - return WriteSegments(binary_map->GetSegments(), binary_map->GetOffset(), - binary_map->GetSizeInBytes()); + return WriteSegment(binary_map->GetSegment(), binary_map->GetOffset(), + binary_map->GetSizeInBytes()); } std::shared_ptr RowCompactedSerializer::RowWriter::CopyBuffer() const { @@ -452,11 +451,11 @@ Status RowCompactedSerializer::RowWriter::WriteUnsignedInt(int32_t value) { return Status::OK(); } -Status RowCompactedSerializer::RowWriter::WriteSegments(const std::vector& segments, - int32_t off, int32_t len) { +Status RowCompactedSerializer::RowWriter::WriteSegment(const MemorySegment& segment, int32_t off, + int32_t len) { PAIMON_RETURN_NOT_OK(WriteUnsignedInt(len)); EnsureCapacity(len); - MemorySegmentUtils::CopyToBytes(segments, off, buffer_.get(), position_, len); + MemorySegmentUtils::CopyToBytes({segment}, off, buffer_.get(), position_, len); position_ += len; return Status::OK(); } @@ -488,7 +487,7 @@ void RowCompactedSerializer::RowReader::PointTo(const std::shared_ptr& by void RowCompactedSerializer::RowReader::PointTo(const MemorySegment& segment, int32_t offset) { segment_ = segment; - segments_ = {segment}; + offset_ = offset; position_ = offset + header_size_in_bytes_; } @@ -500,7 +499,7 @@ Result RowCompactedSerializer::RowReader::ReadRowKind() const { Result RowCompactedSerializer::RowReader::ReadString() { PAIMON_ASSIGN_OR_RAISE(int32_t length, ReadUnsignedInt()); - BinaryString str = BinaryString::FromAddress(segments_, position_, length); + BinaryString str = BinaryString::FromAddress(segment_, position_, length); position_ += length; return str; } @@ -534,7 +533,7 @@ Result RowCompactedSerializer::RowReader::ReadTimestamp(int32_t preci Result> RowCompactedSerializer::RowReader::ReadArray() { auto value = std::make_shared(); PAIMON_ASSIGN_OR_RAISE(int32_t length, ReadUnsignedInt()); - value->PointTo(segments_, position_, length); + value->PointTo(segment_, position_, length); position_ += length; return value; } @@ -542,7 +541,7 @@ Result> RowCompactedSerializer::RowReader::ReadAr Result> RowCompactedSerializer::RowReader::ReadMap() { auto value = std::make_shared(); PAIMON_ASSIGN_OR_RAISE(int32_t length, ReadUnsignedInt()); - value->PointTo(segments_, position_, length); + value->PointTo(segment_, position_, length); position_ += length; return value; } diff --git a/src/paimon/common/data/serializer/row_compacted_serializer.h b/src/paimon/common/data/serializer/row_compacted_serializer.h index c9cb6a2d..8a4de50e 100644 --- a/src/paimon/common/data/serializer/row_compacted_serializer.h +++ b/src/paimon/common/data/serializer/row_compacted_serializer.h @@ -67,7 +67,7 @@ class RowCompactedSerializer { } Status WriteString(const BinaryString& value) { - return WriteSegments(value.GetSegments(), value.GetOffset(), value.GetSizeInBytes()); + return WriteSegment(value.GetSegment(), value.GetOffset(), value.GetSizeInBytes()); } Status WriteStringView(const std::string_view& view) { @@ -98,7 +98,7 @@ class RowCompactedSerializer { private: Status WriteUnsignedInt(int32_t value); - Status WriteSegments(const std::vector& segments, int32_t off, int32_t len); + Status WriteSegment(const MemorySegment& segment, int32_t off, int32_t len); void EnsureCapacity(int32_t size); void Grow(int32_t min_capacity_add); void SetBuffer(std::shared_ptr new_buffer); @@ -157,7 +157,6 @@ class RowCompactedSerializer { const int32_t header_size_in_bytes_; std::shared_ptr pool_; MemorySegment segment_; - std::vector segments_; int32_t offset_ = 0; int32_t position_ = 0; }; diff --git a/src/paimon/common/utils/projected_array.h b/src/paimon/common/utils/projected_array.h index dd9377a9..c4a70712 100644 --- a/src/paimon/common/utils/projected_array.h +++ b/src/paimon/common/utils/projected_array.h @@ -103,7 +103,6 @@ class ProjectedArray : public InternalArray { } std::string_view GetStringView(int32_t pos) const override { - assert(false); assert(static_cast(pos) < mapping_.size()); return array_->GetStringView(mapping_[pos]); } diff --git a/src/paimon/common/utils/serialization_utils.h b/src/paimon/common/utils/serialization_utils.h index 0bcd1550..df786306 100644 --- a/src/paimon/common/utils/serialization_utils.h +++ b/src/paimon/common/utils/serialization_utils.h @@ -54,7 +54,7 @@ class SerializationUtils { arity = EndianSwapValue(arity); } memcpy(bytes->data(), &arity, sizeof(int32_t)); - MemorySegmentUtils::CopyToBytes(row.GetSegments(), row.GetOffset(), bytes.get(), + MemorySegmentUtils::CopyToBytes({row.GetSegment()}, row.GetOffset(), bytes.get(), sizeof(int32_t), size_in_row); return bytes; } @@ -66,7 +66,7 @@ class SerializationUtils { } out->WriteValue(4 + row.GetSizeInBytes()); out->WriteValue(row.GetFieldCount()); - return MemorySegmentUtils::CopyToStream(row.GetSegments(), row.GetOffset(), + return MemorySegmentUtils::CopyToStream({row.GetSegment()}, row.GetOffset(), row.GetSizeInBytes(), out); } diff --git a/src/paimon/core/io/meta_to_arrow_array_converter.cpp b/src/paimon/core/io/meta_to_arrow_array_converter.cpp index a06298b7..642653c3 100644 --- a/src/paimon/core/io/meta_to_arrow_array_converter.cpp +++ b/src/paimon/core/io/meta_to_arrow_array_converter.cpp @@ -38,7 +38,7 @@ Result> MetaToArrowArrayConverter::Cr for (int32_t i = 0; i < struct_type->num_fields(); i++) { PAIMON_ASSIGN_OR_RAISE( RowToArrowArrayConverter::AppendValueFunc func, - AppendField(/*use_view=*/false, struct_builder->field_builder(i), &reserve_count)); + AppendField(/*use_view=*/true, struct_builder->field_builder(i), &reserve_count)); appenders.emplace_back(func); } return std::unique_ptr(new MetaToArrowArrayConverter( diff --git a/src/paimon/core/mergetree/compact/interval_partition_test.cpp b/src/paimon/core/mergetree/compact/interval_partition_test.cpp index d588db53..97b2c9fc 100644 --- a/src/paimon/core/mergetree/compact/interval_partition_test.cpp +++ b/src/paimon/core/mergetree/compact/interval_partition_test.cpp @@ -45,7 +45,7 @@ class IntervalPartitionTest : public testing::Test { void SetUp() override { ASSERT_OK_AND_ASSIGN(comparator_, FieldsComparator::Create( {DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/false)); + /*is_ascending_order=*/true, /*use_view=*/true)); pool_ = GetDefaultPool(); } diff --git a/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp b/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp index f9d69326..57ae96da 100644 --- a/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp +++ b/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp @@ -119,7 +119,7 @@ class LookupMergeTreeCompactRewriterTest : public testing::Test { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); return key_comparator; } @@ -1147,4 +1147,77 @@ TEST_F(LookupMergeTreeCompactRewriterTest, TestRewriteLookupChangelogWithOutputL CheckResult(compact_file_name, table_schema, "orc", expected_array); } +TEST_F(LookupMergeTreeCompactRewriterTest, TestRewriteWithDvAndAggForStringFields) { + std::map options = { + {Options::MERGE_ENGINE, "aggregation"}, + {Options::FIELDS_DEFAULT_AGG_FUNC, "max"}, + {Options::FILE_FORMAT, "orc"}, + {Options::DELETION_VECTORS_ENABLED, "true"}, + }; + arrow::FieldVector fields = { + arrow::field("key", arrow::utf8()), + arrow::field("value", arrow::utf8()), + }; + arrow_schema_ = arrow::schema(fields); + key_schema_ = arrow::schema({fields[0]}); + + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap(options)); + ASSERT_OK_AND_ASSIGN(auto table_path, CreateTable(options)); + auto schema_manager = std::make_shared(fs_, table_path); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager->ReadSchema(0)); + + // write 3 files + ASSERT_OK_AND_ASSIGN( + auto file0, NewFiles(/*level=*/5, /*last_sequence_number=*/-1, table_path, core_options, + R"([["1", "11"], ["3", "33"], ["5", "55"]])")); + ASSERT_OK_AND_ASSIGN(auto file1, + NewFiles(/*level=*/0, /*last_sequence_number=*/2, table_path, core_options, + R"([["2", "22"], ["4", "44"], ["5", "5"]])")); + ASSERT_OK_AND_ASSIGN(auto file2, NewFiles(/*level=*/0, /*last_sequence_number=*/5, table_path, + core_options, R"([["2", "222"], ["5", "15"]])")); + std::vector> files = {file0, file1, file2}; + auto processor_factory = std::make_shared(arrow_schema_); + ASSERT_OK_AND_ASSIGN( + auto lookup_levels, + CreateLookupLevels(table_path, table_schema, processor_factory, files)); + + // compact and rewrite + ASSERT_OK_AND_ASSIGN(auto rewriter, + CreateCompactRewriterForPositionedKeyValue( + table_path, table_schema, core_options, std::move(lookup_levels))); + ASSERT_OK_AND_ASSIGN(auto runs, GenerateSortedRuns({file1, file2})); + ASSERT_OK_AND_ASSIGN(auto compact_result, rewriter->Rewrite( + /*output_level=*/4, /*drop_delete=*/true, runs)); + ASSERT_EQ(2, compact_result.Before().size()); + ASSERT_EQ(1, compact_result.After().size()); + + const auto& compact_file_meta = compact_result.After()[0]; + // check compact file exist + std::string compact_file_name = table_path + "/bucket-0/" + compact_file_meta->file_name; + ASSERT_OK_AND_ASSIGN(bool exist, fs_->Exists(compact_file_name)); + ASSERT_TRUE(exist); + + // check file content + auto type_with_special_fields = + arrow::struct_(SpecialFields::CompleteSequenceAndValueKindField(arrow_schema_)->fields()); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(type_with_special_fields, {R"([ +[6, 0, "2", "222"], +[4, 0, "4", "44"], +[7, 0, "5", "55"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(compact_file_name, table_schema, "orc", expected_array); + + // test dv + auto dv_maintainer = rewriter->dv_maintainer_; + ASSERT_TRUE(dv_maintainer); + auto dv = dv_maintainer->DeletionVectorOf(file0->file_name); + ASSERT_TRUE(dv); + ASSERT_FALSE(dv.value()->IsDeleted(0).value()); + ASSERT_TRUE(dv.value()->IsDeleted(2).value()); +} + } // namespace paimon::test diff --git a/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp b/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp index 1524d99e..9d955d11 100644 --- a/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp +++ b/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp @@ -71,7 +71,7 @@ class MergeTreeCompactRewriterTest : public testing::Test { table_schema->GetFields(table_schema->TrimmedPrimaryKeys().value())); PAIMON_ASSIGN_OR_RAISE( std::shared_ptr key_comparator, - FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/false)); + FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/true)); IntervalPartition interval_partition(metas, key_comparator); return interval_partition.Partition(); } @@ -303,3 +303,4 @@ TEST_F(MergeTreeCompactRewriterTest, TestIOException) { // test branch // test with parititon // test key appears before value +// test compact multiple times, reuse rewriter diff --git a/src/paimon/core/mergetree/levels_test.cpp b/src/paimon/core/mergetree/levels_test.cpp index 4db17560..b3debe04 100644 --- a/src/paimon/core/mergetree/levels_test.cpp +++ b/src/paimon/core/mergetree/levels_test.cpp @@ -51,7 +51,7 @@ class LevelsTest : public testing::Test { data_fields.emplace_back(/*id=*/0, arrow::field("f0", arrow::int32())); EXPECT_OK_AND_ASSIGN(auto cmp, FieldsComparator::Create(data_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); return cmp; } diff --git a/src/paimon/core/mergetree/lookup/persist_processor_test.cpp b/src/paimon/core/mergetree/lookup/persist_processor_test.cpp index fa45e064..aa204ba0 100644 --- a/src/paimon/core/mergetree/lookup/persist_processor_test.cpp +++ b/src/paimon/core/mergetree/lookup/persist_processor_test.cpp @@ -16,39 +16,17 @@ #include "paimon/core/mergetree/lookup/persist_processor.h" -#include "arrow/ipc/json_simple.h" #include "gtest/gtest.h" -#include "paimon/common/data/columnar/columnar_row.h" #include "paimon/core/mergetree/lookup/default_lookup_serializer_factory.h" #include "paimon/core/mergetree/lookup/persist_empty_processor.h" #include "paimon/core/mergetree/lookup/persist_position_processor.h" #include "paimon/core/mergetree/lookup/persist_value_and_pos_processor.h" #include "paimon/core/mergetree/lookup/persist_value_processor.h" +#include "paimon/testing/utils/binary_row_generator.h" #include "paimon/testing/utils/testharness.h" namespace paimon::test { class PersistProcessorTest : public testing::Test { public: - void SetUp() override { - auto key_type = arrow::struct_({arrow::field("f1", arrow::int32())}); - auto key_array = std::dynamic_pointer_cast( - arrow::ipc::internal::json::ArrayFromJSON(key_type, R"([[10]])").ValueOrDie()); - - auto value_type = arrow::struct_( - {arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), - arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}); - auto value_array = std::dynamic_pointer_cast( - arrow::ipc::internal::json::ArrayFromJSON(value_type, R"([["Alice", 10, null, 10.1]])") - .ValueOrDie()); - - auto key_row = std::make_shared( - /*struct_array=*/key_array, key_array->fields(), pool_, /*row_id=*/0); - auto value_row = std::make_unique( - /*struct_array=*/value_array, value_array->fields(), pool_, /*row_id=*/0); - - kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, std::move(key_row), - std::move(value_row)); - } - void CheckResult(const KeyValue& kv) { ASSERT_EQ(kv_.key, kv.key); @@ -66,7 +44,11 @@ class PersistProcessorTest : public testing::Test { private: std::shared_ptr pool_ = GetDefaultPool(); - KeyValue kv_; + KeyValue kv_ = KeyValue(RowKind::Insert(), /*sequence_number=*/500, /*level=*/4, /*key=*/ + BinaryRowGenerator::GenerateRowPtr({10}, pool_.get()), + /*value=*/ + BinaryRowGenerator::GenerateRowPtr( + {std::string("Alice"), 10, NullType(), 10.1}, pool_.get())); std::shared_ptr file_schema_ = arrow::schema({arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}); diff --git a/src/paimon/core/mergetree/lookup_levels.cpp b/src/paimon/core/mergetree/lookup_levels.cpp index 91cb6cd8..0e82b21d 100644 --- a/src/paimon/core/mergetree/lookup_levels.cpp +++ b/src/paimon/core/mergetree/lookup_levels.cpp @@ -45,7 +45,7 @@ Result>> LookupLevels::Create( RowCompactedSerializer::Create(pk_schema, pool)); PAIMON_ASSIGN_OR_RAISE( std::unique_ptr key_comparator, - FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/false)); + FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/true)); PAIMON_ASSIGN_OR_RAISE(std::vector partition_fields, table_schema->GetFields(table_schema->PartitionKeys())); diff --git a/src/paimon/core/mergetree/lookup_levels_test.cpp b/src/paimon/core/mergetree/lookup_levels_test.cpp index fd895530..98bdcfae 100644 --- a/src/paimon/core/mergetree/lookup_levels_test.cpp +++ b/src/paimon/core/mergetree/lookup_levels_test.cpp @@ -103,7 +103,7 @@ class LookupLevelsTest : public testing::Test { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); return key_comparator; } diff --git a/src/paimon/core/mergetree/merge_tree_writer_test.cpp b/src/paimon/core/mergetree/merge_tree_writer_test.cpp index a5a23580..274687cb 100644 --- a/src/paimon/core/mergetree/merge_tree_writer_test.cpp +++ b/src/paimon/core/mergetree/merge_tree_writer_test.cpp @@ -294,7 +294,7 @@ TEST_F(MergeTreeWriterTest, TestWriteWithDeleteRow) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_defined_seq_comparator, FieldsComparator::Create({value_fields_[1]}, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); assert(user_defined_seq_comparator); auto merge_writer = std::make_shared( /*last_sequence_number=*/9, primary_keys_, path_factory, key_comparator_, diff --git a/src/paimon/core/mergetree/sorted_run_test.cpp b/src/paimon/core/mergetree/sorted_run_test.cpp index 33b4a705..a014eb7e 100644 --- a/src/paimon/core/mergetree/sorted_run_test.cpp +++ b/src/paimon/core/mergetree/sorted_run_test.cpp @@ -59,7 +59,7 @@ TEST_F(SortedRunTest, TestSortedRunIsValid) { ASSERT_OK_AND_ASSIGN( std::shared_ptr comparator, FieldsComparator::Create({DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/false)); + /*is_ascending_order=*/true, /*use_view=*/true)); // m1 [10, 20] auto m1 = CreateDataFileMeta(10, 20); @@ -91,7 +91,7 @@ TEST_F(SortedRunTest, TestFromUnsorted) { ASSERT_OK_AND_ASSIGN( std::shared_ptr comparator, FieldsComparator::Create({DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/false)); + /*is_ascending_order=*/true, /*use_view=*/true)); // m1 [10, 20] auto m1 = CreateDataFileMeta(10, 20); diff --git a/src/paimon/core/operation/merge_file_split_read.cpp b/src/paimon/core/operation/merge_file_split_read.cpp index 1214881b..c3993eac 100644 --- a/src/paimon/core/operation/merge_file_split_read.cpp +++ b/src/paimon/core/operation/merge_file_split_read.cpp @@ -89,14 +89,12 @@ Result> MergeFileSplitRead::Create( std::shared_ptr read_schema; // comparator of member key in KeyValue object std::shared_ptr key_comparator; - // comparator of sorted-run in interval partition - std::shared_ptr interval_partition_comparator; // comparator of user-defined sequence fields in member value of KeyValue object std::shared_ptr user_defined_seq_comparator; PAIMON_RETURN_NOT_OK(GenerateKeyValueReadSchema( *table_schema, core_options, context->GetReadSchema(), &value_schema, &read_schema, - &key_comparator, &interval_partition_comparator, &user_defined_seq_comparator)); + &key_comparator, &user_defined_seq_comparator)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr predicate_for_keys, GenerateKeyPredicates(context->GetPredicate(), *table_schema)); @@ -114,8 +112,7 @@ Result> MergeFileSplitRead::Create( std::make_unique(core_options.GetFileSystem(), context->GetPath(), context->GetCoreOptions().GetBranch()), key_schema, value_schema, read_schema, projection, key_comparator, - interval_partition_comparator, user_defined_seq_comparator, predicate_for_keys, memory_pool, - executor)); + user_defined_seq_comparator, predicate_for_keys, memory_pool, executor)); } Result> MergeFileSplitRead::CreateReader( @@ -223,7 +220,7 @@ Result> MergeFileSplitRead::CreateMergeReader( CreateDeletionFileMap(*data_split), pool_); std::vector> sections = - IntervalPartition(data_split->DataFiles(), interval_partition_comparator_).Partition(); + IntervalPartition(data_split->DataFiles(), key_comparator_).Partition(); std::vector> batch_readers; batch_readers.reserve(sections.size()); // no overlap through multiple sections @@ -270,7 +267,6 @@ MergeFileSplitRead::MergeFileSplitRead( const std::shared_ptr& value_schema, const std::shared_ptr& read_schema, const std::vector& projection, const std::shared_ptr& key_comparator, - const std::shared_ptr& interval_partition_comparator, const std::shared_ptr& user_defined_seq_comparator, const std::shared_ptr& predicate_for_keys, const std::shared_ptr& memory_pool, const std::shared_ptr& executor) @@ -280,7 +276,6 @@ MergeFileSplitRead::MergeFileSplitRead( read_schema_(read_schema), projection_(projection), key_comparator_(key_comparator), - interval_partition_comparator_(interval_partition_comparator), user_defined_seq_comparator_(user_defined_seq_comparator), predicate_for_keys_(predicate_for_keys) {} @@ -289,7 +284,6 @@ Status MergeFileSplitRead::GenerateKeyValueReadSchema( const std::shared_ptr& raw_read_schema, std::shared_ptr* value_schema, std::shared_ptr* read_schema, std::shared_ptr* key_comparator, - std::shared_ptr* interval_partition_comparator, std::shared_ptr* sequence_fields_comparator) { // 1. add user raw read schema to need_fields PAIMON_ASSIGN_OR_RAISE(std::vector need_fields, @@ -328,11 +322,6 @@ Status MergeFileSplitRead::GenerateKeyValueReadSchema( PAIMON_ASSIGN_OR_RAISE( *key_comparator, FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, /*use_view=*/true)); - // comparator only used in interval partition - PAIMON_ASSIGN_OR_RAISE( - *interval_partition_comparator, - FieldsComparator::Create(key_fields, - /*is_ascending_order=*/true, /*use_view=*/false)); // 7. construct actual read fields: special + key + non-key value std::vector read_fields; std::vector special_fields( diff --git a/src/paimon/core/operation/merge_file_split_read.h b/src/paimon/core/operation/merge_file_split_read.h index e82ac234..06cc5176 100644 --- a/src/paimon/core/operation/merge_file_split_read.h +++ b/src/paimon/core/operation/merge_file_split_read.h @@ -140,7 +140,6 @@ class MergeFileSplitRead : public AbstractSplitRead { const std::shared_ptr& read_schema, const std::vector& projection, const std::shared_ptr& key_comparator, - const std::shared_ptr& interval_partition_comparator, const std::shared_ptr& user_defined_seq_comparator, const std::shared_ptr& predicate_for_keys, const std::shared_ptr& memory_pool, @@ -155,7 +154,6 @@ class MergeFileSplitRead : public AbstractSplitRead { const std::shared_ptr& raw_read_schema, std::shared_ptr* value_schema, std::shared_ptr* read_schema, std::shared_ptr* key_comparator, - std::shared_ptr* interval_partition_comparator, std::shared_ptr* sequence_fields_comparator); static Status SplitKeyAndNonKeyField(const std::vector& trimmed_key_fields, @@ -180,7 +178,6 @@ class MergeFileSplitRead : public AbstractSplitRead { // merge_function_wrapper is lazy created, use through GetMergeFunctionWrapper() std::shared_ptr> merge_function_wrapper_; std::shared_ptr key_comparator_; - std::shared_ptr interval_partition_comparator_; std::shared_ptr user_defined_seq_comparator_; std::shared_ptr predicate_for_keys_; }; diff --git a/src/paimon/core/operation/merge_file_split_read_test.cpp b/src/paimon/core/operation/merge_file_split_read_test.cpp index 62579bc1..575eabfd 100644 --- a/src/paimon/core/operation/merge_file_split_read_test.cpp +++ b/src/paimon/core/operation/merge_file_split_read_test.cpp @@ -387,11 +387,10 @@ TEST_F(MergeFileSplitReadTest, TestGenerateKeyValueReadSchema) { std::shared_ptr value_schema; std::shared_ptr read_schema; std::shared_ptr key_comparator; - std::shared_ptr interval_partition_comparator; std::shared_ptr sequence_fields_comparator; ASSERT_OK(MergeFileSplitRead::GenerateKeyValueReadSchema( *table_schema, options, raw_read_schema, &value_schema, &read_schema, &key_comparator, - &interval_partition_comparator, &sequence_fields_comparator)); + &sequence_fields_comparator)); // check result std::vector expected_value_fields = { @@ -431,8 +430,6 @@ TEST_F(MergeFileSplitReadTest, TestGenerateKeyValueReadSchema) { std::vector expected_sort_key_fields = {0, 1}; ASSERT_EQ(key_comparator->sort_fields_, expected_sort_key_fields); ASSERT_EQ(key_comparator->is_ascending_order_, true); - ASSERT_EQ(interval_partition_comparator->sort_fields_, expected_sort_key_fields); - ASSERT_EQ(interval_partition_comparator->is_ascending_order_, true); std::vector expected_sort_seq_fields = {5, 2}; ASSERT_EQ(sequence_fields_comparator->sort_fields_, expected_sort_seq_fields); @@ -461,11 +458,10 @@ TEST_F(MergeFileSplitReadTest, TestGenerateKeyValueReadSchema1) { std::shared_ptr value_schema; std::shared_ptr read_schema; std::shared_ptr key_comparator; - std::shared_ptr interval_partition_comparator; std::shared_ptr sequence_fields_comparator; ASSERT_OK(MergeFileSplitRead::GenerateKeyValueReadSchema( *table_schema, options, raw_read_schema, &value_schema, &read_schema, &key_comparator, - &interval_partition_comparator, &sequence_fields_comparator)); + &sequence_fields_comparator)); // check result std::vector expected_value_fields = { @@ -493,8 +489,6 @@ TEST_F(MergeFileSplitReadTest, TestGenerateKeyValueReadSchema1) { std::vector expected_sort_key_fields = {0, 1}; ASSERT_EQ(key_comparator->sort_fields_, expected_sort_key_fields); ASSERT_EQ(key_comparator->is_ascending_order_, true); - ASSERT_EQ(interval_partition_comparator->sort_fields_, expected_sort_key_fields); - ASSERT_EQ(interval_partition_comparator->is_ascending_order_, true); ASSERT_FALSE(sequence_fields_comparator); } diff --git a/src/paimon/core/stats/simple_stats_evolution_test.cpp b/src/paimon/core/stats/simple_stats_evolution_test.cpp index a4607ae8..7139081a 100644 --- a/src/paimon/core/stats/simple_stats_evolution_test.cpp +++ b/src/paimon/core/stats/simple_stats_evolution_test.cpp @@ -178,7 +178,7 @@ class SimpleStatsEvolutionTest : public ::testing::Test { [](const InternalRow& row, const std::shared_ptr& schema) -> std::vector { EXPECT_OK_AND_ASSIGN(auto getters, - InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/true)); std::vector ret; for (const auto& getter : getters) { ret.push_back(getter(row)); diff --git a/src/paimon/core/table/source/split_generator_test.cpp b/src/paimon/core/table/source/split_generator_test.cpp index 46670821..ed9faec7 100644 --- a/src/paimon/core/table/source/split_generator_test.cpp +++ b/src/paimon/core/table/source/split_generator_test.cpp @@ -52,7 +52,7 @@ class SplitGeneratorTest : public testing::Test { pool_ = GetDefaultPool(); key_comparator_ = FieldsComparator::Create({DataField(0, arrow::field("f0", arrow::int32(), false))}, - /*is_ascending_order=*/true, /*use_view=*/false) + /*is_ascending_order=*/true, /*use_view=*/true) .value(); } void TearDown() override {} diff --git a/src/paimon/core/table/source/table_scan.cpp b/src/paimon/core/table/source/table_scan.cpp index ba1b5d8c..d6982aec 100644 --- a/src/paimon/core/table/source/table_scan.cpp +++ b/src/paimon/core/table/source/table_scan.cpp @@ -128,7 +128,7 @@ class TableScanImpl { PAIMON_ASSIGN_OR_RAISE( std::shared_ptr key_comparator, FieldsComparator::Create(trimmed_pk_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); return std::make_unique( source_split_target_size, source_split_open_file_cost, core_options.DeletionVectorsEnabled(), core_options.GetMergeEngine(), diff --git a/src/paimon/core/utils/fields_comparator.cpp b/src/paimon/core/utils/fields_comparator.cpp index cb74ffc9..841c5769 100644 --- a/src/paimon/core/utils/fields_comparator.cpp +++ b/src/paimon/core/utils/fields_comparator.cpp @@ -215,7 +215,7 @@ Result FieldsComparator::CompareField( } Result FieldsComparator::CompareVariant( - int32_t field_idx, const std::shared_ptr& input_type, bool use_view) { + int32_t field_idx, const std::shared_ptr& input_type) { arrow::Type::type type = input_type->id(); switch (type) { case arrow::Type::type::BOOL: @@ -275,44 +275,15 @@ Result FieldsComparator::CompareVariant auto rvalue = DataDefine::GetVariantValue(rhs); return CompareFloatingPoint(lvalue, rvalue); }); + case arrow::Type::type::BINARY: case arrow::Type::type::STRING: { - if (use_view) { - return FieldsComparator::VariantComparatorFunc( - [](const VariantType& lhs, const VariantType& rhs) -> int32_t { - auto lvalue = DataDefine::GetVariantValue(lhs); - auto rvalue = DataDefine::GetVariantValue(rhs); - int32_t cmp = lvalue.compare(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } else { - return FieldsComparator::VariantComparatorFunc( - [](const VariantType& lhs, const VariantType& rhs) -> int32_t { - auto lvalue = DataDefine::GetVariantValue(lhs); - auto rvalue = DataDefine::GetVariantValue(rhs); - int32_t cmp = lvalue.CompareTo(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } - } - case arrow::Type::type::BINARY: { - // TODO(xinyu.lxy): may use 64byte compare - if (use_view) { - return FieldsComparator::VariantComparatorFunc( - [](const VariantType& lhs, const VariantType& rhs) -> int32_t { - auto lvalue = DataDefine::GetVariantValue(lhs); - auto rvalue = DataDefine::GetVariantValue(rhs); - int32_t cmp = lvalue.compare(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } else { - return FieldsComparator::VariantComparatorFunc( - [](const VariantType& lhs, const VariantType& rhs) -> int32_t { - auto lvalue = DataDefine::GetVariantValue>(lhs); - auto rvalue = DataDefine::GetVariantValue>(rhs); - int32_t cmp = lvalue->compare(*rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } + return FieldsComparator::VariantComparatorFunc( + [](const VariantType& lhs, const VariantType& rhs) -> int32_t { + std::string_view lvalue = DataDefine::GetStringView(lhs); + std::string_view rvalue = DataDefine::GetStringView(rhs); + int32_t cmp = lvalue.compare(rvalue); + return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); + }); } case arrow::Type::type::TIMESTAMP: { return FieldsComparator::VariantComparatorFunc( diff --git a/src/paimon/core/utils/fields_comparator.h b/src/paimon/core/utils/fields_comparator.h index 95c4d706..2fd2184a 100644 --- a/src/paimon/core/utils/fields_comparator.h +++ b/src/paimon/core/utils/fields_comparator.h @@ -55,7 +55,7 @@ class FieldsComparator { std::function; static Result CompareVariant( - int32_t field_idx, const std::shared_ptr& input_type, bool use_view); + int32_t field_idx, const std::shared_ptr& input_type); /// Java-compatible ordering for floating-point types: /// -infinity < -0.0 < +0.0 < +infinity < NaN == NaN diff --git a/src/paimon/core/utils/fields_comparator_test.cpp b/src/paimon/core/utils/fields_comparator_test.cpp index 88a699fd..ac264f35 100644 --- a/src/paimon/core/utils/fields_comparator_test.cpp +++ b/src/paimon/core/utils/fields_comparator_test.cpp @@ -52,7 +52,7 @@ class FieldsComparatorTest : public ::testing::Test { } ASSERT_OK_AND_ASSIGN(auto comp1, FieldsComparator::Create(data_fields, sort_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); ASSERT_EQ(-1, comp1->CompareTo(row1, row2)); ASSERT_EQ(1, comp1->CompareTo(row2, row1)); ASSERT_EQ(0, comp1->CompareTo(row1, row1)); @@ -61,7 +61,7 @@ class FieldsComparatorTest : public ::testing::Test { if (!has_null) { ASSERT_OK_AND_ASSIGN(auto comp2, FieldsComparator::Create(data_fields, sort_fields, /*is_ascending_order=*/false, - /*use_view=*/false)); + /*use_view=*/true)); ASSERT_EQ(1, comp2->CompareTo(row1, row2)); ASSERT_EQ(-1, comp2->CompareTo(row2, row1)); ASSERT_EQ(0, comp2->CompareTo(row1, row1)); @@ -323,7 +323,7 @@ TEST_F(FieldsComparatorTest, TestInvalidType) { ASSERT_NOK_WITH_MSG(FieldsComparator::Create({DataField(0, arrow::field("f0", arrow::int32())), DataField(1, arrow::field("f1", map_type))}, /*is_ascending_order=*/true, - /*use_view=*/false), + /*use_view=*/true), "Do not support comparing map type in idx 1"); } diff --git a/src/paimon/core/utils/file_store_path_factory.cpp b/src/paimon/core/utils/file_store_path_factory.cpp index b807c2b8..c2ff7279 100644 --- a/src/paimon/core/utils/file_store_path_factory.cpp +++ b/src/paimon/core/utils/file_store_path_factory.cpp @@ -216,7 +216,7 @@ Result FileStorePathFactory::BucketPath(const BinaryRow& partition, } Result FileStorePathFactory::GetPartitionString(const BinaryRow& partition) const { - if (partition.GetSegments().size() == 0) { + if (partition.GetSizeInBytes() == 0) { return Status::Invalid("invalid binary row partition"); } auto iter = row_to_str_cache_.find(partition); diff --git a/src/paimon/testing/utils/key_value_checker.h b/src/paimon/testing/utils/key_value_checker.h index bc0e1adc..5789a09d 100644 --- a/src/paimon/testing/utils/key_value_checker.h +++ b/src/paimon/testing/utils/key_value_checker.h @@ -66,10 +66,10 @@ class KeyValueChecker : public testing::Test { ASSERT_EQ(expected_vec.size(), result_vec.size()); ASSERT_OK_AND_ASSIGN( auto key_comparator, - FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, /*use_view=*/false)); + FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, /*use_view=*/true)); ASSERT_OK_AND_ASSIGN(auto value_comparator, FieldsComparator::Create(value_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + /*use_view=*/true)); for (size_t i = 0; i < expected_vec.size(); i++) { const auto& expected = expected_vec[i]; const auto& result = result_vec[i]; From 76656e8f1538d6bb17dd63312c87fa2f1ec61665 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Wed, 25 Mar 2026 20:34:40 +0800 Subject: [PATCH 04/11] fix --- src/paimon/common/data/binary_data_read_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/paimon/common/data/binary_data_read_utils.h b/src/paimon/common/data/binary_data_read_utils.h index cda01fac..070349df 100644 --- a/src/paimon/common/data/binary_data_read_utils.h +++ b/src/paimon/common/data/binary_data_read_utils.h @@ -36,11 +36,10 @@ class BinaryDataReadUtils { ~BinaryDataReadUtils() = delete; /// Gets an instance of `Timestamp` from underlying `MemorySegment`. - /// @param segments the underlying `MemorySegment`s + /// @param segment the underlying `MemorySegment`s /// @param base_offset the base offset of current instance of TimestampData /// @param offset_and_nanos the offset of milli-seconds part and nanoseconds /// @return an instance of `Timestamp` - static Timestamp ReadTimestampData(const MemorySegment& segment, int32_t base_offset, int64_t offset_and_nanos) { auto nano_of_millisecond = static_cast(offset_and_nanos & LOW_BYTES_MASK); From 4f4c2dca4770e5954c33bc133cf3ed5bcf53cbed Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Wed, 25 Mar 2026 20:37:26 +0800 Subject: [PATCH 05/11] fix --- src/paimon/common/file_index/rangebitmap/range_bitmap.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp index edb4a47b..8c1a7b25 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -261,11 +261,11 @@ Result> RangeBitmap::Appender::Serialize() const { return Status::Invalid(fmt::format( "Chunk size cannot be larger than 2GB, current bytes: {}", chunk_size_bytes_limit_)); } - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr dictionary_apender, + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr dictionary_appender, ChunkedDictionary::Appender::Create( factory_, static_cast(chunk_size_bytes_limit_), pool_)); for (const auto& [key, bitmap] : bitmaps_) { - PAIMON_RETURN_NOT_OK(dictionary_apender->AppendSorted(key, code)); + PAIMON_RETURN_NOT_OK(dictionary_appender->AppendSorted(key, code)); for (auto it = bitmap.Begin(); it != bitmap.End(); ++it) { PAIMON_RETURN_NOT_OK(bsi_appender->Append(*it, code)); } @@ -289,7 +289,7 @@ Result> RangeBitmap::Appender::Serialize() const { header_size += max.IsNull() ? 0 : max_size; // max literal size header_size += sizeof(int32_t); // dictionary length PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR dictionary_bytes, - dictionary_apender->Serialize()); + dictionary_appender->Serialize()); auto dictionary_length = static_cast(dictionary_bytes->size()); PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bsi_bytes, bsi_appender->Serialize()); size_t bsi_length = bsi_bytes->size(); From bb9aeaf80f99c79e04f34f5e258d65d241c8a068 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 09:44:09 +0800 Subject: [PATCH 06/11] fix test --- src/paimon/common/data/binary_array_test.cpp | 2 +- src/paimon/common/data/binary_map.h | 4 +- src/paimon/common/data/binary_map_test.cpp | 2 +- src/paimon/common/data/binary_row_test.cpp | 14 +- .../common/data/binary_row_writer_test.cpp | 2 +- src/paimon/common/data/binary_string_test.cpp | 253 ++++++++---------- 6 files changed, 126 insertions(+), 151 deletions(-) diff --git a/src/paimon/common/data/binary_array_test.cpp b/src/paimon/common/data/binary_array_test.cpp index 11697495..04bcecb7 100644 --- a/src/paimon/common/data/binary_array_test.cpp +++ b/src/paimon/common/data/binary_array_test.cpp @@ -263,7 +263,7 @@ TEST(BinaryArrayTest, TestSetAndGet) { { auto key = BinaryArray::FromIntArray({1, 2, 3, 5}, pool.get()); auto value = BinaryArray::FromLongArray({100ll, 200ll, 300ll, 500ll}, pool.get()); - ASSERT_OK_AND_ASSIGN(auto map, BinaryMap::ValueOf(key, value, pool.get())); + auto map = BinaryMap::ValueOf(key, value, pool.get()); BinaryArray array; BinaryArrayWriter writer = BinaryArrayWriter(&array, /*num_elements=*/1, /*element_size=*/8, pool.get()); diff --git a/src/paimon/common/data/binary_map.h b/src/paimon/common/data/binary_map.h index 0200f621..460b62ff 100644 --- a/src/paimon/common/data/binary_map.h +++ b/src/paimon/common/data/binary_map.h @@ -60,8 +60,8 @@ class BinaryMap : public BinarySection, public InternalMap { size_in_bytes_ = size_in_bytes; } - static Result> ValueOf(const BinaryArray& key, - const BinaryArray& value, MemoryPool* pool) { + static std::shared_ptr ValueOf(const BinaryArray& key, const BinaryArray& value, + MemoryPool* pool) { auto bytes = std::make_shared( kHeaderSize + key.GetSizeInBytes() + value.GetSizeInBytes(), pool); MemorySegment segment = MemorySegment::Wrap(bytes); diff --git a/src/paimon/common/data/binary_map_test.cpp b/src/paimon/common/data/binary_map_test.cpp index 4d4a2387..36097719 100644 --- a/src/paimon/common/data/binary_map_test.cpp +++ b/src/paimon/common/data/binary_map_test.cpp @@ -25,7 +25,7 @@ TEST(BinaryMapTest, TestSimple) { auto key = BinaryArray::FromIntArray({1, 2, 3, 5}, pool.get()); auto value = BinaryArray::FromLongArray({100ll, 200ll, 300ll, 500ll}, pool.get()); - ASSERT_OK_AND_ASSIGN(auto binary_map, BinaryMap::ValueOf(key, value, pool.get())); + auto binary_map = BinaryMap::ValueOf(key, value, pool.get()); ASSERT_EQ(binary_map->Size(), 4); auto key_in_map = std::dynamic_pointer_cast(binary_map->KeyArray()); diff --git a/src/paimon/common/data/binary_row_test.cpp b/src/paimon/common/data/binary_row_test.cpp index 9cbd7bb5..e5e9de52 100644 --- a/src/paimon/common/data/binary_row_test.cpp +++ b/src/paimon/common/data/binary_row_test.cpp @@ -142,15 +142,15 @@ TEST_F(BinaryRowTest, TestWriter) { AssertTestWriterRow(row); // test copy - AssertTestWriterRow(row.Copy(pool.get())); + auto copied1 = row.Copy(pool.get()); + AssertTestWriterRow(copied1); + ASSERT_EQ(row, copied1); // test copy - BinaryRow copied(arity); - row.Copy(&copied, pool.get()); - AssertTestWriterRow(copied); - - ASSERT_EQ(copied, row); - AssertTestWriterRow(row); + BinaryRow copied2(arity); + row.Copy(&copied2, pool.get()); + AssertTestWriterRow(copied2); + ASSERT_EQ(row, copied2); } TEST_F(BinaryRowTest, TestWriteString) { diff --git a/src/paimon/common/data/binary_row_writer_test.cpp b/src/paimon/common/data/binary_row_writer_test.cpp index c7836659..2f78e4f2 100644 --- a/src/paimon/common/data/binary_row_writer_test.cpp +++ b/src/paimon/common/data/binary_row_writer_test.cpp @@ -279,7 +279,7 @@ TEST(BinaryRowWriterTest, TestWriteNested) { BinaryArray inner_array = BinaryArray::FromIntArray({10, 20, 30}, pool.get()); auto key = BinaryArray::FromIntArray({1, 2, 3, 5}, pool.get()); auto value = BinaryArray::FromLongArray({100ll, 200ll, 300ll, 500ll}, pool.get()); - ASSERT_OK_AND_ASSIGN(auto inner_map, BinaryMap::ValueOf(key, value, pool.get())); + auto inner_map = BinaryMap::ValueOf(key, value, pool.get()); BinaryRow row(3); BinaryRowWriter writer(&row, /*initial_size=*/1024, pool.get()); diff --git a/src/paimon/common/data/binary_string_test.cpp b/src/paimon/common/data/binary_string_test.cpp index bbdbc4fa..c5be57a1 100644 --- a/src/paimon/common/data/binary_string_test.cpp +++ b/src/paimon/common/data/binary_string_test.cpp @@ -34,41 +34,26 @@ class BinaryStringTest : public testing::Test { return BinaryString::FromString(str, pool.get()); } - template - void InnerCheckEqual(T&& expected, U&& actual) { - ASSERT_EQ(std::forward(expected), std::forward(actual)); - } - - template - void InnerCheck(T&& expected) { - ASSERT_TRUE(std::forward(expected)); - } - - template - void InnerCheckFalse(T&& expected) { - ASSERT_FALSE(std::forward(expected)); - } - void CheckBasic(const std::string& str, int32_t len) { BinaryString s1 = FromString(str); auto pool = GetDefaultPool(); std::shared_ptr bytes = Bytes::AllocateBytes(str, pool.get()); BinaryString s2 = BinaryString::FromBytes(bytes); - InnerCheckEqual(len, s1.NumChars()); - InnerCheckEqual(len, s2.NumChars()); + ASSERT_EQ(len, s1.NumChars()); + ASSERT_EQ(len, s2.NumChars()); - InnerCheckEqual(str, s1.ToString()); - InnerCheckEqual(str, s2.ToString()); - InnerCheck(s1 == s2); + ASSERT_EQ(str, s1.ToString()); + ASSERT_EQ(str, s2.ToString()); + ASSERT_TRUE(s1 == s2); - InnerCheckEqual(s2.HashCode(), s1.HashCode()); + ASSERT_EQ(s2.HashCode(), s1.HashCode()); - InnerCheck(s1.Contains(s2)); - InnerCheck(s2.Contains(s1)); - InnerCheck(s1.StartsWith(s1)); - InnerCheck(s1.EndsWith(s1)); - InnerCheck(s2.StartsWith(s2)); - InnerCheck(s2.EndsWith(s2)); + ASSERT_TRUE(s1.Contains(s2)); + ASSERT_TRUE(s2.Contains(s1)); + ASSERT_TRUE(s1.StartsWith(s1)); + ASSERT_TRUE(s1.EndsWith(s1)); + ASSERT_TRUE(s2.StartsWith(s2)); + ASSERT_TRUE(s2.EndsWith(s2)); } }; @@ -87,43 +72,35 @@ TEST_F(BinaryStringTest, TestBasic) { } TEST_F(BinaryStringTest, EmptyStringTest) { - InnerCheckEqual(FromString(""), BinaryString::EmptyUtf8()); + ASSERT_EQ(FromString(""), BinaryString::EmptyUtf8()); std::string empty_str; auto pool = GetDefaultPool(); std::shared_ptr bytes = Bytes::AllocateBytes(empty_str, pool.get()); - InnerCheckEqual(BinaryString::FromBytes(bytes), BinaryString::EmptyUtf8()); - InnerCheckEqual(BinaryString::EmptyUtf8().NumChars(), 0); - InnerCheckEqual(BinaryString::EmptyUtf8().GetSizeInBytes(), 0); + ASSERT_EQ(BinaryString::FromBytes(bytes), BinaryString::EmptyUtf8()); + ASSERT_EQ(BinaryString::EmptyUtf8().NumChars(), 0); + ASSERT_EQ(BinaryString::EmptyUtf8().GetSizeInBytes(), 0); } TEST_F(BinaryStringTest, TestCompareTo) { auto pool = GetDefaultPool(); - InnerCheckEqual(FromString(" ").CompareTo(BinaryString::BlankString(3, pool.get())), 0); - InnerCheck(FromString("").CompareTo(FromString("a")) < 0); - InnerCheck(FromString("abc").CompareTo(FromString("ABC")) > 0); - InnerCheck(FromString("abc0").CompareTo(FromString("abc")) > 0); - InnerCheckEqual(FromString("abcabcabc").CompareTo(FromString("abcabcabc")), 0); - InnerCheck(FromString("aBcabcabc").CompareTo(FromString("Abcabcabc")) > 0); - InnerCheck(FromString("Abcabcabc").CompareTo(FromString("abcabcabC")) < 0); - InnerCheck(FromString("abcabcabc").CompareTo(FromString("abcabcabC")) > 0); - - InnerCheck(FromString("abc").CompareTo(FromString("世界")) < 0); - InnerCheck(FromString("你好").CompareTo(FromString("世界")) > 0); - InnerCheck(FromString("你好123").CompareTo(FromString("你好122")) > 0); + ASSERT_EQ(FromString(" ").CompareTo(BinaryString::BlankString(3, pool.get())), 0); + ASSERT_TRUE(FromString("").CompareTo(FromString("a")) < 0); + ASSERT_TRUE(FromString("abc").CompareTo(FromString("ABC")) > 0); + ASSERT_TRUE(FromString("abc0").CompareTo(FromString("abc")) > 0); + ASSERT_EQ(FromString("abcabcabc").CompareTo(FromString("abcabcabc")), 0); + ASSERT_TRUE(FromString("aBcabcabc").CompareTo(FromString("Abcabcabc")) > 0); + ASSERT_TRUE(FromString("Abcabcabc").CompareTo(FromString("abcabcabC")) < 0); + ASSERT_TRUE(FromString("abcabcabc").CompareTo(FromString("abcabcabC")) > 0); + + ASSERT_TRUE(FromString("abc").CompareTo(FromString("世界")) < 0); + ASSERT_TRUE(FromString("你好").CompareTo(FromString("世界")) > 0); + ASSERT_TRUE(FromString("你好123").CompareTo(FromString("你好122")) > 0); } TEST_F(BinaryStringTest, TestSingleSegment) { // prepare auto pool = GetDefaultPool(); - auto bytes1 = Bytes::AllocateBytes(10, pool.get()); - auto src1 = Bytes::AllocateBytes("abcde", pool.get()); - auto src2 = Bytes::AllocateBytes("aaaaa", pool.get()); - std::memcpy(bytes1->data() + 5, src1->data(), 5); - std::memcpy(bytes1->data() + 0, src2->data(), 5); - // bytes1 content: "aaaaaabcde" - // Actually we want "abcdeaaaaa" at offset 5, but that requires 15 bytes. - // Let us use a simpler approach: - std::shared_ptr data1 = Bytes::AllocateBytes("abcdeaaaaa", pool.get()); + std::shared_ptr data1 = Bytes::AllocateBytes("aaaaaabcde", pool.get()); MemorySegment seg1 = MemorySegment::Wrap(data1); std::shared_ptr data2 = Bytes::AllocateBytes("abcdeb", pool.get()); @@ -132,77 +109,75 @@ TEST_F(BinaryStringTest, TestSingleSegment) { // test compare BinaryString binary_string1 = BinaryString::FromAddress(seg1, 0, 10); BinaryString binary_string2 = BinaryString::FromAddress(seg2, 0, 6); - InnerCheckEqual(binary_string1.ToString(), "abcdeaaaaa"); - InnerCheckEqual(binary_string2.ToString(), "abcdeb"); - InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); + ASSERT_EQ(binary_string1.ToString(), "aaaaaabcde"); + ASSERT_EQ(binary_string2.ToString(), "abcdeb"); + ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); ASSERT_EQ(binary_string1, binary_string1); ASSERT_TRUE(binary_string1 < binary_string2); // test equal length compare - binary_string1 = BinaryString::FromAddress(seg1, 0, 5); + binary_string1 = BinaryString::FromAddress(seg1, 5, 5); binary_string2 = BinaryString::FromAddress(seg2, 0, 5); - InnerCheckEqual(binary_string1.ToString(), "abcde"); - InnerCheckEqual(binary_string2.ToString(), "abcde"); - InnerCheckEqual(binary_string1, binary_string2); + ASSERT_EQ(binary_string1.ToString(), "abcde"); + ASSERT_EQ(binary_string2.ToString(), "abcde"); + ASSERT_EQ(binary_string1, binary_string2); - // test with offset - std::shared_ptr data3 = Bytes::AllocateBytes("aaaaa", pool.get()); - MemorySegment seg3 = MemorySegment::Wrap(data3); - binary_string1 = BinaryString::FromAddress(seg3, 0, 5); + // test not equal + binary_string1 = BinaryString::FromAddress(seg1, 0, 5); binary_string2 = BinaryString::FromAddress(seg2, 0, 5); - InnerCheckEqual(binary_string1.ToString(), "aaaaa"); - InnerCheckEqual(binary_string2.ToString(), "abcde"); - InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); - InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); + ASSERT_EQ(binary_string1.ToString(), "aaaaa"); + ASSERT_EQ(binary_string2.ToString(), "abcde"); + ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); + ASSERT_EQ(binary_string2.CompareTo(binary_string1), 1); // test with offset in single segment - std::shared_ptr data4 = Bytes::AllocateBytes(10, pool.get()); - MemorySegment seg4 = MemorySegment::Wrap(data4); - seg4.Put(4, Bytes("abcdeb", pool.get()), 0, 6); - binary_string2 = BinaryString::FromAddress(seg4, 4, 6); - InnerCheckEqual(binary_string2.ToString(), "abcdeb"); - InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); - InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); + std::shared_ptr data3 = Bytes::AllocateBytes(10, pool.get()); + MemorySegment seg3 = MemorySegment::Wrap(data3); + seg3.Put(4, Bytes("abcdeb", pool.get()), 0, 6); + binary_string2 = BinaryString::FromAddress(seg3, 4, 6); + ASSERT_EQ(binary_string2.ToString(), "abcdeb"); + ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); + ASSERT_EQ(binary_string2.CompareTo(binary_string1), 1); } TEST_F(BinaryStringTest, TestContains) { - InnerCheck(BinaryString::EmptyUtf8().Contains(BinaryString::EmptyUtf8())); - InnerCheck(FromString("hello").Contains(FromString("ello"))); - InnerCheckFalse(FromString("hello").Contains(FromString("vello"))); - InnerCheckFalse(FromString("hello").Contains(FromString("hellooo"))); - InnerCheck(FromString("大千世界").Contains(FromString("千世界"))); - InnerCheckFalse(FromString("大千世界").Contains(FromString("世千"))); - InnerCheckFalse(FromString("大千世界").Contains(FromString("大千世界好"))); + ASSERT_TRUE(BinaryString::EmptyUtf8().Contains(BinaryString::EmptyUtf8())); + ASSERT_TRUE(FromString("hello").Contains(FromString("ello"))); + ASSERT_FALSE(FromString("hello").Contains(FromString("vello"))); + ASSERT_FALSE(FromString("hello").Contains(FromString("hellooo"))); + ASSERT_TRUE(FromString("大千世界").Contains(FromString("千世界"))); + ASSERT_FALSE(FromString("大千世界").Contains(FromString("世千"))); + ASSERT_FALSE(FromString("大千世界").Contains(FromString("大千世界好"))); } TEST_F(BinaryStringTest, TestStartsWith) { - InnerCheck(BinaryString::EmptyUtf8().StartsWith(BinaryString::EmptyUtf8())); - InnerCheck(FromString("hello").StartsWith(FromString("hell"))); - InnerCheckFalse(FromString("hello").StartsWith(FromString("ell"))); - InnerCheckFalse(FromString("hello").StartsWith(FromString("hellooo"))); - InnerCheck(FromString("数据砖头").StartsWith(FromString("数据"))); - InnerCheckFalse(FromString("大千世界").StartsWith(FromString("千"))); - InnerCheckFalse(FromString("大千世界").StartsWith(FromString("大千世界好"))); + ASSERT_TRUE(BinaryString::EmptyUtf8().StartsWith(BinaryString::EmptyUtf8())); + ASSERT_TRUE(FromString("hello").StartsWith(FromString("hell"))); + ASSERT_FALSE(FromString("hello").StartsWith(FromString("ell"))); + ASSERT_FALSE(FromString("hello").StartsWith(FromString("hellooo"))); + ASSERT_TRUE(FromString("数据砖头").StartsWith(FromString("数据"))); + ASSERT_FALSE(FromString("大千世界").StartsWith(FromString("千"))); + ASSERT_FALSE(FromString("大千世界").StartsWith(FromString("大千世界好"))); } TEST_F(BinaryStringTest, TestEndsWith) { - InnerCheck(BinaryString::EmptyUtf8().EndsWith(BinaryString::EmptyUtf8())); - InnerCheck(FromString("hello").EndsWith(FromString("ello"))); - InnerCheckFalse(FromString("hello").EndsWith(FromString("ellov"))); - InnerCheckFalse(FromString("hello").EndsWith(FromString("hhhello"))); - InnerCheck(FromString("大千世界").EndsWith(FromString("世界"))); - InnerCheckFalse(FromString("大千世界").EndsWith(FromString("世"))); - InnerCheckFalse(FromString("数据砖头").EndsWith(FromString("我的数据砖头"))); + ASSERT_TRUE(BinaryString::EmptyUtf8().EndsWith(BinaryString::EmptyUtf8())); + ASSERT_TRUE(FromString("hello").EndsWith(FromString("ello"))); + ASSERT_FALSE(FromString("hello").EndsWith(FromString("ellov"))); + ASSERT_FALSE(FromString("hello").EndsWith(FromString("hhhello"))); + ASSERT_TRUE(FromString("大千世界").EndsWith(FromString("世界"))); + ASSERT_FALSE(FromString("大千世界").EndsWith(FromString("世"))); + ASSERT_FALSE(FromString("数据砖头").EndsWith(FromString("我的数据砖头"))); } TEST_F(BinaryStringTest, TestSubstring) { auto pool = GetDefaultPool(); - InnerCheckEqual(FromString("hello").Substring(0, 0, pool.get()), BinaryString::EmptyUtf8()); - InnerCheckEqual(FromString("hello").Substring(1, 3, pool.get()), FromString("el")); - InnerCheckEqual(FromString("数据砖头").Substring(0, 1, pool.get()), FromString("数")); - InnerCheckEqual(FromString("数据砖头").Substring(1, 3, pool.get()), FromString("据砖")); - InnerCheckEqual(FromString("数据砖头").Substring(3, 5, pool.get()), FromString("头")); - InnerCheckEqual(FromString("ߵ梷").Substring(0, 2, pool.get()), FromString("ߵ梷")); + ASSERT_EQ(FromString("hello").Substring(0, 0, pool.get()), BinaryString::EmptyUtf8()); + ASSERT_EQ(FromString("hello").Substring(1, 3, pool.get()), FromString("el")); + ASSERT_EQ(FromString("数据砖头").Substring(0, 1, pool.get()), FromString("数")); + ASSERT_EQ(FromString("数据砖头").Substring(1, 3, pool.get()), FromString("据砖")); + ASSERT_EQ(FromString("数据砖头").Substring(3, 5, pool.get()), FromString("头")); + ASSERT_EQ(FromString("ߵ梷").Substring(0, 2, pool.get()), FromString("ߵ梷")); } TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) { @@ -214,28 +189,28 @@ TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) { int32_t left = 6, right = 20; // Substring [left, right), The right is not included - InnerCheckEqual(binary_string.Substring(left, right, pool.get()), - FromString(combined.substr(left, right - left))); + ASSERT_EQ(binary_string.Substring(left, right, pool.get()), + FromString(combined.substr(left, right - left))); // CopyBinaryString [left, right], The right is included - InnerCheckEqual(binary_string.CopyBinaryString(left, right, pool.get()), - FromString(combined.substr(left, right - left + 1))); - InnerCheckEqual(binary_string.CopyBinaryString(0, 11, pool.get()), FromString("hello world!")); + ASSERT_EQ(binary_string.CopyBinaryString(left, right, pool.get()), + FromString(combined.substr(left, right - left + 1))); + ASSERT_EQ(binary_string.CopyBinaryString(0, 11, pool.get()), FromString("hello world!")); } TEST_F(BinaryStringTest, TestIndexOf) { { - InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(BinaryString::EmptyUtf8(), 0), 0); - InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(FromString("l"), 0), -1); - InnerCheckEqual(FromString("hello").IndexOf(BinaryString::EmptyUtf8(), 0), 0); - InnerCheckEqual(FromString("hello").IndexOf(FromString("l"), 0), 2); - InnerCheckEqual(FromString("hello").IndexOf(FromString("l"), 3), 3); - InnerCheckEqual(FromString("hello").IndexOf(FromString("a"), 0), -1); - InnerCheckEqual(FromString("hello").IndexOf(FromString("ll"), 0), 2); - InnerCheckEqual(FromString("hello").IndexOf(FromString("ll"), 4), -1); - InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("据砖"), 0), 1); - InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("数"), 3), -1); - InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("数"), 0), 0); - InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("头"), 0), 3); + ASSERT_EQ(BinaryString::EmptyUtf8().IndexOf(BinaryString::EmptyUtf8(), 0), 0); + ASSERT_EQ(BinaryString::EmptyUtf8().IndexOf(FromString("l"), 0), -1); + ASSERT_EQ(FromString("hello").IndexOf(BinaryString::EmptyUtf8(), 0), 0); + ASSERT_EQ(FromString("hello").IndexOf(FromString("l"), 0), 2); + ASSERT_EQ(FromString("hello").IndexOf(FromString("l"), 3), 3); + ASSERT_EQ(FromString("hello").IndexOf(FromString("a"), 0), -1); + ASSERT_EQ(FromString("hello").IndexOf(FromString("ll"), 0), 2); + ASSERT_EQ(FromString("hello").IndexOf(FromString("ll"), 4), -1); + ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("据砖"), 0), 1); + ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("数"), 3), -1); + ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("数"), 0), 0); + ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("头"), 0), 3); } { auto pool = GetDefaultPool(); @@ -245,25 +220,25 @@ TEST_F(BinaryStringTest, TestIndexOf) { auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/combined.length()); ASSERT_EQ(combined, binary_string.ToString()); - InnerCheckEqual(binary_string.IndexOf(FromString("value"), 0), 48); - InnerCheckEqual(binary_string.IndexOf(FromString("value"), 5), 48); - InnerCheckEqual(binary_string.IndexOf(FromString("vvalue"), 0), -1); - InnerCheckEqual(binary_string.IndexOf(FromString("!"), 0), -1); + ASSERT_EQ(binary_string.IndexOf(FromString("value"), 0), 48); + ASSERT_EQ(binary_string.IndexOf(FromString("value"), 5), 48); + ASSERT_EQ(binary_string.IndexOf(FromString("vvalue"), 0), -1); + ASSERT_EQ(binary_string.IndexOf(FromString("!"), 0), -1); } } TEST_F(BinaryStringTest, TestToUpperLowerCase) { auto pool = GetDefaultPool(); - InnerCheckEqual(FromString("我是中国人").ToLowerCase(pool.get()), FromString("我是中国人")); - InnerCheckEqual(FromString("我是中国人").ToUpperCase(pool.get()), FromString("我是中国人")); - InnerCheckEqual(BinaryString::EmptyUtf8().ToUpperCase(pool.get()), BinaryString::EmptyUtf8()); + ASSERT_EQ(FromString("我是中国人").ToLowerCase(pool.get()), FromString("我是中国人")); + ASSERT_EQ(FromString("我是中国人").ToUpperCase(pool.get()), FromString("我是中国人")); + ASSERT_EQ(BinaryString::EmptyUtf8().ToUpperCase(pool.get()), BinaryString::EmptyUtf8()); - InnerCheckEqual(FromString("aBcDeFg").ToLowerCase(pool.get()), FromString("abcdefg")); - InnerCheckEqual(FromString("aBcDeFg").ToUpperCase(pool.get()), FromString("ABCDEFG")); + ASSERT_EQ(FromString("aBcDeFg").ToLowerCase(pool.get()), FromString("abcdefg")); + ASSERT_EQ(FromString("aBcDeFg").ToUpperCase(pool.get()), FromString("ABCDEFG")); - InnerCheckEqual(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); - InnerCheckEqual(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); - InnerCheckEqual(BinaryString::EmptyUtf8().ToLowerCase(pool.get()), BinaryString::EmptyUtf8()); + ASSERT_EQ(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); + ASSERT_EQ(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); + ASSERT_EQ(BinaryString::EmptyUtf8().ToLowerCase(pool.get()), BinaryString::EmptyUtf8()); } TEST_F(BinaryStringTest, TestEmptyString) { @@ -276,17 +251,17 @@ TEST_F(BinaryStringTest, TestEmptyString) { str3 = BinaryString::FromAddress(seg0, /*offset=*/5, /*num_bytes=*/0); } - InnerCheck(BinaryString::EmptyUtf8().CompareTo(str2) < 0); - InnerCheck(str2.CompareTo(BinaryString::EmptyUtf8()) > 0); + ASSERT_TRUE(BinaryString::EmptyUtf8().CompareTo(str2) < 0); + ASSERT_TRUE(str2.CompareTo(BinaryString::EmptyUtf8()) > 0); - InnerCheckEqual(BinaryString::EmptyUtf8().CompareTo(str3), 0); - InnerCheckEqual(str3.CompareTo(BinaryString::EmptyUtf8()), 0); + ASSERT_EQ(BinaryString::EmptyUtf8().CompareTo(str3), 0); + ASSERT_EQ(str3.CompareTo(BinaryString::EmptyUtf8()), 0); - InnerCheckFalse(str2 == BinaryString::EmptyUtf8()); - InnerCheckFalse(BinaryString::EmptyUtf8() == str2); + ASSERT_FALSE(str2 == BinaryString::EmptyUtf8()); + ASSERT_FALSE(BinaryString::EmptyUtf8() == str2); - InnerCheckEqual(str3, BinaryString::EmptyUtf8()); - InnerCheckEqual(BinaryString::EmptyUtf8(), str3); + ASSERT_EQ(str3, BinaryString::EmptyUtf8()); + ASSERT_EQ(BinaryString::EmptyUtf8(), str3); } TEST_F(BinaryStringTest, TestSkipWrongFirstByte) { @@ -301,7 +276,7 @@ TEST_F(BinaryStringTest, TestSkipWrongFirstByte) { std::shared_ptr c = Bytes::AllocateBytes(1, pool.get()); for (int32_t wrong_first_byte : wrong_first_bytes) { (*c)[0] = static_cast(wrong_first_byte); - InnerCheckEqual(1, BinaryString::FromBytes(c).NumChars()); + ASSERT_EQ(1, BinaryString::FromBytes(c).NumChars()); } } From 19e135f3bdf6e0462159832e8bcbb40c9a874b17 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 09:52:09 +0800 Subject: [PATCH 07/11] fix --- src/paimon/common/data/binary_string_test.cpp | 255 +++++++++--------- 1 file changed, 135 insertions(+), 120 deletions(-) diff --git a/src/paimon/common/data/binary_string_test.cpp b/src/paimon/common/data/binary_string_test.cpp index c5be57a1..adca626e 100644 --- a/src/paimon/common/data/binary_string_test.cpp +++ b/src/paimon/common/data/binary_string_test.cpp @@ -34,26 +34,41 @@ class BinaryStringTest : public testing::Test { return BinaryString::FromString(str, pool.get()); } + template + void InnerCheckEqual(T&& expected, U&& actual) { + ASSERT_EQ(std::forward(expected), std::forward(actual)); + } + + template + void InnerCheck(T&& expected) { + ASSERT_TRUE(std::forward(expected)); + } + + template + void InnerCheckFalse(T&& expected) { + ASSERT_FALSE(std::forward(expected)); + } + void CheckBasic(const std::string& str, int32_t len) { BinaryString s1 = FromString(str); auto pool = GetDefaultPool(); std::shared_ptr bytes = Bytes::AllocateBytes(str, pool.get()); BinaryString s2 = BinaryString::FromBytes(bytes); - ASSERT_EQ(len, s1.NumChars()); - ASSERT_EQ(len, s2.NumChars()); + InnerCheckEqual(len, s1.NumChars()); + InnerCheckEqual(len, s2.NumChars()); - ASSERT_EQ(str, s1.ToString()); - ASSERT_EQ(str, s2.ToString()); - ASSERT_TRUE(s1 == s2); + InnerCheckEqual(str, s1.ToString()); + InnerCheckEqual(str, s2.ToString()); + InnerCheck(s1 == s2); - ASSERT_EQ(s2.HashCode(), s1.HashCode()); + InnerCheckEqual(s2.HashCode(), s1.HashCode()); - ASSERT_TRUE(s1.Contains(s2)); - ASSERT_TRUE(s2.Contains(s1)); - ASSERT_TRUE(s1.StartsWith(s1)); - ASSERT_TRUE(s1.EndsWith(s1)); - ASSERT_TRUE(s2.StartsWith(s2)); - ASSERT_TRUE(s2.EndsWith(s2)); + InnerCheck(s1.Contains(s2)); + InnerCheck(s2.Contains(s1)); + InnerCheck(s1.StartsWith(s1)); + InnerCheck(s1.EndsWith(s1)); + InnerCheck(s2.StartsWith(s2)); + InnerCheck(s2.EndsWith(s2)); } }; @@ -72,29 +87,29 @@ TEST_F(BinaryStringTest, TestBasic) { } TEST_F(BinaryStringTest, EmptyStringTest) { - ASSERT_EQ(FromString(""), BinaryString::EmptyUtf8()); + InnerCheckEqual(FromString(""), BinaryString::EmptyUtf8()); std::string empty_str; auto pool = GetDefaultPool(); std::shared_ptr bytes = Bytes::AllocateBytes(empty_str, pool.get()); - ASSERT_EQ(BinaryString::FromBytes(bytes), BinaryString::EmptyUtf8()); - ASSERT_EQ(BinaryString::EmptyUtf8().NumChars(), 0); - ASSERT_EQ(BinaryString::EmptyUtf8().GetSizeInBytes(), 0); + InnerCheckEqual(BinaryString::FromBytes(bytes), BinaryString::EmptyUtf8()); + InnerCheckEqual(BinaryString::EmptyUtf8().NumChars(), 0); + InnerCheckEqual(BinaryString::EmptyUtf8().GetSizeInBytes(), 0); } TEST_F(BinaryStringTest, TestCompareTo) { auto pool = GetDefaultPool(); - ASSERT_EQ(FromString(" ").CompareTo(BinaryString::BlankString(3, pool.get())), 0); - ASSERT_TRUE(FromString("").CompareTo(FromString("a")) < 0); - ASSERT_TRUE(FromString("abc").CompareTo(FromString("ABC")) > 0); - ASSERT_TRUE(FromString("abc0").CompareTo(FromString("abc")) > 0); - ASSERT_EQ(FromString("abcabcabc").CompareTo(FromString("abcabcabc")), 0); - ASSERT_TRUE(FromString("aBcabcabc").CompareTo(FromString("Abcabcabc")) > 0); - ASSERT_TRUE(FromString("Abcabcabc").CompareTo(FromString("abcabcabC")) < 0); - ASSERT_TRUE(FromString("abcabcabc").CompareTo(FromString("abcabcabC")) > 0); - - ASSERT_TRUE(FromString("abc").CompareTo(FromString("世界")) < 0); - ASSERT_TRUE(FromString("你好").CompareTo(FromString("世界")) > 0); - ASSERT_TRUE(FromString("你好123").CompareTo(FromString("你好122")) > 0); + InnerCheckEqual(FromString(" ").CompareTo(BinaryString::BlankString(3, pool.get())), 0); + InnerCheck(FromString("").CompareTo(FromString("a")) < 0); + InnerCheck(FromString("abc").CompareTo(FromString("ABC")) > 0); + InnerCheck(FromString("abc0").CompareTo(FromString("abc")) > 0); + InnerCheckEqual(FromString("abcabcabc").CompareTo(FromString("abcabcabc")), 0); + InnerCheck(FromString("aBcabcabc").CompareTo(FromString("Abcabcabc")) > 0); + InnerCheck(FromString("Abcabcabc").CompareTo(FromString("abcabcabC")) < 0); + InnerCheck(FromString("abcabcabc").CompareTo(FromString("abcabcabC")) > 0); + + InnerCheck(FromString("abc").CompareTo(FromString("世界")) < 0); + InnerCheck(FromString("你好").CompareTo(FromString("世界")) > 0); + InnerCheck(FromString("你好123").CompareTo(FromString("你好122")) > 0); } TEST_F(BinaryStringTest, TestSingleSegment) { @@ -109,75 +124,75 @@ TEST_F(BinaryStringTest, TestSingleSegment) { // test compare BinaryString binary_string1 = BinaryString::FromAddress(seg1, 0, 10); BinaryString binary_string2 = BinaryString::FromAddress(seg2, 0, 6); - ASSERT_EQ(binary_string1.ToString(), "aaaaaabcde"); - ASSERT_EQ(binary_string2.ToString(), "abcdeb"); - ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); - ASSERT_EQ(binary_string1, binary_string1); - ASSERT_TRUE(binary_string1 < binary_string2); + InnerCheckEqual(binary_string1.ToString(), "aaaaaabcde"); + InnerCheckEqual(binary_string2.ToString(), "abcdeb"); + InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); + InnerCheckEqual(binary_string1, binary_string1); + InnerCheck(binary_string1 < binary_string2); // test equal length compare binary_string1 = BinaryString::FromAddress(seg1, 5, 5); binary_string2 = BinaryString::FromAddress(seg2, 0, 5); - ASSERT_EQ(binary_string1.ToString(), "abcde"); - ASSERT_EQ(binary_string2.ToString(), "abcde"); - ASSERT_EQ(binary_string1, binary_string2); + InnerCheckEqual(binary_string1.ToString(), "abcde"); + InnerCheckEqual(binary_string2.ToString(), "abcde"); + InnerCheckEqual(binary_string1, binary_string2); // test not equal binary_string1 = BinaryString::FromAddress(seg1, 0, 5); binary_string2 = BinaryString::FromAddress(seg2, 0, 5); - ASSERT_EQ(binary_string1.ToString(), "aaaaa"); - ASSERT_EQ(binary_string2.ToString(), "abcde"); - ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); - ASSERT_EQ(binary_string2.CompareTo(binary_string1), 1); + InnerCheckEqual(binary_string1.ToString(), "aaaaa"); + InnerCheckEqual(binary_string2.ToString(), "abcde"); + InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); + InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); // test with offset in single segment std::shared_ptr data3 = Bytes::AllocateBytes(10, pool.get()); MemorySegment seg3 = MemorySegment::Wrap(data3); seg3.Put(4, Bytes("abcdeb", pool.get()), 0, 6); binary_string2 = BinaryString::FromAddress(seg3, 4, 6); - ASSERT_EQ(binary_string2.ToString(), "abcdeb"); - ASSERT_EQ(binary_string1.CompareTo(binary_string2), -1); - ASSERT_EQ(binary_string2.CompareTo(binary_string1), 1); + InnerCheckEqual(binary_string2.ToString(), "abcdeb"); + InnerCheckEqual(binary_string1.CompareTo(binary_string2), -1); + InnerCheckEqual(binary_string2.CompareTo(binary_string1), 1); } TEST_F(BinaryStringTest, TestContains) { - ASSERT_TRUE(BinaryString::EmptyUtf8().Contains(BinaryString::EmptyUtf8())); - ASSERT_TRUE(FromString("hello").Contains(FromString("ello"))); - ASSERT_FALSE(FromString("hello").Contains(FromString("vello"))); - ASSERT_FALSE(FromString("hello").Contains(FromString("hellooo"))); - ASSERT_TRUE(FromString("大千世界").Contains(FromString("千世界"))); - ASSERT_FALSE(FromString("大千世界").Contains(FromString("世千"))); - ASSERT_FALSE(FromString("大千世界").Contains(FromString("大千世界好"))); + InnerCheck(BinaryString::EmptyUtf8().Contains(BinaryString::EmptyUtf8())); + InnerCheck(FromString("hello").Contains(FromString("ello"))); + InnerCheckFalse(FromString("hello").Contains(FromString("vello"))); + InnerCheckFalse(FromString("hello").Contains(FromString("hellooo"))); + InnerCheck(FromString("大千世界").Contains(FromString("千世界"))); + InnerCheckFalse(FromString("大千世界").Contains(FromString("世千"))); + InnerCheckFalse(FromString("大千世界").Contains(FromString("大千世界好"))); } TEST_F(BinaryStringTest, TestStartsWith) { - ASSERT_TRUE(BinaryString::EmptyUtf8().StartsWith(BinaryString::EmptyUtf8())); - ASSERT_TRUE(FromString("hello").StartsWith(FromString("hell"))); - ASSERT_FALSE(FromString("hello").StartsWith(FromString("ell"))); - ASSERT_FALSE(FromString("hello").StartsWith(FromString("hellooo"))); - ASSERT_TRUE(FromString("数据砖头").StartsWith(FromString("数据"))); - ASSERT_FALSE(FromString("大千世界").StartsWith(FromString("千"))); - ASSERT_FALSE(FromString("大千世界").StartsWith(FromString("大千世界好"))); + InnerCheck(BinaryString::EmptyUtf8().StartsWith(BinaryString::EmptyUtf8())); + InnerCheck(FromString("hello").StartsWith(FromString("hell"))); + InnerCheckFalse(FromString("hello").StartsWith(FromString("ell"))); + InnerCheckFalse(FromString("hello").StartsWith(FromString("hellooo"))); + InnerCheck(FromString("数据砖头").StartsWith(FromString("数据"))); + InnerCheckFalse(FromString("大千世界").StartsWith(FromString("千"))); + InnerCheckFalse(FromString("大千世界").StartsWith(FromString("大千世界好"))); } TEST_F(BinaryStringTest, TestEndsWith) { - ASSERT_TRUE(BinaryString::EmptyUtf8().EndsWith(BinaryString::EmptyUtf8())); - ASSERT_TRUE(FromString("hello").EndsWith(FromString("ello"))); - ASSERT_FALSE(FromString("hello").EndsWith(FromString("ellov"))); - ASSERT_FALSE(FromString("hello").EndsWith(FromString("hhhello"))); - ASSERT_TRUE(FromString("大千世界").EndsWith(FromString("世界"))); - ASSERT_FALSE(FromString("大千世界").EndsWith(FromString("世"))); - ASSERT_FALSE(FromString("数据砖头").EndsWith(FromString("我的数据砖头"))); + InnerCheck(BinaryString::EmptyUtf8().EndsWith(BinaryString::EmptyUtf8())); + InnerCheck(FromString("hello").EndsWith(FromString("ello"))); + InnerCheckFalse(FromString("hello").EndsWith(FromString("ellov"))); + InnerCheckFalse(FromString("hello").EndsWith(FromString("hhhello"))); + InnerCheck(FromString("大千世界").EndsWith(FromString("世界"))); + InnerCheckFalse(FromString("大千世界").EndsWith(FromString("世"))); + InnerCheckFalse(FromString("数据砖头").EndsWith(FromString("我的数据砖头"))); } TEST_F(BinaryStringTest, TestSubstring) { auto pool = GetDefaultPool(); - ASSERT_EQ(FromString("hello").Substring(0, 0, pool.get()), BinaryString::EmptyUtf8()); - ASSERT_EQ(FromString("hello").Substring(1, 3, pool.get()), FromString("el")); - ASSERT_EQ(FromString("数据砖头").Substring(0, 1, pool.get()), FromString("数")); - ASSERT_EQ(FromString("数据砖头").Substring(1, 3, pool.get()), FromString("据砖")); - ASSERT_EQ(FromString("数据砖头").Substring(3, 5, pool.get()), FromString("头")); - ASSERT_EQ(FromString("ߵ梷").Substring(0, 2, pool.get()), FromString("ߵ梷")); + InnerCheckEqual(FromString("hello").Substring(0, 0, pool.get()), BinaryString::EmptyUtf8()); + InnerCheckEqual(FromString("hello").Substring(1, 3, pool.get()), FromString("el")); + InnerCheckEqual(FromString("数据砖头").Substring(0, 1, pool.get()), FromString("数")); + InnerCheckEqual(FromString("数据砖头").Substring(1, 3, pool.get()), FromString("据砖")); + InnerCheckEqual(FromString("数据砖头").Substring(3, 5, pool.get()), FromString("头")); + InnerCheckEqual(FromString("ߵ梷").Substring(0, 2, pool.get()), FromString("ߵ梷")); } TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) { @@ -189,28 +204,28 @@ TEST_F(BinaryStringTest, TestSubStringAndCopyBinaryString) { int32_t left = 6, right = 20; // Substring [left, right), The right is not included - ASSERT_EQ(binary_string.Substring(left, right, pool.get()), - FromString(combined.substr(left, right - left))); + InnerCheckEqual(binary_string.Substring(left, right, pool.get()), + FromString(combined.substr(left, right - left))); // CopyBinaryString [left, right], The right is included - ASSERT_EQ(binary_string.CopyBinaryString(left, right, pool.get()), - FromString(combined.substr(left, right - left + 1))); - ASSERT_EQ(binary_string.CopyBinaryString(0, 11, pool.get()), FromString("hello world!")); + InnerCheckEqual(binary_string.CopyBinaryString(left, right, pool.get()), + FromString(combined.substr(left, right - left + 1))); + InnerCheckEqual(binary_string.CopyBinaryString(0, 11, pool.get()), FromString("hello world!")); } TEST_F(BinaryStringTest, TestIndexOf) { { - ASSERT_EQ(BinaryString::EmptyUtf8().IndexOf(BinaryString::EmptyUtf8(), 0), 0); - ASSERT_EQ(BinaryString::EmptyUtf8().IndexOf(FromString("l"), 0), -1); - ASSERT_EQ(FromString("hello").IndexOf(BinaryString::EmptyUtf8(), 0), 0); - ASSERT_EQ(FromString("hello").IndexOf(FromString("l"), 0), 2); - ASSERT_EQ(FromString("hello").IndexOf(FromString("l"), 3), 3); - ASSERT_EQ(FromString("hello").IndexOf(FromString("a"), 0), -1); - ASSERT_EQ(FromString("hello").IndexOf(FromString("ll"), 0), 2); - ASSERT_EQ(FromString("hello").IndexOf(FromString("ll"), 4), -1); - ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("据砖"), 0), 1); - ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("数"), 3), -1); - ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("数"), 0), 0); - ASSERT_EQ(FromString("数据砖头").IndexOf(FromString("头"), 0), 3); + InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(BinaryString::EmptyUtf8(), 0), 0); + InnerCheckEqual(BinaryString::EmptyUtf8().IndexOf(FromString("l"), 0), -1); + InnerCheckEqual(FromString("hello").IndexOf(BinaryString::EmptyUtf8(), 0), 0); + InnerCheckEqual(FromString("hello").IndexOf(FromString("l"), 0), 2); + InnerCheckEqual(FromString("hello").IndexOf(FromString("l"), 3), 3); + InnerCheckEqual(FromString("hello").IndexOf(FromString("a"), 0), -1); + InnerCheckEqual(FromString("hello").IndexOf(FromString("ll"), 0), 2); + InnerCheckEqual(FromString("hello").IndexOf(FromString("ll"), 4), -1); + InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("据砖"), 0), 1); + InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("数"), 3), -1); + InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("数"), 0), 0); + InnerCheckEqual(FromString("数据砖头").IndexOf(FromString("头"), 0), 3); } { auto pool = GetDefaultPool(); @@ -219,26 +234,26 @@ TEST_F(BinaryStringTest, TestIndexOf) { MemorySegment seg = MemorySegment::Wrap(bytes); auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/combined.length()); - ASSERT_EQ(combined, binary_string.ToString()); - ASSERT_EQ(binary_string.IndexOf(FromString("value"), 0), 48); - ASSERT_EQ(binary_string.IndexOf(FromString("value"), 5), 48); - ASSERT_EQ(binary_string.IndexOf(FromString("vvalue"), 0), -1); - ASSERT_EQ(binary_string.IndexOf(FromString("!"), 0), -1); + InnerCheckEqual(combined, binary_string.ToString()); + InnerCheckEqual(binary_string.IndexOf(FromString("value"), 0), 48); + InnerCheckEqual(binary_string.IndexOf(FromString("value"), 5), 48); + InnerCheckEqual(binary_string.IndexOf(FromString("vvalue"), 0), -1); + InnerCheckEqual(binary_string.IndexOf(FromString("!"), 0), -1); } } TEST_F(BinaryStringTest, TestToUpperLowerCase) { auto pool = GetDefaultPool(); - ASSERT_EQ(FromString("我是中国人").ToLowerCase(pool.get()), FromString("我是中国人")); - ASSERT_EQ(FromString("我是中国人").ToUpperCase(pool.get()), FromString("我是中国人")); - ASSERT_EQ(BinaryString::EmptyUtf8().ToUpperCase(pool.get()), BinaryString::EmptyUtf8()); + InnerCheckEqual(FromString("我是中国人").ToLowerCase(pool.get()), FromString("我是中国人")); + InnerCheckEqual(FromString("我是中国人").ToUpperCase(pool.get()), FromString("我是中国人")); + InnerCheckEqual(BinaryString::EmptyUtf8().ToUpperCase(pool.get()), BinaryString::EmptyUtf8()); - ASSERT_EQ(FromString("aBcDeFg").ToLowerCase(pool.get()), FromString("abcdefg")); - ASSERT_EQ(FromString("aBcDeFg").ToUpperCase(pool.get()), FromString("ABCDEFG")); + InnerCheckEqual(FromString("aBcDeFg").ToLowerCase(pool.get()), FromString("abcdefg")); + InnerCheckEqual(FromString("aBcDeFg").ToUpperCase(pool.get()), FromString("ABCDEFG")); - ASSERT_EQ(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); - ASSERT_EQ(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); - ASSERT_EQ(BinaryString::EmptyUtf8().ToLowerCase(pool.get()), BinaryString::EmptyUtf8()); + InnerCheckEqual(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); + InnerCheckEqual(FromString("!@#$%^*").ToLowerCase(pool.get()), FromString("!@#$%^*")); + InnerCheckEqual(BinaryString::EmptyUtf8().ToLowerCase(pool.get()), BinaryString::EmptyUtf8()); } TEST_F(BinaryStringTest, TestEmptyString) { @@ -251,17 +266,17 @@ TEST_F(BinaryStringTest, TestEmptyString) { str3 = BinaryString::FromAddress(seg0, /*offset=*/5, /*num_bytes=*/0); } - ASSERT_TRUE(BinaryString::EmptyUtf8().CompareTo(str2) < 0); - ASSERT_TRUE(str2.CompareTo(BinaryString::EmptyUtf8()) > 0); + InnerCheck(BinaryString::EmptyUtf8().CompareTo(str2) < 0); + InnerCheck(str2.CompareTo(BinaryString::EmptyUtf8()) > 0); - ASSERT_EQ(BinaryString::EmptyUtf8().CompareTo(str3), 0); - ASSERT_EQ(str3.CompareTo(BinaryString::EmptyUtf8()), 0); + InnerCheckEqual(BinaryString::EmptyUtf8().CompareTo(str3), 0); + InnerCheckEqual(str3.CompareTo(BinaryString::EmptyUtf8()), 0); - ASSERT_FALSE(str2 == BinaryString::EmptyUtf8()); - ASSERT_FALSE(BinaryString::EmptyUtf8() == str2); + InnerCheckFalse(str2 == BinaryString::EmptyUtf8()); + InnerCheckFalse(BinaryString::EmptyUtf8() == str2); - ASSERT_EQ(str3, BinaryString::EmptyUtf8()); - ASSERT_EQ(BinaryString::EmptyUtf8(), str3); + InnerCheckEqual(str3, BinaryString::EmptyUtf8()); + InnerCheckEqual(BinaryString::EmptyUtf8(), str3); } TEST_F(BinaryStringTest, TestSkipWrongFirstByte) { @@ -276,7 +291,7 @@ TEST_F(BinaryStringTest, TestSkipWrongFirstByte) { std::shared_ptr c = Bytes::AllocateBytes(1, pool.get()); for (int32_t wrong_first_byte : wrong_first_bytes) { (*c)[0] = static_cast(wrong_first_byte); - ASSERT_EQ(1, BinaryString::FromBytes(c).NumChars()); + InnerCheckEqual(1, BinaryString::FromBytes(c).NumChars()); } } @@ -284,7 +299,7 @@ TEST_F(BinaryStringTest, TestFromBytes) { auto pool = GetDefaultPool(); std::string s = "hahahe"; std::shared_ptr bytes = Bytes::AllocateBytes(s, pool.get()); - ASSERT_TRUE(BinaryString::FromBytes(bytes, 0, 6) == BinaryString::FromString(s, pool.get())); + InnerCheck(BinaryString::FromBytes(bytes, 0, 6) == BinaryString::FromString(s, pool.get())); } TEST_F(BinaryStringTest, TestCopy) { @@ -293,8 +308,8 @@ TEST_F(BinaryStringTest, TestCopy) { std::shared_ptr bytes = Bytes::AllocateBytes(s, pool.get()); BinaryString binary_string = BinaryString::FromBytes(bytes, 0, 6); BinaryString copy_binary_string = binary_string.Copy(pool.get()); - ASSERT_EQ(binary_string, copy_binary_string); - ASSERT_EQ(copy_binary_string.ByteAt(2), 'h'); + InnerCheckEqual(binary_string, copy_binary_string); + InnerCheckEqual(copy_binary_string.ByteAt(2), 'h'); } TEST_F(BinaryStringTest, TestByteAt) { @@ -304,8 +319,8 @@ TEST_F(BinaryStringTest, TestByteAt) { MemorySegment seg = MemorySegment::Wrap(bytes); auto binary_string = BinaryString::FromAddress(seg, /*offset=*/2, /*num_bytes=*/combined.length() - 2); - ASSERT_EQ(binary_string.ByteAt(0), 'l'); - ASSERT_EQ(binary_string.ByteAt(5), 'r'); + InnerCheckEqual(binary_string.ByteAt(0), 'l'); + InnerCheckEqual(binary_string.ByteAt(5), 'r'); } TEST_F(BinaryStringTest, TestNumChars) { @@ -315,14 +330,14 @@ TEST_F(BinaryStringTest, TestNumChars) { MemorySegment seg = MemorySegment::Wrap(bytes); auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/5); - ASSERT_EQ(5, binary_string.NumChars()); + InnerCheckEqual(5, binary_string.NumChars()); } { auto bytes = std::make_shared("helloworld", pool.get()); MemorySegment seg = MemorySegment::Wrap(bytes); auto binary_string = BinaryString::FromAddress(seg, /*offset=*/0, /*num_bytes=*/10); - ASSERT_EQ(10, binary_string.NumChars()); + InnerCheckEqual(10, binary_string.NumChars()); } } @@ -339,8 +354,8 @@ TEST_F(BinaryStringTest, TestMatchAt) { MemorySegment seg2 = MemorySegment::Wrap(bytes2); auto binary_string2 = BinaryString::FromAddress(seg2, /*offset=*/0, /*num_bytes=*/2); - ASSERT_TRUE(binary_string1.MatchAt(binary_string2, /*pos=*/1)); - ASSERT_FALSE(binary_string1.MatchAt(binary_string2, /*pos=*/0)); + InnerCheck(binary_string1.MatchAt(binary_string2, /*pos=*/1)); + InnerCheckFalse(binary_string1.MatchAt(binary_string2, /*pos=*/0)); } { // abcdef @@ -353,8 +368,8 @@ TEST_F(BinaryStringTest, TestMatchAt) { MemorySegment seg2 = MemorySegment::Wrap(bytes2); auto binary_string2 = BinaryString::FromAddress(seg2, /*offset=*/0, /*num_bytes=*/2); - ASSERT_TRUE(binary_string1.MatchAt(binary_string2, /*pos=*/1)); - ASSERT_FALSE(binary_string1.MatchAt(binary_string2, /*pos=*/0)); + InnerCheck(binary_string1.MatchAt(binary_string2, /*pos=*/1)); + InnerCheckFalse(binary_string1.MatchAt(binary_string2, /*pos=*/0)); } } From 983f21ea1a5a9e52934048658f258c6983ece884 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 15:10:11 +0800 Subject: [PATCH 08/11] add test --- include/paimon/data/blob.h | 4 +- .../common/data/abstract_binary_writer.cpp | 4 + src/paimon/common/data/binary_array_test.cpp | 133 ++++++++++++++++++ src/paimon/common/data/blob.cpp | 5 +- src/paimon/common/data/blob_test.cpp | 42 +++++- .../data/columnar/columnar_row_test.cpp | 65 +++++++++ src/paimon/common/data/data_define.h | 14 +- src/paimon/common/data/data_define_test.cpp | 124 ++++++++++++++++ .../memory/memory_segment_utils_test.cpp | 43 ++++++ src/paimon/common/memory/memory_slice.cpp | 30 ---- src/paimon/common/memory/memory_slice.h | 10 -- .../common/memory/memory_slice_input.cpp | 8 +- src/paimon/common/memory/memory_slice_input.h | 5 +- src/paimon/common/sst/block_handle.cpp | 8 +- src/paimon/common/sst/block_handle.h | 3 +- src/paimon/common/sst/block_iterator.cpp | 8 +- src/paimon/common/sst/block_iterator.h | 4 +- src/paimon/common/sst/sst_file_reader.cpp | 6 +- 18 files changed, 438 insertions(+), 78 deletions(-) diff --git a/include/paimon/data/blob.h b/include/paimon/data/blob.h index b7d84630..9396bbb9 100644 --- a/include/paimon/data/blob.h +++ b/include/paimon/data/blob.h @@ -94,12 +94,10 @@ class PAIMON_EXPORT Blob { /// It automatically injects Paimon-specific metadata to identify the field as a BLOB. /// /// @param field_name The name of the Arrow field. - /// @param nullable Whether the field can contain null values (defaults to false). /// @param metadata A map of key-value metadata to be attached to the field. /// @return A result containing a unique pointer to the generated `ArrowSchema` or an error. static Result> ArrowField( - const std::string& field_name, bool nullable = false, - std::unordered_map metadata = {}); + const std::string& field_name, std::unordered_map metadata = {}); private: class Impl; diff --git a/src/paimon/common/data/abstract_binary_writer.cpp b/src/paimon/common/data/abstract_binary_writer.cpp index 6cf98bc8..f27f2812 100644 --- a/src/paimon/common/data/abstract_binary_writer.cpp +++ b/src/paimon/common/data/abstract_binary_writer.cpp @@ -163,6 +163,8 @@ void AbstractBinaryWriter::WriteSegmentsToVarLenPart(int32_t pos, if (segments.size() == 1) { segments[0].CopyTo(offset, &segment_, cursor_, size); } else { + assert(false); + // Now BinarySecyion only in single segment. WriteMultiSegmentsToVarLenPart(segments, offset, size); } SetOffsetAndSize(pos, cursor_, size); @@ -172,6 +174,8 @@ void AbstractBinaryWriter::WriteSegmentsToVarLenPart(int32_t pos, void AbstractBinaryWriter::WriteMultiSegmentsToVarLenPart( const std::vector& segments, int32_t offset, int32_t size) { + // Now BinarySecyion only in single segment. + assert(false); // Write the bytes to the variable length portion. int32_t need_copy = size; int32_t from_offset = offset; diff --git a/src/paimon/common/data/binary_array_test.cpp b/src/paimon/common/data/binary_array_test.cpp index 04bcecb7..8bee61af 100644 --- a/src/paimon/common/data/binary_array_test.cpp +++ b/src/paimon/common/data/binary_array_test.cpp @@ -183,6 +183,7 @@ TEST(BinaryArrayTest, TestSetAndGet) { } // timestamp { + // not compact (precision > 3) std::vector arr = {Timestamp(0, 0), Timestamp(12345, 1)}; BinaryArray array; BinaryArrayWriter writer = BinaryArrayWriter(&array, arr.size(), 8, pool.get()); @@ -193,6 +194,18 @@ TEST(BinaryArrayTest, TestSetAndGet) { ASSERT_EQ(arr[0], array.GetTimestamp(0, 9)); ASSERT_EQ(arr[1], array.GetTimestamp(1, 9)); } + { + // compact (precision <= 3) + std::vector arr = {Timestamp(0, 0), Timestamp(12345, 0)}; + BinaryArray array; + BinaryArrayWriter writer = BinaryArrayWriter(&array, arr.size(), 8, pool.get()); + for (size_t i = 0; i < arr.size(); i++) { + writer.WriteTimestamp(i, arr[i], 3); + } + writer.Complete(); + ASSERT_EQ(arr[0], array.GetTimestamp(0, 3)); + ASSERT_EQ(arr[1], array.GetTimestamp(1, 3)); + } // binary { std::vector arr; @@ -379,4 +392,124 @@ TEST(BinaryArrayTest, TestReset) { ASSERT_EQ(arr, array.ToLongArray().value()); } +TEST(BinaryArrayTest, TestGetElementSize) { + ASSERT_EQ(sizeof(bool), BinaryArrayWriter::GetElementSize(arrow::Type::type::BOOL)); + ASSERT_EQ(sizeof(int8_t), BinaryArrayWriter::GetElementSize(arrow::Type::type::INT8)); + ASSERT_EQ(sizeof(int16_t), BinaryArrayWriter::GetElementSize(arrow::Type::type::INT16)); + ASSERT_EQ(sizeof(int32_t), BinaryArrayWriter::GetElementSize(arrow::Type::type::INT32)); + ASSERT_EQ(sizeof(int32_t), BinaryArrayWriter::GetElementSize(arrow::Type::type::DATE32)); + ASSERT_EQ(sizeof(int64_t), BinaryArrayWriter::GetElementSize(arrow::Type::type::INT64)); + ASSERT_EQ(sizeof(float), BinaryArrayWriter::GetElementSize(arrow::Type::type::FLOAT)); + ASSERT_EQ(sizeof(double), BinaryArrayWriter::GetElementSize(arrow::Type::type::DOUBLE)); + // default cases: variable-length types use 8 bytes (offset + length) + ASSERT_EQ(8, BinaryArrayWriter::GetElementSize(arrow::Type::type::STRING)); + ASSERT_EQ(8, BinaryArrayWriter::GetElementSize(arrow::Type::type::BINARY)); + ASSERT_EQ(8, BinaryArrayWriter::GetElementSize(arrow::Type::type::TIMESTAMP)); + ASSERT_EQ(8, BinaryArrayWriter::GetElementSize(arrow::Type::type::DECIMAL128)); +} + +TEST(BinaryArrayTest, TestSetNullAtWithArrowType) { + auto pool = GetDefaultPool(); + + { + // BOOL + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(bool), pool.get()); + writer.WriteBoolean(0, true); + writer.SetNullAt(1, arrow::Type::type::BOOL); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_TRUE(array.GetBoolean(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // INT8 + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(int8_t), pool.get()); + writer.WriteByte(0, 42); + writer.SetNullAt(1, arrow::Type::type::INT8); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ(42, array.GetByte(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // INT16 + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(int16_t), pool.get()); + writer.WriteShort(0, 1000); + writer.SetNullAt(1, arrow::Type::type::INT16); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ(1000, array.GetShort(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // INT32 + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(int32_t), pool.get()); + writer.WriteInt(0, 100000); + writer.SetNullAt(1, arrow::Type::type::INT32); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ(100000, array.GetInt(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // DATE32 + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(int32_t), pool.get()); + writer.WriteInt(0, 19000); + writer.SetNullAt(1, arrow::Type::type::DATE32); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ(19000, array.GetDate(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // INT64 + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(int64_t), pool.get()); + writer.WriteLong(0, 123456789L); + writer.SetNullAt(1, arrow::Type::type::INT64); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ(123456789L, array.GetLong(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // FLOAT + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(float), pool.get()); + writer.WriteFloat(0, 3.14f); + writer.SetNullAt(1, arrow::Type::type::FLOAT); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_FLOAT_EQ(3.14f, array.GetFloat(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // DOUBLE + BinaryArray array; + BinaryArrayWriter writer(&array, 2, sizeof(double), pool.get()); + writer.WriteDouble(0, 2.718); + writer.SetNullAt(1, arrow::Type::type::DOUBLE); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_DOUBLE_EQ(2.718, array.GetDouble(0)); + ASSERT_TRUE(array.IsNullAt(1)); + } + { + // STRING (default path, uses 8-byte null) + BinaryArray array; + BinaryArrayWriter writer(&array, 2, 8, pool.get()); + writer.WriteString(0, BinaryString::FromString("hello", pool.get())); + writer.SetNullAt(1, arrow::Type::type::STRING); + writer.Complete(); + ASSERT_FALSE(array.IsNullAt(0)); + ASSERT_EQ("hello", std::string(array.GetStringView(0))); + ASSERT_TRUE(array.IsNullAt(1)); + } +} + } // namespace paimon::test diff --git a/src/paimon/common/data/blob.cpp b/src/paimon/common/data/blob.cpp index 0eebfda1..a6971ef9 100644 --- a/src/paimon/common/data/blob.cpp +++ b/src/paimon/common/data/blob.cpp @@ -106,9 +106,8 @@ Result> Blob::ToData(const std::shared_ptr& } Result> Blob::ArrowField( - const std::string& field_name, bool nullable, - std::unordered_map metadata) { - auto blob_field = BlobUtils::ToArrowField(field_name, nullable, metadata); + const std::string& field_name, std::unordered_map metadata) { + auto blob_field = BlobUtils::ToArrowField(field_name, /*nullable=*/false, metadata); auto field = std::make_unique<::ArrowSchema>(); PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportField(*blob_field, field.get())); return field; diff --git a/src/paimon/common/data/blob_test.cpp b/src/paimon/common/data/blob_test.cpp index f08ed7fb..a105ea1f 100644 --- a/src/paimon/common/data/blob_test.cpp +++ b/src/paimon/common/data/blob_test.cpp @@ -16,10 +16,11 @@ #include "paimon/data/blob.h" +#include "arrow/api.h" +#include "arrow/c/bridge.h" #include "gtest/gtest.h" #include "paimon/fs/local/local_file_system.h" #include "paimon/memory/memory_pool.h" -#include "paimon/status.h" #include "paimon/testing/utils/testharness.h" namespace paimon::test { @@ -142,4 +143,43 @@ TEST_F(BlobTest, TestNewInputStreamWithDynamicLength) { ASSERT_EQ("cdefghijklmn", buffer); } +TEST_F(BlobTest, TestArrowField) { + { + // basic: field name, non-nullable by default + ASSERT_OK_AND_ASSIGN(auto schema, Blob::ArrowField("my_blob")); + ASSERT_NE(schema, nullptr); + + // import back to arrow::Field to verify + auto field_result = arrow::ImportField(schema.get()); + ASSERT_TRUE(field_result.ok()); + auto field = field_result.ValueUnsafe(); + + ASSERT_EQ(field->name(), "my_blob"); + ASSERT_EQ(field->type()->id(), arrow::Type::LARGE_BINARY); + ASSERT_FALSE(field->nullable()); + ASSERT_TRUE(field->HasMetadata()); + auto extension_type = field->metadata()->Get("paimon.extension.type"); + ASSERT_TRUE(extension_type.ok()); + ASSERT_EQ(extension_type.ValueUnsafe(), "paimon.type.blob"); + } + { + // with custom metadata + std::unordered_map custom_metadata = { + {"custom_key", "custom_value"}}; + ASSERT_OK_AND_ASSIGN(auto schema, Blob::ArrowField("meta_blob", custom_metadata)); + auto field = arrow::ImportField(schema.get()).ValueUnsafe(); + ASSERT_EQ(field->name(), "meta_blob"); + ASSERT_FALSE(field->nullable()); + ASSERT_TRUE(field->HasMetadata()); + // blob extension metadata should be present + auto extension_type = field->metadata()->Get("paimon.extension.type"); + ASSERT_TRUE(extension_type.ok()); + ASSERT_EQ(extension_type.ValueUnsafe(), "paimon.type.blob"); + // custom metadata should also be present + auto custom_val = field->metadata()->Get("custom_key"); + ASSERT_TRUE(custom_val.ok()); + ASSERT_EQ(custom_val.ValueUnsafe(), "custom_value"); + } +} + } // namespace paimon::test diff --git a/src/paimon/common/data/columnar/columnar_row_test.cpp b/src/paimon/common/data/columnar/columnar_row_test.cpp index 3a965632..b2d9c8b2 100644 --- a/src/paimon/common/data/columnar/columnar_row_test.cpp +++ b/src/paimon/common/data/columnar/columnar_row_test.cpp @@ -386,4 +386,69 @@ TEST(ColumnarRowTest, TestDataLifeCycle) { ASSERT_EQ(result_row->GetLong(3), 5); } +TEST(ColumnarRowTest, TestColumnarRowRefGetBinary) { + auto pool = GetDefaultPool(); + std::shared_ptr target_type = arrow::struct_({ + arrow::field("f0", arrow::binary()), + arrow::field("f1", arrow::binary()), + }); + auto f0 = + arrow::ipc::internal::json::ArrayFromJSON(arrow::binary(), R"(["hello", "world", null])") + .ValueOrDie(); + auto f1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::binary(), R"(["abc", "", "xyz"])") + .ValueOrDie(); + auto data = arrow::StructArray::Make({f0, f1}, target_type->fields()).ValueOrDie(); + + auto ctx = std::make_shared(data->fields(), pool); + + { + ColumnarRowRef row(ctx, 0); + auto binary = row.GetBinary(0); + ASSERT_TRUE(binary); + ASSERT_EQ(std::string(binary->data(), binary->size()), "hello"); + + auto binary1 = row.GetBinary(1); + ASSERT_TRUE(binary1); + ASSERT_EQ(std::string(binary1->data(), binary1->size()), "abc"); + } + { + ColumnarRowRef row(ctx, 1); + auto binary = row.GetBinary(0); + ASSERT_TRUE(binary); + ASSERT_EQ(std::string(binary->data(), binary->size()), "world"); + + auto binary1 = row.GetBinary(1); + ASSERT_TRUE(binary1); + ASSERT_EQ(binary1->size(), 0); + } + { + ColumnarRowRef row(ctx, 2); + ASSERT_TRUE(row.IsNullAt(0)); + + auto binary1 = row.GetBinary(1); + ASSERT_TRUE(binary1); + ASSERT_EQ(std::string(binary1->data(), binary1->size()), "xyz"); + } +} + +TEST(ColumnarRowTest, TestColumnarRowRefToString) { + auto pool = GetDefaultPool(); + std::shared_ptr target_type = + arrow::struct_({arrow::field("f0", arrow::int32())}); + auto f0 = + arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), R"([1, 2, 3])").ValueOrDie(); + auto data = arrow::StructArray::Make({f0}, target_type->fields()).ValueOrDie(); + + auto ctx = std::make_shared(data->fields(), pool); + + { + ColumnarRowRef row(ctx, 0); + ASSERT_EQ(row.ToString(), "ColumnarRowRef, row_id 0"); + } + { + ColumnarRowRef row(ctx, 2); + ASSERT_EQ(row.ToString(), "ColumnarRowRef, row_id 2"); + } +} + } // namespace paimon::test diff --git a/src/paimon/common/data/data_define.h b/src/paimon/common/data/data_define.h index 28ae57e2..d9106571 100644 --- a/src/paimon/common/data/data_define.h +++ b/src/paimon/common/data/data_define.h @@ -128,18 +128,12 @@ class DataDefine { case arrow::Type::type::DOUBLE: return Literal(GetVariantValue(value)); case arrow::Type::type::STRING: { - auto binary_string_ptr = GetVariantPtr(value); - if (binary_string_ptr == nullptr) { - return Status::Invalid( - "VariantValueToLiteral failed, cannot get BinaryString from VariantType, " - "input value maybe string view"); - } - auto str = binary_string_ptr->ToString(); - return Literal(FieldType::STRING, str.data(), str.size()); + auto view = GetStringView(value); + return Literal(FieldType::STRING, view.data(), view.size()); } case arrow::Type::type::BINARY: { - auto bytes = GetVariantValue>(value); - return Literal(FieldType::BINARY, bytes->data(), bytes->size()); + auto view = GetStringView(value); + return Literal(FieldType::BINARY, view.data(), view.size()); } case arrow::Type::type::TIMESTAMP: return Literal(GetVariantValue(value)); diff --git a/src/paimon/common/data/data_define_test.cpp b/src/paimon/common/data/data_define_test.cpp index 8b59e850..dba4da9b 100644 --- a/src/paimon/common/data/data_define_test.cpp +++ b/src/paimon/common/data/data_define_test.cpp @@ -29,6 +29,7 @@ #include "paimon/data/timestamp.h" #include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" +#include "paimon/testing/utils/testharness.h" namespace paimon::test { @@ -191,4 +192,127 @@ TEST(DataDefineTest, GetStringView) { } } +TEST(DataDefineTest, VariantValueToLiteral) { + auto pool = GetDefaultPool(); + + { + // BOOL + VariantType value = true; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::BOOL)); + ASSERT_EQ(literal.GetValue(), true); + } + { + // INT8 + VariantType value = static_cast(42); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::INT8)); + ASSERT_EQ(literal.GetValue(), 42); + } + { + // INT16 + VariantType value = static_cast(1000); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::INT16)); + ASSERT_EQ(literal.GetValue(), 1000); + } + { + // INT32 + VariantType value = static_cast(100000); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::INT32)); + ASSERT_EQ(literal.GetValue(), 100000); + } + { + // INT64 + VariantType value = static_cast(123456789L); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::INT64)); + ASSERT_EQ(literal.GetValue(), 123456789L); + } + { + // FLOAT + VariantType value = 3.14f; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::FLOAT)); + ASSERT_FLOAT_EQ(literal.GetValue(), 3.14f); + } + { + // DOUBLE + VariantType value = 2.718; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::DOUBLE)); + ASSERT_DOUBLE_EQ(literal.GetValue(), 2.718); + } + { + // STRING from BinaryString + auto binary_str = BinaryString::FromString("hello", pool.get()); + VariantType value = binary_str; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::STRING)); + ASSERT_EQ(literal.GetValue(), "hello"); + } + { + // STRING from shared_ptr + std::shared_ptr bytes = Bytes::AllocateBytes("world", pool.get()); + VariantType value = bytes; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::STRING)); + ASSERT_EQ(literal.GetValue(), "world"); + } + { + // STRING from string_view + std::string original = "view_str"; + VariantType value = std::string_view(original.data(), original.size()); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::STRING)); + ASSERT_EQ(literal.GetValue(), "view_str"); + } + { + // BINARY from shared_ptr + std::shared_ptr bytes = Bytes::AllocateBytes("binary_data", pool.get()); + VariantType value = bytes; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::BINARY)); + ASSERT_EQ(literal.GetValue(), "binary_data"); + } + { + // BINARY from BinaryString + auto binary_str = BinaryString::FromString("bin_str", pool.get()); + VariantType value = binary_str; + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::BINARY)); + ASSERT_EQ(literal.GetValue(), "bin_str"); + } + { + // TIMESTAMP + Timestamp ts(12345, 1); + VariantType value = ts; + ASSERT_OK_AND_ASSIGN( + auto literal, DataDefine::VariantValueToLiteral(value, arrow::Type::type::TIMESTAMP)); + ASSERT_EQ(literal.GetValue(), ts); + } + { + // DECIMAL128 + Decimal decimal(20, 3, 123456); + VariantType value = decimal; + ASSERT_OK_AND_ASSIGN( + auto literal, DataDefine::VariantValueToLiteral(value, arrow::Type::type::DECIMAL128)); + ASSERT_EQ(literal.GetValue(), decimal); + } + { + // DATE32 + VariantType value = static_cast(19000); + ASSERT_OK_AND_ASSIGN(auto literal, + DataDefine::VariantValueToLiteral(value, arrow::Type::type::DATE32)); + ASSERT_EQ(literal.GetValue(), 19000); + } + { + // unsupported type + VariantType value = static_cast(0); + ASSERT_NOK_WITH_MSG(DataDefine::VariantValueToLiteral(value, arrow::Type::type::LIST), + "Not support arrow type"); + } +} + } // namespace paimon::test diff --git a/src/paimon/common/memory/memory_segment_utils_test.cpp b/src/paimon/common/memory/memory_segment_utils_test.cpp index 286d78f2..b6f6d877 100644 --- a/src/paimon/common/memory/memory_segment_utils_test.cpp +++ b/src/paimon/common/memory/memory_segment_utils_test.cpp @@ -160,6 +160,49 @@ TEST(MemorySegmentUtilsTest, TestSetAndUnSet) { ASSERT_FALSE(MemorySegmentUtils::BitGet(segs[0], /*base_offset=*/0, index)); } +TEST(MemorySegmentUtilsTest, TestBitSetAndUnSetSingleSegment) { + auto pool = GetDefaultPool(); + int32_t segment_size = 128; + std::vector segs = {MemorySegment::AllocateHeapMemory(segment_size, pool.get())}; + + // initially all bits should be 0 after allocation + for (int32_t i = 0; i < 16; i++) { + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, i)); + } + + // set bits at various indices and verify + MemorySegmentUtils::BitSet(&segs, /*base_offset=*/0, /*index=*/0); + ASSERT_TRUE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 0)); + + MemorySegmentUtils::BitSet(&segs, /*base_offset=*/0, /*index=*/7); + ASSERT_TRUE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 7)); + + MemorySegmentUtils::BitSet(&segs, /*base_offset=*/0, /*index=*/15); + ASSERT_TRUE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 15)); + + // unset and verify + MemorySegmentUtils::BitUnSet(&segs, /*base_offset=*/0, /*index=*/0); + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 0)); + + MemorySegmentUtils::BitUnSet(&segs, /*base_offset=*/0, /*index=*/7); + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 7)); + + // bit 15 should still be set + ASSERT_TRUE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 15)); + + MemorySegmentUtils::BitUnSet(&segs, /*base_offset=*/0, /*index=*/15); + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, /*base_offset=*/0, 15)); + + // test with non-zero base_offset + int32_t base_offset = 10; + MemorySegmentUtils::BitSet(&segs, base_offset, /*index=*/3); + ASSERT_TRUE(MemorySegmentUtils::BitGet(segs, base_offset, 3)); + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, base_offset, 2)); + + MemorySegmentUtils::BitUnSet(&segs, base_offset, /*index=*/3); + ASSERT_FALSE(MemorySegmentUtils::BitGet(segs, base_offset, 3)); +} + TEST(MemorySegmentUtilsTest, TestCopyMultiSegmentsFromBytes) { auto pool = GetDefaultPool(); std::shared_ptr bytes = Bytes::AllocateBytes("abcdef", pool.get()); diff --git a/src/paimon/common/memory/memory_slice.cpp b/src/paimon/common/memory/memory_slice.cpp index ad3ab2cd..9ef05480 100644 --- a/src/paimon/common/memory/memory_slice.cpp +++ b/src/paimon/common/memory/memory_slice.cpp @@ -82,39 +82,9 @@ std::shared_ptr MemorySlice::CopyBytes(MemoryPool* pool) { return bytes; } -bool MemorySlice::operator<(const MemorySlice& other) const { - return Compare(other) < 0; -} -bool MemorySlice::operator>(const MemorySlice& other) const { - return Compare(other) > 0; -} -bool MemorySlice::operator==(const MemorySlice& other) const { - return Compare(other) == 0; -} -bool MemorySlice::operator!=(const MemorySlice& other) const { - return !(*this == other); -} -bool MemorySlice::operator<=(const MemorySlice& other) const { - return Compare(other) <= 0; -} -bool MemorySlice::operator>=(const MemorySlice& other) const { - return Compare(other) >= 0; -} std::shared_ptr MemorySlice::ToInput() { auto self = shared_from_this(); return std::make_shared(self); } -int32_t MemorySlice::Compare(const MemorySlice& other) const { - int32_t len = std::min(length_, other.length_); - for (int32_t i = 0; i < len; ++i) { - auto byte1 = static_cast(segment_.Get(offset_ + i)); - auto byte2 = static_cast(other.segment_.Get(other.offset_ + i)); - if (byte1 != byte2) { - return static_cast(byte1) - static_cast(byte2); - } - } - return length_ - other.length_; -} - } // namespace paimon diff --git a/src/paimon/common/memory/memory_slice.h b/src/paimon/common/memory/memory_slice.h index e4196edb..d3474ad5 100644 --- a/src/paimon/common/memory/memory_slice.h +++ b/src/paimon/common/memory/memory_slice.h @@ -58,18 +58,8 @@ class PAIMON_EXPORT MemorySlice : public std::enable_shared_from_this CopyBytes(MemoryPool* pool); - bool operator<(const MemorySlice& other) const; - bool operator>(const MemorySlice& other) const; - bool operator==(const MemorySlice& other) const; - bool operator!=(const MemorySlice& other) const; - bool operator<=(const MemorySlice& other) const; - bool operator>=(const MemorySlice& other) const; - std::shared_ptr ToInput(); - private: - int32_t Compare(const MemorySlice& other) const; - private: MemorySegment segment_; int32_t offset_; diff --git a/src/paimon/common/memory/memory_slice_input.cpp b/src/paimon/common/memory/memory_slice_input.cpp index d78a6172..f08c195c 100644 --- a/src/paimon/common/memory/memory_slice_input.cpp +++ b/src/paimon/common/memory/memory_slice_input.cpp @@ -69,7 +69,7 @@ int64_t MemorySliceInput::ReadLong() { return v; } -int32_t MemorySliceInput::ReadVarLenInt() { +Result MemorySliceInput::ReadVarLenInt() { for (int offset = 0, result = 0; offset < 32; offset += 7) { int b = ReadUnsignedByte(); result |= (b & 0x7F) << offset; @@ -77,10 +77,10 @@ int32_t MemorySliceInput::ReadVarLenInt() { return result; } } - throw std::invalid_argument("Malformed integer."); + return Status::Invalid("Malformed integer."); } -int64_t MemorySliceInput::ReadVarLenLong() { +Result MemorySliceInput::ReadVarLenLong() { int64_t result = 0; for (int offset = 0; offset < 64; offset += 7) { int64_t b = ReadUnsignedByte(); @@ -89,7 +89,7 @@ int64_t MemorySliceInput::ReadVarLenLong() { return result; } } - throw std::invalid_argument("Malformed long."); + return Status::Invalid("Malformed long."); } void MemorySliceInput::SetOrder(ByteOrder order) { diff --git a/src/paimon/common/memory/memory_slice_input.h b/src/paimon/common/memory/memory_slice_input.h index e98e4155..7988f17f 100644 --- a/src/paimon/common/memory/memory_slice_input.h +++ b/src/paimon/common/memory/memory_slice_input.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "paimon/common/memory/memory_slice.h" @@ -48,8 +47,8 @@ class PAIMON_EXPORT MemorySliceInput { int8_t ReadUnsignedByte(); int32_t ReadInt(); int64_t ReadLong(); - int32_t ReadVarLenInt(); - int64_t ReadVarLenLong(); + Result ReadVarLenInt(); + Result ReadVarLenLong(); std::shared_ptr ReadSlice(int length); void SetOrder(ByteOrder order); diff --git a/src/paimon/common/sst/block_handle.cpp b/src/paimon/common/sst/block_handle.cpp index 73a9a78b..03f5c959 100644 --- a/src/paimon/common/sst/block_handle.cpp +++ b/src/paimon/common/sst/block_handle.cpp @@ -20,10 +20,10 @@ namespace paimon { -std::shared_ptr BlockHandle::ReadBlockHandle( - std::shared_ptr& input) { - int64_t offset = input->ReadVarLenLong(); - int size = input->ReadVarLenInt(); +Result> BlockHandle::ReadBlockHandle( + const std::shared_ptr& input) { + PAIMON_ASSIGN_OR_RAISE(int64_t offset, input->ReadVarLenLong()); + PAIMON_ASSIGN_OR_RAISE(int32_t size, input->ReadVarLenInt()); return std::make_shared(offset, size); } diff --git a/src/paimon/common/sst/block_handle.h b/src/paimon/common/sst/block_handle.h index 3e9e6eaf..76e9fd94 100644 --- a/src/paimon/common/sst/block_handle.h +++ b/src/paimon/common/sst/block_handle.h @@ -27,7 +27,8 @@ namespace paimon { class BlockHandle { public: - static std::shared_ptr ReadBlockHandle(std::shared_ptr& input); + static Result> ReadBlockHandle( + const std::shared_ptr& input); public: BlockHandle(int64_t offset, int32_t size); diff --git a/src/paimon/common/sst/block_iterator.cpp b/src/paimon/common/sst/block_iterator.cpp index 901e7c3f..04df2141 100644 --- a/src/paimon/common/sst/block_iterator.cpp +++ b/src/paimon/common/sst/block_iterator.cpp @@ -37,10 +37,10 @@ Result> BlockIterator::Next() { return ReadEntry(); } -std::unique_ptr BlockIterator::ReadEntry() { - int32_t key_length = input_->ReadVarLenInt(); +Result> BlockIterator::ReadEntry() { + PAIMON_ASSIGN_OR_RAISE(int32_t key_length, input_->ReadVarLenInt()); auto key = input_->ReadSlice(key_length); - int32_t value_length = input_->ReadVarLenInt(); + PAIMON_ASSIGN_OR_RAISE(int32_t value_length, input_->ReadVarLenInt()); auto value = input_->ReadSlice(value_length); return std::make_unique(key, value); } @@ -53,7 +53,7 @@ Result BlockIterator::SeekTo(const std::shared_ptr& target_ke int32_t mid = left + (right - left) / 2; PAIMON_RETURN_NOT_OK(input_->SetPosition(reader_->SeekTo(mid))); - auto mid_entry = ReadEntry(); + PAIMON_ASSIGN_OR_RAISE(auto mid_entry, ReadEntry()); PAIMON_ASSIGN_OR_RAISE(int32_t compare, reader_->Comparator()(mid_entry->key, target_key)); if (compare == 0) { diff --git a/src/paimon/common/sst/block_iterator.h b/src/paimon/common/sst/block_iterator.h index 73a89300..24455df7 100644 --- a/src/paimon/common/sst/block_iterator.h +++ b/src/paimon/common/sst/block_iterator.h @@ -16,8 +16,6 @@ #pragma once -#include - #include "paimon/common/memory/memory_slice_input.h" #include "paimon/common/sst/block_entry.h" @@ -32,7 +30,7 @@ class BlockIterator { Result> Next(); - std::unique_ptr ReadEntry(); + Result> ReadEntry(); Result SeekTo(const std::shared_ptr& target_key); diff --git a/src/paimon/common/sst/sst_file_reader.cpp b/src/paimon/common/sst/sst_file_reader.cpp index 1fa836d7..54a84742 100644 --- a/src/paimon/common/sst/sst_file_reader.cpp +++ b/src/paimon/common/sst/sst_file_reader.cpp @@ -112,8 +112,9 @@ Result> SstFileReader::GetNextBlock( PAIMON_ASSIGN_OR_RAISE(std::unique_ptr ret, index_iterator->Next()); auto& slice = ret->value; auto input = slice->ToInput(); + PAIMON_ASSIGN_OR_RAISE(auto block_handle, BlockHandle::ReadBlockHandle(input)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, - ReadBlock(BlockHandle::ReadBlockHandle(input), false)); + ReadBlock(std::move(block_handle), false)); return reader->Iterator(); } @@ -163,7 +164,8 @@ Result SstFileReader::DecompressBlock(const MemorySegment& compre } else { auto decompressor = factory->GetDecompressor(); auto input = MemorySlice::Wrap(compressed_data)->ToInput(); - auto output = MemorySegment::AllocateHeapMemory(input->ReadVarLenInt(), pool.get()); + PAIMON_ASSIGN_OR_RAISE(int32_t uncompressed_size, input->ReadVarLenInt()); + auto output = MemorySegment::AllocateHeapMemory(uncompressed_size, pool.get()); auto output_memory = output.GetHeapMemory(); PAIMON_ASSIGN_OR_RAISE( int uncompressed_length, From 532e61dabd9e1ca9823b5968c87a3a59c006a3a9 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 16:53:20 +0800 Subject: [PATCH 09/11] fix --- .../common/data/abstract_binary_writer.cpp | 51 ++++--------------- .../common/data/abstract_binary_writer.h | 7 +-- src/paimon/common/data/data_define.h | 10 ++-- 3 files changed, 18 insertions(+), 50 deletions(-) diff --git a/src/paimon/common/data/abstract_binary_writer.cpp b/src/paimon/common/data/abstract_binary_writer.cpp index f27f2812..60a54f35 100644 --- a/src/paimon/common/data/abstract_binary_writer.cpp +++ b/src/paimon/common/data/abstract_binary_writer.cpp @@ -16,7 +16,6 @@ #include "paimon/common/data/abstract_binary_writer.h" -#include #include #include #include @@ -52,7 +51,7 @@ void AbstractBinaryWriter::WriteString(int32_t pos, const BinaryString& input) { len); WriteBytesToFixLenPart(&segment_, GetFieldOffset(pos), *bytes, len); } else { - WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), len); + WriteSegmentToVarLenPart(pos, input.GetSegment(), input.GetOffset(), len); } } @@ -75,20 +74,19 @@ void AbstractBinaryWriter::WriteStringView(int32_t pos, const std::string_view& } void AbstractBinaryWriter::WriteRow(int32_t pos, const BinaryRow& input) { - return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), - input.GetSizeInBytes()); + return WriteSegmentToVarLenPart(pos, input.GetSegment(), input.GetOffset(), + input.GetSizeInBytes()); } void AbstractBinaryWriter::WriteArray(int32_t pos, const BinaryArray& input) { - return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), - input.GetSizeInBytes()); + return WriteSegmentToVarLenPart(pos, input.GetSegment(), input.GetOffset(), + input.GetSizeInBytes()); } void AbstractBinaryWriter::WriteMap(int32_t pos, const BinaryMap& input) { - return WriteSegmentsToVarLenPart(pos, {input.GetSegment()}, input.GetOffset(), - input.GetSizeInBytes()); + return WriteSegmentToVarLenPart(pos, input.GetSegment(), input.GetOffset(), + input.GetSizeInBytes()); } - void AbstractBinaryWriter::WriteDecimal(int32_t pos, const std::optional& value, int32_t precision) { assert(value == std::nullopt || precision == value.value().Precision()); @@ -152,48 +150,19 @@ void AbstractBinaryWriter::EnsureCapacity(int32_t needed_size) { } } -void AbstractBinaryWriter::WriteSegmentsToVarLenPart(int32_t pos, - const std::vector& segments, - int32_t offset, int32_t size) { +void AbstractBinaryWriter::WriteSegmentToVarLenPart(int32_t pos, const MemorySegment& segment, + int32_t offset, int32_t size) { const int32_t rounded_size = RoundNumberOfBytesToNearestWord(size); // grow the global buffer before writing data. EnsureCapacity(rounded_size); ZeroOutPaddingBytes(size); - if (segments.size() == 1) { - segments[0].CopyTo(offset, &segment_, cursor_, size); - } else { - assert(false); - // Now BinarySecyion only in single segment. - WriteMultiSegmentsToVarLenPart(segments, offset, size); - } + segment.CopyTo(offset, &segment_, cursor_, size); SetOffsetAndSize(pos, cursor_, size); // move the cursor forward. cursor_ += rounded_size; } -void AbstractBinaryWriter::WriteMultiSegmentsToVarLenPart( - const std::vector& segments, int32_t offset, int32_t size) { - // Now BinarySecyion only in single segment. - assert(false); - // Write the bytes to the variable length portion. - int32_t need_copy = size; - int32_t from_offset = offset; - int32_t to_offset = cursor_; - for (const auto& source_segment : segments) { - int32_t remain = source_segment.Size() - from_offset; - if (remain > 0) { - int32_t copy_size = std::min(remain, need_copy); - source_segment.CopyTo(from_offset, &segment_, to_offset, copy_size); - need_copy -= copy_size; - to_offset += copy_size; - from_offset = 0; - } else { - from_offset -= source_segment.Size(); - } - } -} - template void AbstractBinaryWriter::WriteBytesToVarLenPart(int32_t pos, const T& bytes, int32_t len) { const int32_t rounded_size = RoundNumberOfBytesToNearestWord(len); diff --git a/src/paimon/common/data/abstract_binary_writer.h b/src/paimon/common/data/abstract_binary_writer.h index 7f142cf2..d89b00eb 100644 --- a/src/paimon/common/data/abstract_binary_writer.h +++ b/src/paimon/common/data/abstract_binary_writer.h @@ -19,7 +19,6 @@ #include #include #include -#include #include "paimon/common/data/binary_writer.h" #include "paimon/common/memory/memory_segment.h" @@ -90,10 +89,8 @@ class AbstractBinaryWriter : public BinaryWriter { static void WriteBytesToFixLenPart(MemorySegment* segment, int32_t field_offset, const T& bytes, int32_t len); - void WriteSegmentsToVarLenPart(int32_t pos, const std::vector& segments, - int32_t offset, int32_t size); - void WriteMultiSegmentsToVarLenPart(const std::vector& segments, int32_t offset, - int32_t size); + void WriteSegmentToVarLenPart(int32_t pos, const MemorySegment& segment, int32_t offset, + int32_t size); protected: int32_t cursor_ = 0; diff --git a/src/paimon/common/data/data_define.h b/src/paimon/common/data/data_define.h index d9106571..bc505577 100644 --- a/src/paimon/common/data/data_define.h +++ b/src/paimon/common/data/data_define.h @@ -55,7 +55,7 @@ class DataDefine { return std::holds_alternative(value); } - /// always make sure value is T type + // @warning Always make sure value is T type template inline static T GetVariantValue(const VariantType& value) { const T* ptr = std::get_if(&value); @@ -69,16 +69,18 @@ class DataDefine { return std::get_if(&value); } - // always make sure value is string_view or std::shared_ptr or BinaryString + // @warning Always make sure value is string_view or std::shared_ptr or BinaryString inline static std::string_view GetStringView(const VariantType& value) { if (auto* view_ptr = GetVariantPtr(value)) { return *view_ptr; } else if (auto* bytes_ptr = GetVariantPtr>(value)) { const auto& bytes = *bytes_ptr; return std::string_view(bytes->data(), bytes->size()); + } else if (auto* binary_string_ptr = GetVariantPtr(value)) { + return binary_string_ptr->GetStringView(); } - auto binary_string = GetVariantValue(value); - return binary_string.GetStringView(); + assert(false); + return {}; } static std::string VariantValueToString(const VariantType& value) { From b18782a9cf5e14c4ad59c31901d00b496824610d Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 17:52:28 +0800 Subject: [PATCH 10/11] rm use_view in FieldsComparator --- ...key_value_in_memory_record_reader_test.cpp | 15 ++--- .../io/key_value_projection_reader_test.cpp | 7 +-- .../compact/interval_partition_test.cpp | 2 +- ..._changelog_merge_function_wrapper_test.cpp | 2 +- ...ookup_merge_tree_compact_rewriter_test.cpp | 3 +- .../merge_tree_compact_rewriter_test.cpp | 5 +- .../compact/partial_update_merge_function.cpp | 7 +-- .../compact/sort_merge_reader_test.cpp | 22 ++++---- src/paimon/core/mergetree/levels_test.cpp | 3 +- src/paimon/core/mergetree/lookup_levels.cpp | 5 +- .../core/mergetree/lookup_levels_test.cpp | 3 +- .../core/mergetree/merge_tree_writer_test.cpp | 9 ++- src/paimon/core/mergetree/sorted_run_test.cpp | 4 +- .../core/operation/file_store_write.cpp | 11 +--- .../operation/key_value_file_store_write.cpp | 8 +-- .../operation/key_value_file_store_write.h | 4 -- .../core/operation/merge_file_split_read.cpp | 5 +- .../table/source/split_generator_test.cpp | 2 +- src/paimon/core/table/source/table_scan.cpp | 3 +- src/paimon/core/utils/fields_comparator.cpp | 56 +++++-------------- src/paimon/core/utils/fields_comparator.h | 6 +- .../core/utils/fields_comparator_test.cpp | 12 ++-- .../core/utils/primary_key_table_utils.cpp | 2 +- src/paimon/testing/utils/key_value_checker.h | 8 +-- 24 files changed, 71 insertions(+), 133 deletions(-) diff --git a/src/paimon/core/io/key_value_in_memory_record_reader_test.cpp b/src/paimon/core/io/key_value_in_memory_record_reader_test.cpp index 1c7fe8f2..9f9b3c86 100644 --- a/src/paimon/core/io/key_value_in_memory_record_reader_test.cpp +++ b/src/paimon/core/io/key_value_in_memory_record_reader_test.cpp @@ -99,8 +99,7 @@ TEST_F(KeyValueInMemoryRecordReaderTest, TestSimple) { ASSERT_OK_AND_ASSIGN(std::shared_ptr key_comparator, FieldsComparator::Create({fields[0], fields[1]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); auto record_reader = std::make_unique( /*last_sequence_num=*/0, std::move(src_array), @@ -166,8 +165,7 @@ TEST_F(KeyValueInMemoryRecordReaderTest, TestUserDefinedSequenceFields) { ASSERT_OK_AND_ASSIGN(std::shared_ptr key_comparator, FieldsComparator::Create({fields[1], fields[0]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); auto record_reader = std::make_unique( /*last_sequence_num=*/0, std::move(src_array), std::vector(), @@ -202,8 +200,7 @@ TEST_F(KeyValueInMemoryRecordReaderTest, TestNonExistPK) { ASSERT_OK_AND_ASSIGN(std::shared_ptr key_comparator, FieldsComparator::Create({fields[0], fields[1]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); auto record_reader = std::make_unique( /*last_sequence_num=*/0, std::move(src_array), std::vector(), @@ -253,8 +250,7 @@ TEST_F(KeyValueInMemoryRecordReaderTest, TestStableSortAndMerge) { ASSERT_OK_AND_ASSIGN(std::shared_ptr key_comparator, FieldsComparator::Create({fields[0], fields[1]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); auto record_reader = std::make_unique( /*last_sequence_num=*/0, std::move(src_array), @@ -297,8 +293,7 @@ TEST_F(KeyValueInMemoryRecordReaderTest, TestVariantType) { ASSERT_OK_AND_ASSIGN(std::shared_ptr key_comparator, FieldsComparator::Create({fields[0]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); auto record_reader = std::make_unique( /*last_sequence_num=*/0, std::move(src_array), std::vector(), diff --git a/src/paimon/core/io/key_value_projection_reader_test.cpp b/src/paimon/core/io/key_value_projection_reader_test.cpp index afdf0316..141172c9 100644 --- a/src/paimon/core/io/key_value_projection_reader_test.cpp +++ b/src/paimon/core/io/key_value_projection_reader_test.cpp @@ -71,10 +71,9 @@ class KeyValueProjectionReaderTest : public testing::Test, const auto& key_field = key_schema->field(i); key_fields.emplace_back(i, key_field); } - EXPECT_OK_AND_ASSIGN( - std::shared_ptr user_key_comparator, - FieldsComparator::Create(key_fields, key_sort_fields, - /*is_ascending_order=*/true, /*use_view=*/true)); + EXPECT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, + FieldsComparator::Create(key_fields, key_sort_fields, + /*is_ascending_order=*/true)); auto mfunc = std::make_unique(/*ignore_delete=*/false); auto merge_function_wrapper = diff --git a/src/paimon/core/mergetree/compact/interval_partition_test.cpp b/src/paimon/core/mergetree/compact/interval_partition_test.cpp index 97b2c9fc..375c88b5 100644 --- a/src/paimon/core/mergetree/compact/interval_partition_test.cpp +++ b/src/paimon/core/mergetree/compact/interval_partition_test.cpp @@ -45,7 +45,7 @@ class IntervalPartitionTest : public testing::Test { void SetUp() override { ASSERT_OK_AND_ASSIGN(comparator_, FieldsComparator::Create( {DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); pool_ = GetDefaultPool(); } diff --git a/src/paimon/core/mergetree/compact/lookup_changelog_merge_function_wrapper_test.cpp b/src/paimon/core/mergetree/compact/lookup_changelog_merge_function_wrapper_test.cpp index f9c4741e..cd2ebe59 100644 --- a/src/paimon/core/mergetree/compact/lookup_changelog_merge_function_wrapper_test.cpp +++ b/src/paimon/core/mergetree/compact/lookup_changelog_merge_function_wrapper_test.cpp @@ -190,7 +190,7 @@ TEST(LookupChangelogMergeFunctionWrapperTest, TestCreateSequenceComparator) { ASSERT_OK_AND_ASSIGN( std::shared_ptr user_defined_seq_comparator, FieldsComparator::Create({DataField(1, arrow::field("value", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); auto cmp = LookupChangelogMergeFunctionWrapper::CreateSequenceComparator( user_defined_seq_comparator); KeyValue kv1(RowKind::Insert(), /*sequence_number=*/2, /*level=*/0, /*key=*/ diff --git a/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp b/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp index 57ae96da..34c4be1e 100644 --- a/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp +++ b/src/paimon/core/mergetree/compact/lookup_merge_tree_compact_rewriter_test.cpp @@ -118,8 +118,7 @@ class LookupMergeTreeCompactRewriterTest : public testing::Test { std::vector key_fields = {DataField(0, key_schema_->field(0))}; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, FieldsComparator::Create(key_fields, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); return key_comparator; } diff --git a/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp b/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp index 9d955d11..548aaa40 100644 --- a/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp +++ b/src/paimon/core/mergetree/compact/merge_tree_compact_rewriter_test.cpp @@ -69,9 +69,8 @@ class MergeTreeCompactRewriterTest : public testing::Test { PAIMON_ASSIGN_OR_RAISE(auto pk_fields, table_schema->GetFields(table_schema->TrimmedPrimaryKeys().value())); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr key_comparator, - FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/true)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, + FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true)); IntervalPartition interval_partition(metas, key_comparator); return interval_partition.Partition(); } diff --git a/src/paimon/core/mergetree/compact/partial_update_merge_function.cpp b/src/paimon/core/mergetree/compact/partial_update_merge_function.cpp index 70f9439e..50cf11af 100644 --- a/src/paimon/core/mergetree/compact/partial_update_merge_function.cpp +++ b/src/paimon/core/mergetree/compact/partial_update_merge_function.cpp @@ -209,10 +209,9 @@ PartialUpdateMergeFunction::CreateFieldSeqComparator( } sort_fields.push_back(field_idx); } - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr cmp, - FieldsComparator::Create(value_data_fields, sort_fields, - /*is_ascending_order=*/true, /*use_view=*/true)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr cmp, + FieldsComparator::Create(value_data_fields, sort_fields, + /*is_ascending_order=*/true)); field_seq_group_comparator_map[seq_group] = cmp; field_seq_comparators[i] = cmp; } diff --git a/src/paimon/core/mergetree/compact/sort_merge_reader_test.cpp b/src/paimon/core/mergetree/compact/sort_merge_reader_test.cpp index 98d72c88..d70f864f 100644 --- a/src/paimon/core/mergetree/compact/sort_merge_reader_test.cpp +++ b/src/paimon/core/mergetree/compact/sort_merge_reader_test.cpp @@ -167,7 +167,7 @@ TEST_F(SortMergeReaderTest, TestSimpleWithTwoSameKeys) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {1, 1, 1}, {{2}, {3}, {5}}, {{2, 30}, {3, 30}, {5, 50}}, pool_); CheckSortMergeResult( @@ -213,7 +213,7 @@ TEST_F(SortMergeReaderTest, TestSimpleWithThreeSameKeys) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues({2, 1}, {{2}, {5}}, {{2, 30}, {5, 50}}, pool_); CheckSortMergeResult( @@ -258,7 +258,7 @@ TEST_F(SortMergeReaderTest, TestSimpleWithThreeSameKeys2) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues({2, 1}, {{2}, {5}}, {{2, 30}, {5, 50}}, pool_); CheckSortMergeResult( @@ -303,7 +303,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn2Ways) { ASSERT_OK_AND_ASSIGN( std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2], data_fields[3]}, std::vector({0, 1}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {1, 0, 0, 2, 2, 2}, {{1, 1}, {1, 2}, {1, 3}, {2, 2}, {2, 3}, {2, 5}}, {{1, 1, 14, 24, 34}, @@ -355,7 +355,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn3Ways) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues({1, 2, 2, 4, 4}, {{1}, {2}, {3}, {4}, {5}}, {{1, 17}, {2, 18}, {3, 15}, {4, 19}, {5, 20}}, pool_); @@ -391,7 +391,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn2WaysWithEmptyArray) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {0, 0, 1, 2}, {{1}, {2}, {3}, {4}}, {{1, 10}, {2, 11}, {3, 12}, {4, 13}}, pool_); @@ -430,7 +430,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn2WaysWithNoOverlap) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {0, 0, 1, 2, 1, 2, 3, 2}, {{1}, {2}, {3}, {4}, {102}, {103}, {104}, {105}}, {{1, 10}, {2, 11}, {3, 12}, {4, 13}, {102, 14}, {103, 15}, {104, 16}, {105, 17}}, pool_); @@ -469,7 +469,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn2WaysWithFullOverlap) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {1, 2, 3, 3}, {{1}, {2}, {3}, {4}}, {{1, 14}, {2, 15}, {3, 16}, {4, 17}}, pool_); @@ -510,7 +510,7 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn2WaysWithPartialOverlap) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2]}, std::vector({0}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {1, 2, 3, 3, 0, 0}, {{1}, {2}, {3}, {4}, {5}, {6}}, {{1, 14}, {2, 15}, {3, 16}, {4, 17}, {5, 18}, {6, 19}}, pool_); @@ -566,12 +566,12 @@ TEST_F(SortMergeReaderTest, TestSortMergeIn3WaysWithUserDefinedSeq) { ASSERT_OK_AND_ASSIGN( std::shared_ptr user_key_comparator, FieldsComparator::Create({data_fields[2], data_fields[3]}, std::vector({0, 1}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); ASSERT_OK_AND_ASSIGN(std::shared_ptr user_defined_seq_comparator, FieldsComparator::Create({data_fields[2], data_fields[3], data_fields[4], data_fields[5], data_fields[6]}, std::vector({2, 3}), - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); std::vector expected = KeyValueChecker::GenerateKeyValues( {2, 1, 0, 2, 2, 3}, {{1, 1}, {1, 2}, {1, 3}, {2, 2}, {2, 3}, {2, 5}}, {{1, 1, 14, 24, 34}, diff --git a/src/paimon/core/mergetree/levels_test.cpp b/src/paimon/core/mergetree/levels_test.cpp index b3debe04..0d12ff14 100644 --- a/src/paimon/core/mergetree/levels_test.cpp +++ b/src/paimon/core/mergetree/levels_test.cpp @@ -50,8 +50,7 @@ class LevelsTest : public testing::Test { std::vector data_fields; data_fields.emplace_back(/*id=*/0, arrow::field("f0", arrow::int32())); EXPECT_OK_AND_ASSIGN(auto cmp, - FieldsComparator::Create(data_fields, /*is_ascending_order=*/true, - /*use_view=*/true)); + FieldsComparator::Create(data_fields, /*is_ascending_order=*/true)); return cmp; } diff --git a/src/paimon/core/mergetree/lookup_levels.cpp b/src/paimon/core/mergetree/lookup_levels.cpp index 0e82b21d..81de8c1c 100644 --- a/src/paimon/core/mergetree/lookup_levels.cpp +++ b/src/paimon/core/mergetree/lookup_levels.cpp @@ -43,9 +43,8 @@ Result>> LookupLevels::Create( auto pk_schema = DataField::ConvertDataFieldsToArrowSchema(pk_fields); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr key_serializer, RowCompactedSerializer::Create(pk_schema, pool)); - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr key_comparator, - FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true, /*use_view=*/true)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr key_comparator, + FieldsComparator::Create(pk_fields, /*is_ascending_order=*/true)); PAIMON_ASSIGN_OR_RAISE(std::vector partition_fields, table_schema->GetFields(table_schema->PartitionKeys())); diff --git a/src/paimon/core/mergetree/lookup_levels_test.cpp b/src/paimon/core/mergetree/lookup_levels_test.cpp index 98bdcfae..1a36061b 100644 --- a/src/paimon/core/mergetree/lookup_levels_test.cpp +++ b/src/paimon/core/mergetree/lookup_levels_test.cpp @@ -102,8 +102,7 @@ class LookupLevelsTest : public testing::Test { std::vector key_fields = {DataField(0, key_schema_->field(0))}; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, FieldsComparator::Create(key_fields, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); return key_comparator; } diff --git a/src/paimon/core/mergetree/merge_tree_writer_test.cpp b/src/paimon/core/mergetree/merge_tree_writer_test.cpp index 274687cb..20b7853d 100644 --- a/src/paimon/core/mergetree/merge_tree_writer_test.cpp +++ b/src/paimon/core/mergetree/merge_tree_writer_test.cpp @@ -74,9 +74,9 @@ class MergeTreeWriterTest : public ::testing::Test { value_schema_ = DataField::ConvertDataFieldsToArrowSchema(value_fields_); value_type_ = DataField::ConvertDataFieldsToArrowStructType(value_fields_); primary_keys_ = {"f0"}; - ASSERT_OK_AND_ASSIGN(key_comparator_, FieldsComparator::Create({value_fields_[0]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + ASSERT_OK_AND_ASSIGN(key_comparator_, + FieldsComparator::Create({value_fields_[0]}, + /*is_ascending_order=*/true)); std::vector write_fields = {SpecialFields::SequenceNumber(), SpecialFields::ValueKind()}; write_fields.insert(write_fields.end(), value_fields_.begin(), value_fields_.end()); @@ -293,8 +293,7 @@ TEST_F(MergeTreeWriterTest, TestWriteWithDeleteRow) { ASSERT_OK_AND_ASSIGN(std::shared_ptr user_defined_seq_comparator, FieldsComparator::Create({value_fields_[1]}, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); assert(user_defined_seq_comparator); auto merge_writer = std::make_shared( /*last_sequence_number=*/9, primary_keys_, path_factory, key_comparator_, diff --git a/src/paimon/core/mergetree/sorted_run_test.cpp b/src/paimon/core/mergetree/sorted_run_test.cpp index a014eb7e..39913a0a 100644 --- a/src/paimon/core/mergetree/sorted_run_test.cpp +++ b/src/paimon/core/mergetree/sorted_run_test.cpp @@ -59,7 +59,7 @@ TEST_F(SortedRunTest, TestSortedRunIsValid) { ASSERT_OK_AND_ASSIGN( std::shared_ptr comparator, FieldsComparator::Create({DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); // m1 [10, 20] auto m1 = CreateDataFileMeta(10, 20); @@ -91,7 +91,7 @@ TEST_F(SortedRunTest, TestFromUnsorted) { ASSERT_OK_AND_ASSIGN( std::shared_ptr comparator, FieldsComparator::Create({DataField(0, arrow::field("test", arrow::int32()))}, - /*is_ascending_order=*/true, /*use_view=*/true)); + /*is_ascending_order=*/true)); // m1 [10, 20] auto m1 = CreateDataFileMeta(10, 20); diff --git a/src/paimon/core/operation/file_store_write.cpp b/src/paimon/core/operation/file_store_write.cpp index 9d89f947..5e7916fd 100644 --- a/src/paimon/core/operation/file_store_write.cpp +++ b/src/paimon/core/operation/file_store_write.cpp @@ -167,16 +167,9 @@ Result> FileStoreWrite::Create(std::unique_ptrTrimmedPrimaryKeys()); PAIMON_ASSIGN_OR_RAISE(std::vector trimmed_primary_key_fields, schema->GetFields(trimmed_primary_keys)); - // use_view=true: for in-memory Arrow row comparisons in MergeTreeWriter PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator, FieldsComparator::Create(trimmed_primary_key_fields, - options.SequenceFieldSortOrderIsAscending(), - /*use_view=*/true)); - // use_view=false: for BinaryRow min_key/max_key in DataFileMeta (CompactManager/Levels) - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr key_comparator_for_compact, - FieldsComparator::Create(trimmed_primary_key_fields, - options.SequenceFieldSortOrderIsAscending(), - /*use_view=*/false)); + options.SequenceFieldSortOrderIsAscending())); auto primary_keys = schema->PrimaryKeys(); PAIMON_ASSIGN_OR_RAISE( std::unique_ptr merge_function, @@ -209,7 +202,7 @@ Result> FileStoreWrite::Create(std::unique_ptr( file_store_path_factory, snapshot_manager, schema_manager, ctx->GetCommitUser(), ctx->GetRootPath(), schema, arrow_schema, partition_schema, dv_maintainer_factory, - ctx->GetIOManager(), key_comparator, key_comparator_for_compact, + ctx->GetIOManager(), key_comparator, sequence_fields_comparator, merge_function_wrapper, options, ignore_previous_files, ctx->IsStreamingMode(), ctx->IgnoreNumBucketCheck(), ctx->GetExecutor(), ctx->GetMemoryPool()); diff --git a/src/paimon/core/operation/key_value_file_store_write.cpp b/src/paimon/core/operation/key_value_file_store_write.cpp index 65de7d77..6ff25cba 100644 --- a/src/paimon/core/operation/key_value_file_store_write.cpp +++ b/src/paimon/core/operation/key_value_file_store_write.cpp @@ -72,8 +72,7 @@ KeyValueFileStoreWrite::KeyValueFileStoreWrite( const std::shared_ptr& partition_schema, const std::shared_ptr& dv_maintainer_factory, const std::shared_ptr& io_manager, - const std::shared_ptr& key_comparator, - const std::shared_ptr& key_comparator_for_compact, + const std::shared_ptr& key_comparator, const std::shared_ptr& user_defined_seq_comparator, const std::shared_ptr>& merge_function_wrapper, const CoreOptions& options, bool ignore_previous_files, bool is_streaming_mode, @@ -85,7 +84,6 @@ KeyValueFileStoreWrite::KeyValueFileStoreWrite( ignore_previous_files, is_streaming_mode, ignore_num_bucket_check, executor, pool), key_comparator_(key_comparator), - key_comparator_for_compact_(key_comparator_for_compact), user_defined_seq_comparator_(user_defined_seq_comparator), merge_function_wrapper_(merge_function_wrapper), logger_(Logger::GetLogger("KeyValueFileStoreWrite")) {} @@ -121,7 +119,7 @@ Result> KeyValueFileStoreWrite::CreateWriter( table_schema_->TrimmedPrimaryKeys()); PAIMON_ASSIGN_OR_RAISE( std::shared_ptr levels, - Levels::Create(key_comparator_for_compact_, restore_data_files, options_.GetNumLevels())); + Levels::Create(key_comparator_, restore_data_files, options_.GetNumLevels())); auto compact_strategy = CreateCompactStrategy(options_); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr compact_manager, CreateCompactManager(partition, bucket, compact_strategy, @@ -177,7 +175,7 @@ Result> KeyValueFileStoreWrite::CreateCompactMan compaction_metrics_ ? compaction_metrics_->CreateReporter(partition, bucket) : nullptr; return std::make_shared( - levels, compact_strategy, key_comparator_for_compact_, + levels, compact_strategy, key_comparator_, options_.GetCompactionFileSize(/*has_primary_key=*/true), options_.GetNumSortedRunsStopTrigger(), rewriter, reporter, dv_maintainer, /*lazy_gen_deletion_file=*/false, options_.GetLookupStrategy().need_lookup, diff --git a/src/paimon/core/operation/key_value_file_store_write.h b/src/paimon/core/operation/key_value_file_store_write.h index 4a7cc2dc..dc556985 100644 --- a/src/paimon/core/operation/key_value_file_store_write.h +++ b/src/paimon/core/operation/key_value_file_store_write.h @@ -70,7 +70,6 @@ class KeyValueFileStoreWrite : public AbstractFileStoreWrite { const std::shared_ptr& dv_maintainer_factory, const std::shared_ptr& io_manager, const std::shared_ptr& key_comparator, - const std::shared_ptr& key_comparator_for_compact, const std::shared_ptr& user_defined_seq_comparator, const std::shared_ptr>& merge_function_wrapper, const CoreOptions& options, bool ignore_previous_files, bool is_streaming_mode, @@ -145,9 +144,6 @@ class KeyValueFileStoreWrite : public AbstractFileStoreWrite { private: std::shared_ptr key_comparator_; - // key_comparator_for_compact_ uses use_view=false, safe to compare BinaryRow - // min_key/max_key stored in DataFileMeta (not backed by a live Arrow buffer). - std::shared_ptr key_comparator_for_compact_; std::shared_ptr user_defined_seq_comparator_; std::shared_ptr> merge_function_wrapper_; std::unique_ptr logger_; diff --git a/src/paimon/core/operation/merge_file_split_read.cpp b/src/paimon/core/operation/merge_file_split_read.cpp index c3993eac..20f02da3 100644 --- a/src/paimon/core/operation/merge_file_split_read.cpp +++ b/src/paimon/core/operation/merge_file_split_read.cpp @@ -319,9 +319,8 @@ Status MergeFileSplitRead::GenerateKeyValueReadSchema( // 6. complete key fields to all trimmed primary key key_fields.clear(); PAIMON_ASSIGN_OR_RAISE(key_fields, table_schema.GetFields(trimmed_key_fields)); - PAIMON_ASSIGN_OR_RAISE( - *key_comparator, FieldsComparator::Create(key_fields, - /*is_ascending_order=*/true, /*use_view=*/true)); + PAIMON_ASSIGN_OR_RAISE(*key_comparator, FieldsComparator::Create(key_fields, + /*is_ascending_order=*/true)); // 7. construct actual read fields: special + key + non-key value std::vector read_fields; std::vector special_fields( diff --git a/src/paimon/core/table/source/split_generator_test.cpp b/src/paimon/core/table/source/split_generator_test.cpp index ed9faec7..4b40c363 100644 --- a/src/paimon/core/table/source/split_generator_test.cpp +++ b/src/paimon/core/table/source/split_generator_test.cpp @@ -52,7 +52,7 @@ class SplitGeneratorTest : public testing::Test { pool_ = GetDefaultPool(); key_comparator_ = FieldsComparator::Create({DataField(0, arrow::field("f0", arrow::int32(), false))}, - /*is_ascending_order=*/true, /*use_view=*/true) + /*is_ascending_order=*/true) .value(); } void TearDown() override {} diff --git a/src/paimon/core/table/source/table_scan.cpp b/src/paimon/core/table/source/table_scan.cpp index d6982aec..d80b1488 100644 --- a/src/paimon/core/table/source/table_scan.cpp +++ b/src/paimon/core/table/source/table_scan.cpp @@ -127,8 +127,7 @@ class TableScanImpl { table_schema->GetFields(trimmed_primary_keys)); PAIMON_ASSIGN_OR_RAISE( std::shared_ptr key_comparator, - FieldsComparator::Create(trimmed_pk_fields, /*is_ascending_order=*/true, - /*use_view=*/true)); + FieldsComparator::Create(trimmed_pk_fields, /*is_ascending_order=*/true)); return std::make_unique( source_split_target_size, source_split_open_file_cost, core_options.DeletionVectorsEnabled(), core_options.GetMergeEngine(), diff --git a/src/paimon/core/utils/fields_comparator.cpp b/src/paimon/core/utils/fields_comparator.cpp index 841c5769..164822e0 100644 --- a/src/paimon/core/utils/fields_comparator.cpp +++ b/src/paimon/core/utils/fields_comparator.cpp @@ -32,24 +32,23 @@ namespace paimon { Result> FieldsComparator::Create( - const std::vector& input_data_field, bool is_ascending_order, bool use_view) { + const std::vector& input_data_field, bool is_ascending_order) { std::vector sort_fields; sort_fields.reserve(input_data_field.size()); for (int32_t i = 0; i < static_cast(input_data_field.size()); i++) { sort_fields.push_back(i); } - return Create(input_data_field, sort_fields, is_ascending_order, use_view); + return Create(input_data_field, sort_fields, is_ascending_order); } Result> FieldsComparator::Create( const std::vector& input_data_field, const std::vector& sort_fields, - bool is_ascending_order, bool use_view) { + bool is_ascending_order) { std::vector comparators; comparators.reserve(sort_fields.size()); for (const auto& sort_field_idx : sort_fields) { const auto& type = input_data_field[sort_field_idx].Type(); - PAIMON_ASSIGN_OR_RAISE(FieldComparatorFunc cmp, - CompareField(sort_field_idx, type, use_view)); + PAIMON_ASSIGN_OR_RAISE(FieldComparatorFunc cmp, CompareField(sort_field_idx, type)); comparators.emplace_back(cmp); } return std::unique_ptr( @@ -79,7 +78,7 @@ int32_t FieldsComparator::CompareTo(const InternalRow& lhs, const InternalRow& r } Result FieldsComparator::CompareField( - int32_t field_idx, const std::shared_ptr& input_type, bool use_view) { + int32_t field_idx, const std::shared_ptr& input_type) { arrow::Type::type type = input_type->id(); switch (type) { case arrow::Type::type::BOOL: @@ -142,44 +141,15 @@ Result FieldsComparator::CompareField( double rvalue = rhs.GetDouble(field_idx); return lvalue == rvalue ? 0 : (lvalue < rvalue ? -1 : 1); }); - case arrow::Type::type::STRING: { - if (use_view) { - return FieldsComparator::FieldComparatorFunc( - [field_idx](const InternalRow& lhs, const InternalRow& rhs) -> int32_t { - auto lvalue = lhs.GetStringView(field_idx); - auto rvalue = rhs.GetStringView(field_idx); - int32_t cmp = lvalue.compare(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } else { - return FieldsComparator::FieldComparatorFunc( - [field_idx](const InternalRow& lhs, const InternalRow& rhs) -> int32_t { - auto lvalue = lhs.GetString(field_idx); - auto rvalue = rhs.GetString(field_idx); - int32_t cmp = lvalue.CompareTo(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } - } + case arrow::Type::type::STRING: case arrow::Type::type::BINARY: { - // TODO(xinyu.lxy): may use 64byte compare - if (use_view) { - return FieldsComparator::FieldComparatorFunc( - [field_idx](const InternalRow& lhs, const InternalRow& rhs) -> int32_t { - auto lvalue = lhs.GetStringView(field_idx); - auto rvalue = rhs.GetStringView(field_idx); - int32_t cmp = lvalue.compare(rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } else { - return FieldsComparator::FieldComparatorFunc( - [field_idx](const InternalRow& lhs, const InternalRow& rhs) -> int32_t { - auto lvalue = lhs.GetBinary(field_idx); - auto rvalue = rhs.GetBinary(field_idx); - int32_t cmp = lvalue->compare(*rvalue); - return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); - }); - } + return FieldsComparator::FieldComparatorFunc( + [field_idx](const InternalRow& lhs, const InternalRow& rhs) -> int32_t { + auto lvalue = lhs.GetStringView(field_idx); + auto rvalue = rhs.GetStringView(field_idx); + int32_t cmp = lvalue.compare(rvalue); + return cmp == 0 ? 0 : (cmp > 0 ? 1 : -1); + }); } case arrow::Type::type::TIMESTAMP: { auto timestamp_type = diff --git a/src/paimon/core/utils/fields_comparator.h b/src/paimon/core/utils/fields_comparator.h index 2fd2184a..052a8285 100644 --- a/src/paimon/core/utils/fields_comparator.h +++ b/src/paimon/core/utils/fields_comparator.h @@ -39,11 +39,11 @@ class DataField; class FieldsComparator { public: static Result> Create( - const std::vector& input_data_field, bool is_ascending_order, bool use_view); + const std::vector& input_data_field, bool is_ascending_order); static Result> Create( const std::vector& input_data_field, const std::vector& sort_fields, - bool is_ascending_order, bool use_view); + bool is_ascending_order); int32_t CompareTo(const InternalRow& lhs, const InternalRow& rhs) const; @@ -97,7 +97,7 @@ class FieldsComparator { } static Result CompareField( - int32_t field_idx, const std::shared_ptr& input_type, bool use_view); + int32_t field_idx, const std::shared_ptr& input_type); private: bool is_ascending_order_; diff --git a/src/paimon/core/utils/fields_comparator_test.cpp b/src/paimon/core/utils/fields_comparator_test.cpp index ac264f35..705911eb 100644 --- a/src/paimon/core/utils/fields_comparator_test.cpp +++ b/src/paimon/core/utils/fields_comparator_test.cpp @@ -51,17 +51,16 @@ class FieldsComparatorTest : public ::testing::Test { data_fields.emplace_back(/*id=*/0, arrow::field("fake_name", type)); } ASSERT_OK_AND_ASSIGN(auto comp1, FieldsComparator::Create(data_fields, sort_fields, - /*is_ascending_order=*/true, - /*use_view=*/true)); + /*is_ascending_order=*/true)); ASSERT_EQ(-1, comp1->CompareTo(row1, row2)); ASSERT_EQ(1, comp1->CompareTo(row2, row1)); ASSERT_EQ(0, comp1->CompareTo(row1, row1)); ASSERT_EQ(0, comp1->CompareTo(row2, row2)); if (!has_null) { - ASSERT_OK_AND_ASSIGN(auto comp2, FieldsComparator::Create(data_fields, sort_fields, - /*is_ascending_order=*/false, - /*use_view=*/true)); + ASSERT_OK_AND_ASSIGN(auto comp2, + FieldsComparator::Create(data_fields, sort_fields, + /*is_ascending_order=*/false)); ASSERT_EQ(1, comp2->CompareTo(row1, row2)); ASSERT_EQ(-1, comp2->CompareTo(row2, row1)); ASSERT_EQ(0, comp2->CompareTo(row1, row1)); @@ -322,8 +321,7 @@ TEST_F(FieldsComparatorTest, TestInvalidType) { auto map_type = arrow::map(arrow::int8(), arrow::int16()); ASSERT_NOK_WITH_MSG(FieldsComparator::Create({DataField(0, arrow::field("f0", arrow::int32())), DataField(1, arrow::field("f1", map_type))}, - /*is_ascending_order=*/true, - /*use_view=*/true), + /*is_ascending_order=*/true), "Do not support comparing map type in idx 1"); } diff --git a/src/paimon/core/utils/primary_key_table_utils.cpp b/src/paimon/core/utils/primary_key_table_utils.cpp index 2dc602c4..05abf21d 100644 --- a/src/paimon/core/utils/primary_key_table_utils.cpp +++ b/src/paimon/core/utils/primary_key_table_utils.cpp @@ -90,7 +90,7 @@ Result> PrimaryKeyTableUtils::CreateSequenceFi } } return FieldsComparator::Create(value_fields, sort_field_idxs, - options.SequenceFieldSortOrderIsAscending(), /*use_view=*/true); + options.SequenceFieldSortOrderIsAscending()); } } // namespace paimon diff --git a/src/paimon/testing/utils/key_value_checker.h b/src/paimon/testing/utils/key_value_checker.h index 5789a09d..52cc73db 100644 --- a/src/paimon/testing/utils/key_value_checker.h +++ b/src/paimon/testing/utils/key_value_checker.h @@ -64,12 +64,10 @@ class KeyValueChecker : public testing::Test { const std::vector& key_fields, const std::vector& value_fields) { ASSERT_EQ(expected_vec.size(), result_vec.size()); - ASSERT_OK_AND_ASSIGN( - auto key_comparator, - FieldsComparator::Create(key_fields, /*is_ascending_order=*/true, /*use_view=*/true)); + ASSERT_OK_AND_ASSIGN(auto key_comparator, + FieldsComparator::Create(key_fields, /*is_ascending_order=*/true)); ASSERT_OK_AND_ASSIGN(auto value_comparator, - FieldsComparator::Create(value_fields, /*is_ascending_order=*/true, - /*use_view=*/true)); + FieldsComparator::Create(value_fields, /*is_ascending_order=*/true)); for (size_t i = 0; i < expected_vec.size(); i++) { const auto& expected = expected_vec[i]; const auto& result = result_vec[i]; From bb6b2ae574fcd782f4cad41dfd78265eebb92326 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 26 Mar 2026 18:11:03 +0800 Subject: [PATCH 11/11] fix rebase --- src/paimon/common/data/binary_row.cpp | 3 +++ src/paimon/common/data/binary_row_test.cpp | 1 + src/paimon/common/data/binary_string.cpp | 1 + src/paimon/common/sst/block_iterator.cpp | 2 +- src/paimon/common/sst/sst_file_reader.cpp | 5 +++-- .../core/mergetree/compact/merge_tree_compact_manager.cpp | 4 ++-- .../mergetree/compact/merge_tree_compact_manager_test.cpp | 3 +-- 7 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/paimon/common/data/binary_row.cpp b/src/paimon/common/data/binary_row.cpp index 9337bcc3..09ee5da9 100644 --- a/src/paimon/common/data/binary_row.cpp +++ b/src/paimon/common/data/binary_row.cpp @@ -294,6 +294,9 @@ bool BinaryRow::operator==(const BinaryRow& other) const { } int32_t BinaryRow::HashCode() const { + if (size_in_bytes_ == 0) { + return 0; + } return MemorySegmentUtils::HashByWords({segment_}, offset_, size_in_bytes_, nullptr); } diff --git a/src/paimon/common/data/binary_row_test.cpp b/src/paimon/common/data/binary_row_test.cpp index e5e9de52..6f125a47 100644 --- a/src/paimon/common/data/binary_row_test.cpp +++ b/src/paimon/common/data/binary_row_test.cpp @@ -79,6 +79,7 @@ TEST_F(BinaryRowTest, TestBasic) { ASSERT_EQ(static_cast(5.8), row.GetDouble(1)); row.Clear(); + ASSERT_EQ(0, row.HashCode()); std::shared_ptr bytes1 = Bytes::AllocateBytes(100, pool.get()); MemorySegment segment1 = MemorySegment::Wrap(bytes1); row.PointTo(segment1, 0, 20); diff --git a/src/paimon/common/data/binary_string.cpp b/src/paimon/common/data/binary_string.cpp index 685adb21..cd02d96c 100644 --- a/src/paimon/common/data/binary_string.cpp +++ b/src/paimon/common/data/binary_string.cpp @@ -280,6 +280,7 @@ bool BinaryString::MatchAtOneSeg(const BinaryString& s, int32_t pos) const { std::string_view BinaryString::GetStringView() const { const auto* bytes = segment_.GetArray(); + assert(bytes); return std::string_view(bytes->data() + offset_, size_in_bytes_); } } // namespace paimon diff --git a/src/paimon/common/sst/block_iterator.cpp b/src/paimon/common/sst/block_iterator.cpp index 04df2141..80d127e4 100644 --- a/src/paimon/common/sst/block_iterator.cpp +++ b/src/paimon/common/sst/block_iterator.cpp @@ -53,7 +53,7 @@ Result BlockIterator::SeekTo(const std::shared_ptr& target_ke int32_t mid = left + (right - left) / 2; PAIMON_RETURN_NOT_OK(input_->SetPosition(reader_->SeekTo(mid))); - PAIMON_ASSIGN_OR_RAISE(auto mid_entry, ReadEntry()); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr mid_entry, ReadEntry()); PAIMON_ASSIGN_OR_RAISE(int32_t compare, reader_->Comparator()(mid_entry->key, target_key)); if (compare == 0) { diff --git a/src/paimon/common/sst/sst_file_reader.cpp b/src/paimon/common/sst/sst_file_reader.cpp index 54a84742..4ba31335 100644 --- a/src/paimon/common/sst/sst_file_reader.cpp +++ b/src/paimon/common/sst/sst_file_reader.cpp @@ -112,7 +112,8 @@ Result> SstFileReader::GetNextBlock( PAIMON_ASSIGN_OR_RAISE(std::unique_ptr ret, index_iterator->Next()); auto& slice = ret->value; auto input = slice->ToInput(); - PAIMON_ASSIGN_OR_RAISE(auto block_handle, BlockHandle::ReadBlockHandle(input)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr block_handle, + BlockHandle::ReadBlockHandle(input)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, ReadBlock(std::move(block_handle), false)); return reader->Iterator(); @@ -168,7 +169,7 @@ Result SstFileReader::DecompressBlock(const MemorySegment& compre auto output = MemorySegment::AllocateHeapMemory(uncompressed_size, pool.get()); auto output_memory = output.GetHeapMemory(); PAIMON_ASSIGN_OR_RAISE( - int uncompressed_length, + int32_t uncompressed_length, decompressor->Decompress(input_memory->data() + input->Position(), input->Available(), output_memory->data(), output_memory->size())); if (static_cast(uncompressed_length) != output_memory->size()) { diff --git a/src/paimon/core/mergetree/compact/merge_tree_compact_manager.cpp b/src/paimon/core/mergetree/compact/merge_tree_compact_manager.cpp index fd58130b..a1060d1b 100644 --- a/src/paimon/core/mergetree/compact/merge_tree_compact_manager.cpp +++ b/src/paimon/core/mergetree/compact/merge_tree_compact_manager.cpp @@ -107,8 +107,8 @@ Status MergeTreeCompactManager::TriggerCompaction(bool full_compaction) { PAIMON_LOG_DEBUG(logger_, "Trigger forced full compaction. Picking from runs:\n%s", StringUtils::VectorToString(runs).c_str()); - optional_unit = CompactStrategy::PickFullCompaction(levels_->NumberOfLevels(), runs, - force_rewrite_all_files_); + optional_unit = CompactStrategy::PickFullCompaction( + levels_->NumberOfLevels(), runs, dv_maintainer_, force_rewrite_all_files_); } else { if (task_future_.valid()) { return Status::OK(); diff --git a/src/paimon/core/mergetree/compact/merge_tree_compact_manager_test.cpp b/src/paimon/core/mergetree/compact/merge_tree_compact_manager_test.cpp index 752d4cb7..a07aec01 100644 --- a/src/paimon/core/mergetree/compact/merge_tree_compact_manager_test.cpp +++ b/src/paimon/core/mergetree/compact/merge_tree_compact_manager_test.cpp @@ -175,8 +175,7 @@ class MergeTreeCompactManagerTest : public testing::Test { std::vector data_fields; data_fields.emplace_back(/*id=*/0, arrow::field("f0", arrow::int32())); ASSERT_OK_AND_ASSIGN(auto comparator, - FieldsComparator::Create(data_fields, /*is_ascending_order=*/true, - /*use_view=*/false)); + FieldsComparator::Create(data_fields, /*is_ascending_order=*/true)); comparator_ = std::shared_ptr(std::move(comparator)); file_id_ = 0; }