From b5de6c0c8d734d4c1cad90e53acf0328bb11e9b1 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Mon, 12 Jan 2026 16:08:04 +0800 Subject: [PATCH 1/7] feat: support external path for global index --- include/paimon/defs.h | 3 + .../global_index/global_index_io_meta.h | 6 +- .../io/global_index_file_writer.h | 3 + src/paimon/common/defs.cpp | 1 + .../bitmap/bitmap_global_index.cpp | 2 +- .../bitmap/bitmap_global_index_test.cpp | 5 +- .../wrap/file_index_writer_wrapper.h | 3 +- src/paimon/core/core_options.cpp | 29 +++++ src/paimon/core/core_options.h | 7 +- src/paimon/core/core_options_test.cpp | 23 ++++ .../global_index/global_index_file_manager.h | 20 +++- .../global_index/global_index_scan_impl.cpp | 4 +- .../global_index/global_index_write_task.cpp | 18 ++- .../row_range_global_index_scanner_impl.cpp | 6 +- .../row_range_global_index_scanner_impl.h | 6 +- .../core/index/index_file_handler_test.cpp | 4 + src/paimon/core/index/index_file_meta.h | 5 - .../index/index_file_meta_serializer_test.cpp | 30 +++-- ...dex_in_data_file_dir_path_factory_test.cpp | 2 +- .../index_manifest_entry_serializer_test.cpp | 4 + .../core/manifest/manifest_file_test.cpp | 1 + .../core/manifest/manifest_list_test.cpp | 1 + src/paimon/core/migrate/file_meta_utils.cpp | 1 + .../core/operation/expire_snapshots_test.cpp | 15 ++- .../core/operation/file_store_commit.cpp | 5 +- .../operation/file_store_commit_impl_test.cpp | 3 +- .../core/operation/file_store_write.cpp | 5 +- .../key_value_file_store_scan_test.cpp | 15 ++- .../operation/manifest_file_merger_test.cpp | 5 +- .../operation/merge_file_split_read_test.cpp | 5 +- .../core/operation/orphan_files_cleaner.cpp | 5 +- .../operation/orphan_files_cleaner_impl.cpp | 5 +- .../operation/raw_file_split_read_test.cpp | 15 ++- .../postpone_bucket_file_store_write.h | 15 ++- .../core/table/sink/commit_message_test.cpp | 3 +- src/paimon/core/table/source/table_read.cpp | 6 +- src/paimon/core/table/source/table_scan.cpp | 6 +- .../core/utils/file_store_path_factory.cpp | 21 +++- .../core/utils/file_store_path_factory.h | 7 ++ .../utils/file_store_path_factory_test.cpp | 44 +++++++- .../utils/index_file_path_factories_test.cpp | 2 + .../lumina/lumina_global_index.cpp | 5 +- .../lumina/lumina_global_index_test.cpp | 103 ++++++++---------- src/paimon/testing/utils/data_generator.cpp | 16 +-- test/inte/global_index_test.cpp | 84 +++++++++++++- 45 files changed, 430 insertions(+), 144 deletions(-) diff --git a/include/paimon/defs.h b/include/paimon/defs.h index bab1f582..98763238 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -284,6 +284,9 @@ struct PAIMON_EXPORT Options { static const char BLOB_AS_DESCRIPTOR[]; /// "global-index.enabled" - Whether to enable global index for scan. Default value is "true". static const char GLOBAL_INDEX_ENABLED[]; + /// "global-index.external-path" - The external path where the global index will be + /// written. + static const char GLOBAL_INDEX_EXTERNAL_PATH[]; }; static constexpr int64_t BATCH_WRITE_COMMIT_IDENTIFIER = std::numeric_limits::max(); diff --git a/include/paimon/global_index/global_index_io_meta.h b/include/paimon/global_index/global_index_io_meta.h index 4453c846..a8dacd23 100644 --- a/include/paimon/global_index/global_index_io_meta.h +++ b/include/paimon/global_index/global_index_io_meta.h @@ -25,14 +25,14 @@ namespace paimon { /// Metadata describing a single file entry in a global index. struct PAIMON_EXPORT GlobalIndexIOMeta { - GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, int64_t _range_end, + GlobalIndexIOMeta(const std::string& _file_path, int64_t _file_size, int64_t _range_end, const std::shared_ptr& _metadata) - : file_name(_file_name), + : file_path(_file_path), file_size(_file_size), range_end(_range_end), metadata(_metadata) {} - std::string file_name; + std::string file_path; int64_t file_size; /// The inclusive range end covered by this file (i.e., the last local row id). int64_t range_end; diff --git a/include/paimon/global_index/io/global_index_file_writer.h b/include/paimon/global_index/io/global_index_file_writer.h index 70190add..d95e720e 100644 --- a/include/paimon/global_index/io/global_index_file_writer.h +++ b/include/paimon/global_index/io/global_index_file_writer.h @@ -36,6 +36,9 @@ class PAIMON_EXPORT GlobalIndexFileWriter { /// Get the file size of input file name. virtual Result GetFileSize(const std::string& file_name) const = 0; + + /// Get the index file path of input file name. + virtual std::string ToPath(const std::string& file_name) const = 0; }; } // namespace paimon diff --git a/src/paimon/common/defs.cpp b/src/paimon/common/defs.cpp index c198531a..12143c24 100644 --- a/src/paimon/common/defs.cpp +++ b/src/paimon/common/defs.cpp @@ -80,4 +80,5 @@ const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled"; const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name"; const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor"; const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled"; +const char Options::GLOBAL_INDEX_EXTERNAL_PATH[] = "global-index.external-path"; } // namespace paimon diff --git a/src/paimon/common/global_index/bitmap/bitmap_global_index.cpp b/src/paimon/common/global_index/bitmap/bitmap_global_index.cpp index f40dfe63..5301d1cc 100644 --- a/src/paimon/common/global_index/bitmap/bitmap_global_index.cpp +++ b/src/paimon/common/global_index/bitmap/bitmap_global_index.cpp @@ -41,7 +41,7 @@ Result> BitmapGlobalIndex::CreateReader( } const auto& meta = files[0]; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr in, - file_reader->GetInputStream(meta.file_name)); + file_reader->GetInputStream(meta.file_path)); PAIMON_ASSIGN_OR_RAISE( std::shared_ptr reader, index_->CreateReader(arrow_schema, /*start=*/0, meta.file_size, in, pool)); diff --git a/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp b/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp index 2563ed06..2d430fe6 100644 --- a/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp +++ b/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp @@ -100,8 +100,9 @@ class BitmapGlobalIndexTest : public ::testing::Test { PAIMON_ASSIGN_OR_RAISE(auto result_metas, global_writer->Finish()); // check meta EXPECT_EQ(result_metas.size(), 1); - EXPECT_TRUE(StringUtils::StartsWith(result_metas[0].file_name, "bitmap-global-index-")); - EXPECT_TRUE(StringUtils::EndsWith(result_metas[0].file_name, ".index")); + auto file_name = PathUtil::GetName(result_metas[0].file_path); + EXPECT_TRUE(StringUtils::StartsWith(file_name, "bitmap-global-index-")); + EXPECT_TRUE(StringUtils::EndsWith(file_name, ".index")); EXPECT_EQ(result_metas[0].range_end, expected_range.to); EXPECT_FALSE(result_metas[0].metadata); return result_metas[0]; diff --git a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h index 0f518a71..f62084e7 100644 --- a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h +++ b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h @@ -72,7 +72,8 @@ class FileIndexWriterWrapper : public GlobalIndexWriter { } PAIMON_RETURN_NOT_OK(out->Flush()); PAIMON_RETURN_NOT_OK(out->Close()); - GlobalIndexIOMeta meta(file_name, /*file_size=*/bytes->size(), /*range_end=*/count_ - 1, + GlobalIndexIOMeta meta(file_manager_->ToPath(file_name), /*file_size=*/bytes->size(), + /*range_end=*/count_ - 1, /*metadata=*/nullptr); return std::vector({meta}); } diff --git a/src/paimon/core/core_options.cpp b/src/paimon/core/core_options.cpp index c28cc1aa..85fb008a 100644 --- a/src/paimon/core/core_options.cpp +++ b/src/paimon/core/core_options.cpp @@ -303,6 +303,7 @@ struct CoreOptions::Impl { bool data_evolution_enabled = false; bool legacy_partition_name_enabled = true; bool global_index_enabled = true; + std::optional global_index_external_path; }; // Parse configurations from a map and return a populated CoreOptions object @@ -470,6 +471,15 @@ Result CoreOptions::FromMap( // Parse global-index.enabled PAIMON_RETURN_NOT_OK( parser.Parse(Options::GLOBAL_INDEX_ENABLED, &impl->global_index_enabled)); + + // Parse global_index.external-path + std::string global_index_external_path; + PAIMON_RETURN_NOT_OK( + parser.ParseString(Options::GLOBAL_INDEX_EXTERNAL_PATH, &global_index_external_path)); + if (!global_index_external_path.empty()) { + impl->global_index_external_path = global_index_external_path; + } + return options; } @@ -746,4 +756,23 @@ bool CoreOptions::LegacyPartitionNameEnabled() const { bool CoreOptions::GlobalIndexEnabled() const { return impl_->global_index_enabled; } + +std::optional CoreOptions::GetGlobalIndexExternalPath() const { + return impl_->global_index_external_path; +} + +Result> CoreOptions::CreateGlobalIndexExternalPath() const { + std::optional global_index_external_path = GetGlobalIndexExternalPath(); + if (global_index_external_path == std::nullopt) { + return global_index_external_path; + } + std::string tmp_path = global_index_external_path.value(); + StringUtils::Trim(&tmp_path); + PAIMON_ASSIGN_OR_RAISE(Path path, PathUtil::ToPath(tmp_path)); + if (path.scheme.empty()) { + return Status::Invalid(fmt::format("scheme is null, path is {}", tmp_path)); + } + return std::optional(path.ToString()); +} + } // namespace paimon diff --git a/src/paimon/core/core_options.h b/src/paimon/core/core_options.h index 44608b62..2a911fec 100644 --- a/src/paimon/core/core_options.h +++ b/src/paimon/core/core_options.h @@ -102,7 +102,6 @@ class PAIMON_EXPORT CoreOptions { std::optional GetScanFallbackBranch() const; std::string GetBranch() const; - std::optional GetDataFileExternalPaths() const; ExternalPathStrategy GetExternalPathStrategy() const; Result> CreateExternalPaths() const; bool EnableAdaptivePrefetchStrategy() const; @@ -117,8 +116,14 @@ class PAIMON_EXPORT CoreOptions { bool LegacyPartitionNameEnabled() const; bool GlobalIndexEnabled() const; + Result> CreateGlobalIndexExternalPath() const; + const std::map& ToMap() const; + private: + std::optional GetDataFileExternalPaths() const; + std::optional GetGlobalIndexExternalPath() const; + private: struct Impl; diff --git a/src/paimon/core/core_options_test.cpp b/src/paimon/core/core_options_test.cpp index 09ef160f..8f9a5ee4 100644 --- a/src/paimon/core/core_options_test.cpp +++ b/src/paimon/core/core_options_test.cpp @@ -87,6 +87,7 @@ TEST(CoreOptionsTest, TestDefaultValue) { ASSERT_FALSE(core_options.DataEvolutionEnabled()); ASSERT_TRUE(core_options.LegacyPartitionNameEnabled()); ASSERT_TRUE(core_options.GlobalIndexEnabled()); + ASSERT_FALSE(core_options.GetGlobalIndexExternalPath()); } TEST(CoreOptionsTest, TestFromMap) { @@ -144,6 +145,7 @@ TEST(CoreOptionsTest, TestFromMap) { {Options::DATA_EVOLUTION_ENABLED, "true"}, {Options::PARTITION_GENERATE_LEGACY_NAME, "false"}, {Options::GLOBAL_INDEX_ENABLED, "false"}, + {Options::GLOBAL_INDEX_EXTERNAL_PATH, "FILE:///tmp/global_index/"}, }; ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap(options)); @@ -212,6 +214,8 @@ TEST(CoreOptionsTest, TestFromMap) { ASSERT_TRUE(core_options.DataEvolutionEnabled()); ASSERT_FALSE(core_options.LegacyPartitionNameEnabled()); ASSERT_FALSE(core_options.GlobalIndexEnabled()); + ASSERT_TRUE(core_options.GetGlobalIndexExternalPath()); + ASSERT_EQ(core_options.GetGlobalIndexExternalPath().value(), "FILE:///tmp/global_index/"); } TEST(CoreOptionsTest, TestInvalidCase) { @@ -273,6 +277,25 @@ TEST(CoreOptionsTest, TestInvalidCreateExternalPath) { } } +TEST(CoreOptionsTest, TestCreateGlobalIndexExternalPath) { + std::map options = { + {Options::GLOBAL_INDEX_EXTERNAL_PATH, " FILE:///tmp/index1"}, + }; + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap(options)); + ASSERT_OK_AND_ASSIGN(std::optional external_path, + core_options.CreateGlobalIndexExternalPath()); + ASSERT_EQ("FILE:/tmp/index1", external_path.value()); +} + +TEST(CoreOptionsTest, TestInvalidCreateGlobalIndexExternalPath) { + std::map options = { + {Options::GLOBAL_INDEX_EXTERNAL_PATH, "/tmp/index1"}, + }; + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap(options)); + ASSERT_NOK_WITH_MSG(core_options.CreateGlobalIndexExternalPath(), + "scheme is null, path is /tmp/index1"); +} + TEST(CoreOptionsTest, TestFileSystem) { { auto mock_fs = std::make_shared(); diff --git a/src/paimon/core/global_index/global_index_file_manager.h b/src/paimon/core/global_index/global_index_file_manager.h index 9406f0f4..750ff3b3 100644 --- a/src/paimon/core/global_index/global_index_file_manager.h +++ b/src/paimon/core/global_index/global_index_file_manager.h @@ -34,8 +34,8 @@ class GlobalIndexFileManager : public GlobalIndexFileReader, public GlobalIndexF : fs_(fs), path_factory_(path_factory) {} Result> GetInputStream( - const std::string& file_name) const override { - return fs_->Open(path_factory_->ToPath(file_name)); + const std::string& file_path) const override { + return fs_->Open(file_path); } Result NewFileName(const std::string& prefix) const override { @@ -46,17 +46,29 @@ class GlobalIndexFileManager : public GlobalIndexFileReader, public GlobalIndexF return prefix + "-" + "global-index-" + uuid + ".index"; } + std::string ToPath(const std::string& file_name) const override { + return path_factory_->ToPath(file_name); + } + + std::string ToPath(const std::shared_ptr& file) const { + return path_factory_->ToPath(file); + } + Result> NewOutputStream( const std::string& file_name) const override { - return fs_->Create(path_factory_->ToPath(file_name), /*overwrite=*/false); + return fs_->Create(ToPath(file_name), /*overwrite=*/false); } Result GetFileSize(const std::string& file_name) const override { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr file_status, - fs_->GetFileStatus(path_factory_->ToPath(file_name))); + fs_->GetFileStatus(ToPath(file_name))); return file_status->GetLen(); } + bool IsExternalPath() const { + return path_factory_->IsExternalPath(); + } + private: std::shared_ptr fs_; std::shared_ptr path_factory_; diff --git a/src/paimon/core/global_index/global_index_scan_impl.cpp b/src/paimon/core/global_index/global_index_scan_impl.cpp index 1b48829c..2420108f 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.cpp +++ b/src/paimon/core/global_index/global_index_scan_impl.cpp @@ -104,13 +104,15 @@ Status GlobalIndexScanImpl::Scan() { } auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema_->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options_.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + options_.CreateGlobalIndexExternalPath()); PAIMON_ASSIGN_OR_RAISE( path_factory_, FileStorePathFactory::Create( root_path_, arrow_schema, table_schema_->PartitionKeys(), options_.GetPartitionDefaultName(), options_.GetWriteFileFormat()->Identifier(), options_.DataFilePrefix(), options_.LegacyPartitionNameEnabled(), external_paths, - options_.IndexFileInDataFileDir(), pool_)); + global_index_external_path, options_.IndexFileInDataFileDir(), pool_)); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr index_manifest_file, IndexManifestFile::Create( diff --git a/src/paimon/core/global_index/global_index_write_task.cpp b/src/paimon/core/global_index/global_index_write_task.cpp index 086e6712..c3c499a2 100644 --- a/src/paimon/core/global_index/global_index_write_task.cpp +++ b/src/paimon/core/global_index/global_index_write_task.cpp @@ -39,13 +39,16 @@ Result> CreateGlobalIndexFileManager( auto all_arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( table_path, all_arrow_schema, table_schema->PartitionKeys(), core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, core_options.IndexFileInDataFileDir(), pool)); + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + pool)); std::shared_ptr index_path_factory = path_factory->CreateGlobalIndexFileFactory(); return std::make_shared(core_options.GetFileSystem(), @@ -118,17 +121,24 @@ Result> BuildIndex(const std::string& field_name, Result> ToCommitMessage( const std::string& index_type, int32_t field_id, const Range& range, const std::vector& global_index_io_metas, const BinaryRow& partition, - int32_t bucket) { + int32_t bucket, const std::shared_ptr& file_manager) { std::vector> index_file_metas; index_file_metas.reserve(global_index_io_metas.size()); + bool is_external_path = file_manager->IsExternalPath(); for (const auto& io_meta : global_index_io_metas) { if (range.Count() != io_meta.range_end + 1) { return Status::Invalid( fmt::format("specified range length {} mismatch indexed range length {}", range.Count(), io_meta.range_end + 1)); } + std::optional external_path; + if (is_external_path) { + PAIMON_ASSIGN_OR_RAISE(Path path, PathUtil::ToPath(io_meta.file_path)); + external_path = path.ToString(); + } index_file_metas.push_back(std::make_shared( - index_type, io_meta.file_name, io_meta.file_size, io_meta.range_end + 1, + index_type, PathUtil::GetName(io_meta.file_path), io_meta.file_size, + io_meta.range_end + 1, /*dv_ranges=*/std::nullopt, external_path, GlobalIndexMeta(range.from, io_meta.range_end + range.from, field_id, /*extra_field_ids=*/std::nullopt, io_meta.metadata))); } @@ -192,7 +202,7 @@ Result> GlobalIndexWriteTask::WriteIndex( // generate commit message return ToCommitMessage(index_type, field.Id(), range, global_index_io_metas, - data_split->Partition(), data_split->Bucket()); + data_split->Partition(), data_split->Bucket(), index_file_manager); } } // namespace paimon diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp b/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp index b254ddc3..b158d26f 100644 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp +++ b/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp @@ -117,7 +117,7 @@ Result> RowRangeGlobalIndexScannerImpl::Creat } std::vector RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMetas( - const std::vector& entries) { + const std::vector& entries) const { std::vector index_io_metas; index_io_metas.reserve(entries.size()); for (const auto& entry : entries) { @@ -127,11 +127,11 @@ std::vector RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMe } GlobalIndexIOMeta RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMeta( - const IndexManifestEntry& entry) { + const IndexManifestEntry& entry) const { const auto& index_file = entry.index_file; assert(index_file->GetGlobalIndexMeta()); const auto& global_index_meta = index_file->GetGlobalIndexMeta().value(); - return {index_file->FileName(), index_file->FileSize(), + return {index_file_manager_->ToPath(index_file), index_file->FileSize(), /*range_end=*/global_index_meta.row_range_end - global_index_meta.row_range_start, global_index_meta.index_meta}; } diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h b/src/paimon/core/global_index/row_range_global_index_scanner_impl.h index c8838c68..41b26ea0 100644 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h +++ b/src/paimon/core/global_index/row_range_global_index_scanner_impl.h @@ -60,10 +60,10 @@ class RowRangeGlobalIndexScannerImpl const DataField& field, const std::string& index_type, const std::vector& entries) const; - static std::vector ToGlobalIndexIOMetas( - const std::vector& entries); + std::vector ToGlobalIndexIOMetas( + const std::vector& entries) const; - static GlobalIndexIOMeta ToGlobalIndexIOMeta(const IndexManifestEntry& entry); + GlobalIndexIOMeta ToGlobalIndexIOMeta(const IndexManifestEntry& entry) const; private: std::shared_ptr pool_; diff --git a/src/paimon/core/index/index_file_handler_test.cpp b/src/paimon/core/index/index_file_handler_test.cpp index 7bf887c9..50d29188 100644 --- a/src/paimon/core/index/index_file_handler_test.cpp +++ b/src/paimon/core/index/index_file_handler_test.cpp @@ -54,6 +54,9 @@ class IndexFileHandlerTest : public testing::Test { auto schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( @@ -61,6 +64,7 @@ class IndexFileHandlerTest : public testing::Test { core_options.GetPartitionDefaultName(), /*identifier=*/"orc", core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), external_paths, + global_index_external_path, /*index_file_in_data_file_dir=*/core_options.IndexFileInDataFileDir(), memory_pool_)); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr index_manifest_file, diff --git a/src/paimon/core/index/index_file_meta.h b/src/paimon/core/index/index_file_meta.h index e6276d83..6c668429 100644 --- a/src/paimon/core/index/index_file_meta.h +++ b/src/paimon/core/index/index_file_meta.h @@ -40,11 +40,6 @@ class IndexFileMeta { : IndexFileMeta(index_type, file_name, file_size, row_count, dv_ranges, external_path, /*global_index_meta=*/std::nullopt) {} - IndexFileMeta(const std::string& index_type, const std::string& file_name, int64_t file_size, - int64_t row_count, const std::optional& global_index_meta) - : IndexFileMeta(index_type, file_name, file_size, row_count, /*dv_ranges=*/std::nullopt, - /*external_path=*/std::nullopt, global_index_meta) {} - IndexFileMeta(const std::string& index_type, const std::string& file_name, int64_t file_size, int64_t row_count, const std::optional>& dv_ranges, diff --git a/src/paimon/core/index/index_file_meta_serializer_test.cpp b/src/paimon/core/index/index_file_meta_serializer_test.cpp index 6c939f11..af00f69d 100644 --- a/src/paimon/core/index/index_file_meta_serializer_test.cpp +++ b/src/paimon/core/index/index_file_meta_serializer_test.cpp @@ -93,18 +93,32 @@ TEST_F(IndexFileMetaSerializerTest, TestToFromRowWithNullDeletionVectorMetas) { TEST_F(IndexFileMetaSerializerTest, TestToFromRowWithGlobalIndex) { auto bytes = std::make_shared("apple", memory_pool_.get()); + IndexFileMetaSerializer serializer(memory_pool_); GlobalIndexMeta global_index_meta( /*row_range_start=*/10, /*row_range_end=*/50, /*index_field_id=*/5, /*extra_field_ids=*/std::optional>({0, 1}), bytes); - IndexFileMetaSerializer serializer(memory_pool_); - auto expected = - std::make_shared("bitmap", "bitmap_index_file_0", /*file_size=*/10, - /*row_count=*/41, global_index_meta); - ASSERT_OK_AND_ASSIGN(BinaryRow row, serializer.ToRow(expected)); - ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, serializer.FromRow(row)); - ASSERT_EQ(expected->ToString(), actual->ToString()); - ASSERT_EQ(*expected, *actual); + { + auto expected = + std::make_shared("bitmap", "bitmap_index_file_0", /*file_size=*/10, + /*row_count=*/41, /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, global_index_meta); + ASSERT_OK_AND_ASSIGN(BinaryRow row, serializer.ToRow(expected)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, serializer.FromRow(row)); + ASSERT_EQ(expected->ToString(), actual->ToString()); + ASSERT_EQ(*expected, *actual); + } + { + // test external path + auto expected = std::make_shared( + "bitmap", "bitmap_index_file_0", /*file_size=*/10, + /*row_count=*/41, /*dv_ranges=*/std::nullopt, + /*external_path=*/"FILE:/tmp/external/bitmap_index_file_0", global_index_meta); + ASSERT_OK_AND_ASSIGN(BinaryRow row, serializer.ToRow(expected)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, serializer.FromRow(row)); + ASSERT_EQ(expected->ToString(), actual->ToString()); + ASSERT_EQ(*expected, *actual); + } } TEST_F(IndexFileMetaSerializerTest, TestSerialize) { diff --git a/src/paimon/core/index/index_in_data_file_dir_path_factory_test.cpp b/src/paimon/core/index/index_in_data_file_dir_path_factory_test.cpp index 3530f356..82d6e1d0 100644 --- a/src/paimon/core/index/index_in_data_file_dir_path_factory_test.cpp +++ b/src/paimon/core/index/index_in_data_file_dir_path_factory_test.cpp @@ -48,7 +48,7 @@ TEST(IndexInDataFileDirPathFactoryTest, TestSimple) { ASSERT_EQ(factory.ToPath(meta), "/tmp/p0=1/p1=0/bucket-0/deletion_file"); // test ToPath with file_name - ASSERT_EQ(factory.ToPath("bitmap_global.index"), "/tmp/p0=1/p1=0/bucket-0/bitmap_global.index"); + ASSERT_EQ(factory.ToPath("bitmap.index"), "/tmp/p0=1/p1=0/bucket-0/bitmap.index"); // test external path ASSERT_FALSE(factory.IsExternalPath()); } diff --git a/src/paimon/core/manifest/index_manifest_entry_serializer_test.cpp b/src/paimon/core/manifest/index_manifest_entry_serializer_test.cpp index 0fbaecf2..ac44a0ff 100644 --- a/src/paimon/core/manifest/index_manifest_entry_serializer_test.cpp +++ b/src/paimon/core/manifest/index_manifest_entry_serializer_test.cpp @@ -42,6 +42,8 @@ TEST(IndexManifestEntrySerializerTest, TestSerialize) { /*bucket=*/0, std::make_shared( "bsi", "bsi.index", /*file_size=*/110, /*row_count=*/210, + /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, GlobalIndexMeta(/*row_range_start=*/41, /*row_range_end=*/71, /*index_field_id=*/2, /*extra_field_ids=*/std::nullopt, /*index_meta=*/nullptr))), IndexManifestEntry( @@ -49,6 +51,8 @@ TEST(IndexManifestEntrySerializerTest, TestSerialize) { /*bucket=*/0, std::make_shared( "bitmap", "bitmap.index", /*file_size=*/100, /*row_count=*/200, + /*dv_ranges=*/std::nullopt, + /*external_path=*/std::optional("FILE:/tmp/external/bitmap.index"), GlobalIndexMeta(/*row_range_start=*/30, /*row_range_end=*/70, /*index_field_id=*/0, /*extra_field_ids=*/std::optional>({3, 4}), /*index_meta=*/bytes))), diff --git a/src/paimon/core/manifest/manifest_file_test.cpp b/src/paimon/core/manifest/manifest_file_test.cpp index bdc09ec1..3fa8ecff 100644 --- a/src/paimon/core/manifest/manifest_file_test.cpp +++ b/src/paimon/core/manifest/manifest_file_test.cpp @@ -59,6 +59,7 @@ class ManifestFileTest : public testing::Test { /*default_part_value=*/"", file_format->Identifier(), /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, pool)); EXPECT_OK_AND_ASSIGN(CoreOptions options, CoreOptions::FromMap({{Options::FILE_FORMAT, "orc"}, diff --git a/src/paimon/core/manifest/manifest_list_test.cpp b/src/paimon/core/manifest/manifest_list_test.cpp index 66d62d19..5fb802e2 100644 --- a/src/paimon/core/manifest/manifest_list_test.cpp +++ b/src/paimon/core/manifest/manifest_list_test.cpp @@ -47,6 +47,7 @@ class ManifestListTest : public testing::Test { /*default_part_value=*/"", file_format->Identifier(), /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, pool)); EXPECT_OK_AND_ASSIGN(auto manifest_list, ManifestList::Create(file_system, file_format, "zstd", path_factory, pool)); diff --git a/src/paimon/core/migrate/file_meta_utils.cpp b/src/paimon/core/migrate/file_meta_utils.cpp index 6582fe2a..99c758f9 100644 --- a/src/paimon/core/migrate/file_meta_utils.cpp +++ b/src/paimon/core/migrate/file_meta_utils.cpp @@ -152,6 +152,7 @@ Result> FileMetaUtils::GenerateCommitMessage( core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, memory_pool)); PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, file_store_path_factory->BucketPath(partition_row, /*bucket=*/0)); diff --git a/src/paimon/core/operation/expire_snapshots_test.cpp b/src/paimon/core/operation/expire_snapshots_test.cpp index adec5cb9..86744143 100644 --- a/src/paimon/core/operation/expire_snapshots_test.cpp +++ b/src/paimon/core/operation/expire_snapshots_test.cpp @@ -115,12 +115,15 @@ class ExpireSnapshotsTest : public testing::Test { EXPECT_OK_AND_ASSIGN(CoreOptions options, CoreOptions::FromMap(raw_options)); EXPECT_OK_AND_ASSIGN(std::vector external_paths, options.CreateExternalPaths()); - EXPECT_OK_AND_ASSIGN(auto path_factory, - FileStorePathFactory::Create( - root, schema_, partition_keys_, options.GetPartitionDefaultName(), - options.GetWriteFileFormat()->Identifier(), - options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), - external_paths, options.IndexFileInDataFileDir(), mem_pool_)); + EXPECT_OK_AND_ASSIGN(std::optional global_index_external_path, + options.CreateGlobalIndexExternalPath()); + EXPECT_OK_AND_ASSIGN( + auto path_factory, + FileStorePathFactory::Create( + root, schema_, partition_keys_, options.GetPartitionDefaultName(), + options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), + options.LegacyPartitionNameEnabled(), external_paths, global_index_external_path, + options.IndexFileInDataFileDir(), mem_pool_)); return path_factory; } diff --git a/src/paimon/core/operation/file_store_commit.cpp b/src/paimon/core/operation/file_store_commit.cpp index 34260bcc..f4a78ffe 100644 --- a/src/paimon/core/operation/file_store_commit.cpp +++ b/src/paimon/core/operation/file_store_commit.cpp @@ -89,13 +89,16 @@ Result> FileStoreCommit::Create( table_schema.value()->PartitionKeys(), arrow_schema, options.GetPartitionDefaultName(), options.LegacyPartitionNameEnabled(), ctx->GetMemoryPool())); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( root_path, arrow_schema, table_schema.value()->PartitionKeys(), options.GetPartitionDefaultName(), options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), external_paths, - options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); + global_index_external_path, options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); auto snapshot_manager = std::make_shared(options.GetFileSystem(), root_path); PAIMON_ASSIGN_OR_RAISE( diff --git a/src/paimon/core/operation/file_store_commit_impl_test.cpp b/src/paimon/core/operation/file_store_commit_impl_test.cpp index 8f9d7feb..a448d60c 100644 --- a/src/paimon/core/operation/file_store_commit_impl_test.cpp +++ b/src/paimon/core/operation/file_store_commit_impl_test.cpp @@ -817,7 +817,8 @@ TEST_F(FileStoreCommitImplTest, TestCleanUpTmpManifests) { std::vector> new_index_files; new_index_files.push_back(std::make_shared( "bitmap", "bitmap-global-index-567ff117-68a0-436d-a270-dc8f6e403d06.index", 100, 5, - std::nullopt)); + /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, std::nullopt)); DataIncrement data_increment({}, {}, {}, std::move(new_index_files), {}); std::shared_ptr msgs = std::make_shared( BinaryRowGenerator::GenerateRow({10}, GetDefaultPool().get()), /*bucket=*/0, diff --git a/src/paimon/core/operation/file_store_write.cpp b/src/paimon/core/operation/file_store_write.cpp index ac21d0c3..10e7f6e9 100644 --- a/src/paimon/core/operation/file_store_write.cpp +++ b/src/paimon/core/operation/file_store_write.cpp @@ -85,13 +85,16 @@ Result> FileStoreWrite::Create(std::unique_ptrPartitionKeys())); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr file_store_path_factory, FileStorePathFactory::Create( ctx->GetRootPath(), arrow_schema, schema->PartitionKeys(), options.GetPartitionDefaultName(), options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), external_paths, - options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); + global_index_external_path, options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); auto snapshot_manager = std::make_shared(options.GetFileSystem(), ctx->GetRootPath(), branch); bool ignore_previous_files = ctx->IgnorePreviousFiles(); diff --git a/src/paimon/core/operation/key_value_file_store_scan_test.cpp b/src/paimon/core/operation/key_value_file_store_scan_test.cpp index 96ea0e46..3e78be64 100644 --- a/src/paimon/core/operation/key_value_file_store_scan_test.cpp +++ b/src/paimon/core/operation/key_value_file_store_scan_test.cpp @@ -75,14 +75,17 @@ class KeyValueFileStoreScanTest : public testing::Test { auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, - FileStorePathFactory::Create(table_path, arrow_schema, table_schema->PartitionKeys(), - core_options.GetPartitionDefaultName(), - core_options.GetWriteFileFormat()->Identifier(), - core_options.DataFilePrefix(), - core_options.LegacyPartitionNameEnabled(), external_paths, - core_options.IndexFileInDataFileDir(), pool_)); + FileStorePathFactory::Create( + table_path, arrow_schema, table_schema->PartitionKeys(), + core_options.GetPartitionDefaultName(), + core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), + core_options.LegacyPartitionNameEnabled(), external_paths, + global_index_external_path, core_options.IndexFileInDataFileDir(), pool_)); auto manifest_file_format = core_options.GetManifestFormat(); auto snapshot_manager = std::make_shared(fs, table_path); diff --git a/src/paimon/core/operation/manifest_file_merger_test.cpp b/src/paimon/core/operation/manifest_file_merger_test.cpp index 62eb7389..c76ce70b 100644 --- a/src/paimon/core/operation/manifest_file_merger_test.cpp +++ b/src/paimon/core/operation/manifest_file_merger_test.cpp @@ -118,12 +118,15 @@ class ManifestFileMergerTest : public testing::Test { ASSERT_OK_AND_ASSIGN(CoreOptions options, CoreOptions::FromMap({})); ASSERT_OK_AND_ASSIGN(std::vector external_paths, options.CreateExternalPaths()); + ASSERT_OK_AND_ASSIGN(std::optional global_index_external_path, + options.CreateGlobalIndexExternalPath()); + ASSERT_OK_AND_ASSIGN( static std::shared_ptr path_factory, FileStorePathFactory::Create( path_str, schema, /*partition_keys=*/{"f0"}, options.GetPartitionDefaultName(), options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), - options.LegacyPartitionNameEnabled(), external_paths, + options.LegacyPartitionNameEnabled(), external_paths, global_index_external_path, options.IndexFileInDataFileDir(), pool_)); ASSERT_OK_AND_ASSIGN(std::shared_ptr partition_schema, FieldMapping::GetPartitionSchema(schema, {"f0"})); diff --git a/src/paimon/core/operation/merge_file_split_read_test.cpp b/src/paimon/core/operation/merge_file_split_read_test.cpp index 99379b35..39d322fa 100644 --- a/src/paimon/core/operation/merge_file_split_read_test.cpp +++ b/src/paimon/core/operation/merge_file_split_read_test.cpp @@ -333,6 +333,9 @@ class MergeFileSplitReadTest : public ::testing::Test, auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); EXPECT_OK_AND_ASSIGN(std::vector external_paths, core_options.CreateExternalPaths()); + EXPECT_OK_AND_ASSIGN(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( @@ -340,7 +343,7 @@ class MergeFileSplitReadTest : public ::testing::Test, core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), external_paths, - core_options.IndexFileInDataFileDir(), pool_)); + global_index_external_path, core_options.IndexFileInDataFileDir(), pool_)); PAIMON_ASSIGN_OR_RAISE(auto split_read, MergeFileSplitRead::Create(path_factory, std::move(internal_context), pool_, executor_)); diff --git a/src/paimon/core/operation/orphan_files_cleaner.cpp b/src/paimon/core/operation/orphan_files_cleaner.cpp index fd9d849f..1cfb6283 100644 --- a/src/paimon/core/operation/orphan_files_cleaner.cpp +++ b/src/paimon/core/operation/orphan_files_cleaner.cpp @@ -167,13 +167,16 @@ Result> OrphanFilesCleaner::Create( PAIMON_ASSIGN_OR_RAISE(CoreOptions options, CoreOptions::FromMap(ctx->GetOptions())); auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( ctx->GetRootPath(), arrow_schema, schema->PartitionKeys(), options.GetPartitionDefaultName(), options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), external_paths, - options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); + global_index_external_path, options.IndexFileInDataFileDir(), ctx->GetMemoryPool())); auto snapshot_manager = std::make_shared(options.GetFileSystem(), ctx->GetRootPath()); PAIMON_ASSIGN_OR_RAISE( diff --git a/src/paimon/core/operation/orphan_files_cleaner_impl.cpp b/src/paimon/core/operation/orphan_files_cleaner_impl.cpp index 74734728..740a629b 100644 --- a/src/paimon/core/operation/orphan_files_cleaner_impl.cpp +++ b/src/paimon/core/operation/orphan_files_cleaner_impl.cpp @@ -142,8 +142,9 @@ Result> OrphanFilesCleanerImpl::ListPaimonFileDirs() const std::set file_dirs = ListFileDirs(root_path_, partition_keys_.size()); paimon_file_dirs.insert(file_dirs.begin(), file_dirs.end()); // add external data paths - std::optional data_file_external_paths = options_.GetDataFileExternalPaths(); - if (data_file_external_paths) { + PAIMON_ASSIGN_OR_RAISE(std::vector data_file_external_paths, + options_.CreateExternalPaths()); + if (!data_file_external_paths.empty()) { return Status::Invalid( "OrphanFilesCleaner do not support cleaning table with external paths"); } diff --git a/src/paimon/core/operation/raw_file_split_read_test.cpp b/src/paimon/core/operation/raw_file_split_read_test.cpp index 63a5ff68..0f0d8bbb 100644 --- a/src/paimon/core/operation/raw_file_split_read_test.cpp +++ b/src/paimon/core/operation/raw_file_split_read_test.cpp @@ -143,6 +143,9 @@ class RawFileSplitReadTest : public ::testing::Test { auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); ASSERT_OK_AND_ASSIGN(std::vector external_paths, core_options.CreateExternalPaths()); + ASSERT_OK_AND_ASSIGN(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + ASSERT_OK_AND_ASSIGN( std::shared_ptr path_factory, FileStorePathFactory::Create( @@ -150,7 +153,7 @@ class RawFileSplitReadTest : public ::testing::Test { core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), external_paths, - core_options.IndexFileInDataFileDir(), pool_)); + global_index_external_path, core_options.IndexFileInDataFileDir(), pool_)); auto split_read = std::make_unique(path_factory, std::move(internal_context), pool_, CreateDefaultExecutor(/*thread_count=*/2)); @@ -384,13 +387,17 @@ TEST_F(RawFileSplitReadTest, TestEmptyPlan) { auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); ASSERT_OK_AND_ASSIGN(std::vector external_paths, core_options.CreateExternalPaths()); + ASSERT_OK_AND_ASSIGN(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + ASSERT_OK_AND_ASSIGN( std::shared_ptr path_factory, FileStorePathFactory::Create( internal_context->GetPath(), arrow_schema, table_schema->PartitionKeys(), core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, core_options.IndexFileInDataFileDir(), pool_)); + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + pool_)); auto split_read = std::make_unique(path_factory, std::move(internal_context), pool_, @@ -488,9 +495,7 @@ TEST_F(RawFileSplitReadTest, TestMatch) { split_read->Match(data_split, /*force_keep_delete=*/false)); ASSERT_FALSE(match_result); } - { - ASSERT_NOK(split_read->Match(nullptr, /*force_keep_delete=*/false)); - } + { ASSERT_NOK(split_read->Match(nullptr, /*force_keep_delete=*/false)); } } } // namespace paimon::test diff --git a/src/paimon/core/postpone/postpone_bucket_file_store_write.h b/src/paimon/core/postpone/postpone_bucket_file_store_write.h index 899b109b..1dfb7a38 100644 --- a/src/paimon/core/postpone/postpone_bucket_file_store_write.h +++ b/src/paimon/core/postpone/postpone_bucket_file_store_write.h @@ -64,14 +64,17 @@ class PostponeBucketFileStoreWrite : public AbstractFileStoreWrite { // prepare FileStorePathFactory PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, new_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + new_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr file_store_path_factory, - FileStorePathFactory::Create(root_path, schema, table_schema->PartitionKeys(), - new_options.GetPartitionDefaultName(), - new_options.GetWriteFileFormat()->Identifier(), - new_options.DataFilePrefix(), - new_options.LegacyPartitionNameEnabled(), external_paths, - new_options.IndexFileInDataFileDir(), pool)); + FileStorePathFactory::Create( + root_path, schema, table_schema->PartitionKeys(), + new_options.GetPartitionDefaultName(), + new_options.GetWriteFileFormat()->Identifier(), new_options.DataFilePrefix(), + new_options.LegacyPartitionNameEnabled(), external_paths, + global_index_external_path, new_options.IndexFileInDataFileDir(), pool)); // Ignoring previous files saves scanning time. // For postpone bucket tables, we only append new files to bucket = -2 directories. diff --git a/src/paimon/core/table/sink/commit_message_test.cpp b/src/paimon/core/table/sink/commit_message_test.cpp index 9f3e3e03..d6948a3a 100644 --- a/src/paimon/core/table/sink/commit_message_test.cpp +++ b/src/paimon/core/table/sink/commit_message_test.cpp @@ -91,7 +91,8 @@ TEST(CommitMessageTest, TestCompatibleWithVersion11) { std::vector expected_msgs; auto index_meta = std::make_shared( "bitmap", "bitmap-global-index-6f974a9b-07bb-4a06-9696-6646020d8139.index", - /*file_size=*/120, /*row_count=*/5, + /*file_size=*/120, /*row_count=*/5, /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, GlobalIndexMeta(/*row_range_start=*/0, /*row_range_end=*/4, /*index_field_id=*/0, /*extra_field_ids=*/std::nullopt, /*index_meta=*/nullptr)); diff --git a/src/paimon/core/table/source/table_read.cpp b/src/paimon/core/table/source/table_read.cpp index 439b6a06..3253e511 100644 --- a/src/paimon/core/table/source/table_read.cpp +++ b/src/paimon/core/table/source/table_read.cpp @@ -88,13 +88,17 @@ Result> CreateTableRead( auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( internal_context->GetPath(), arrow_schema, table_schema->PartitionKeys(), core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, core_options.IndexFileInDataFileDir(), memory_pool)); + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + memory_pool)); if (internal_context->GetPrimaryKeys().empty()) { return std::make_unique(path_factory, internal_context, memory_pool, diff --git a/src/paimon/core/table/source/table_scan.cpp b/src/paimon/core/table/source/table_scan.cpp index 4c5a47d0..a365fa37 100644 --- a/src/paimon/core/table/source/table_scan.cpp +++ b/src/paimon/core/table/source/table_scan.cpp @@ -203,13 +203,17 @@ Result> TableScan::Create(std::unique_ptr external_paths, core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( std::shared_ptr path_factory, FileStorePathFactory::Create( context->GetPath(), arrow_schema, table_schema->PartitionKeys(), core_options.GetPartitionDefaultName(), core_options.GetWriteFileFormat()->Identifier(), core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, core_options.IndexFileInDataFileDir(), context->GetMemoryPool())); + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + context->GetMemoryPool())); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr file_store_scan, TableScanImpl::CreateFileStoreScan( diff --git a/src/paimon/core/utils/file_store_path_factory.cpp b/src/paimon/core/utils/file_store_path_factory.cpp index 7fbb52ee..fcce4a11 100644 --- a/src/paimon/core/utils/file_store_path_factory.cpp +++ b/src/paimon/core/utils/file_store_path_factory.cpp @@ -41,13 +41,15 @@ FileStorePathFactory::FileStorePathFactory( const std::string& root, const std::string& format_identifier, const std::string& data_file_prefix, const std::string& uuid, std::unique_ptr partition_computer, - const std::vector& external_paths, bool index_file_in_data_file_dir) + const std::vector& external_paths, + const std::optional& global_index_external_path, bool index_file_in_data_file_dir) : root_(root), format_identifier_(format_identifier), data_file_prefix_(data_file_prefix), uuid_(uuid), partition_computer_(std::move(partition_computer)), external_paths_(external_paths), + global_index_external_path_(global_index_external_path), index_file_in_data_file_dir_(index_file_in_data_file_dir) {} Result> FileStorePathFactory::Create( @@ -55,7 +57,8 @@ Result> FileStorePathFactory::Create( const std::vector& partition_keys, const std::string& default_part_value, const std::string& identifier, const std::string& data_file_prefix, bool legacy_partition_name_enabled, const std::vector& external_paths, - bool index_file_in_data_file_dir, const std::shared_ptr& memory_pool) { + const std::optional& global_index_external_path, bool index_file_in_data_file_dir, + const std::shared_ptr& memory_pool) { if (memory_pool == nullptr) { return Status::Invalid("memory pool is null pointer"); } @@ -69,7 +72,7 @@ Result> FileStorePathFactory::Create( legacy_partition_name_enabled, memory_pool)); return std::unique_ptr(new FileStorePathFactory( root, identifier, data_file_prefix, uuid, std::move(partition_computer), external_paths, - index_file_in_data_file_dir)); + global_index_external_path, index_file_in_data_file_dir)); } std::unique_ptr FileStorePathFactory::CreateManifestFileFactory() { @@ -155,13 +158,23 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac return factory_->NewIndexFile(); } std::string ToPath(const std::shared_ptr& file) const override { + const auto& external_path = file->ExternalPath(); + if (external_path) { + return external_path.value(); + } return PathUtil::JoinPath(factory_->IndexPath(factory_->RootPath()), file->FileName()); } + std::string ToPath(const std::string& file_name) const override { + const auto& external_path = factory_->GetGlobalIndexExternalPath(); + if (external_path) { + return PathUtil::JoinPath(factory_->IndexPath(external_path.value()), file_name); + } return PathUtil::JoinPath(factory_->IndexPath(factory_->RootPath()), file_name); } + bool IsExternalPath() const override { - return false; + return factory_->GetGlobalIndexExternalPath() != std::nullopt; } private: diff --git a/src/paimon/core/utils/file_store_path_factory.h b/src/paimon/core/utils/file_store_path_factory.h index d4925fee..beb104b7 100644 --- a/src/paimon/core/utils/file_store_path_factory.h +++ b/src/paimon/core/utils/file_store_path_factory.h @@ -53,6 +53,7 @@ class FileStorePathFactory : public std::enable_shared_from_this& partition_keys, const std::string& default_part_value, const std::string& identifier, const std::string& data_file_prefix, bool legacy_partition_name_enabled, const std::vector& external_paths, + const std::optional& global_index_external_path, bool index_file_in_data_file_dir, const std::shared_ptr& memory_pool); static std::string ManifestPath(const std::string& root) { @@ -92,6 +93,10 @@ class FileStorePathFactory : public std::enable_shared_from_this& GetGlobalIndexExternalPath() const { + return global_index_external_path_; + } + /// @note This method is NOT THREAD SAFE. Result GetPartitionString(const BinaryRow& partition) const; std::string NewManifestFile() const { @@ -137,6 +142,7 @@ class FileStorePathFactory : public std::enable_shared_from_this partition_computer, const std::vector& external_paths, + const std::optional& global_index_external_path, bool index_file_in_data_file_dir); Result> CreateExternalPathProvider( @@ -149,6 +155,7 @@ class FileStorePathFactory : public std::enable_shared_from_this partition_computer_; std::vector external_paths_; + std::optional global_index_external_path_; bool index_file_in_data_file_dir_; mutable std::atomic manifest_file_count_ = 0; diff --git a/src/paimon/core/utils/file_store_path_factory_test.cpp b/src/paimon/core/utils/file_store_path_factory_test.cpp index e0a21ad4..ed2bd716 100644 --- a/src/paimon/core/utils/file_store_path_factory_test.cpp +++ b/src/paimon/core/utils/file_store_path_factory_test.cpp @@ -70,7 +70,8 @@ class FileStorePathFactoryTest : public ::testing::Test { root, schema, {"f0", "f3"}, options.GetPartitionDefaultName(), options.GetWriteFileFormat()->Identifier(), options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), - external_paths, options.IndexFileInDataFileDir(), mem_pool_)); + external_paths, /*global_index_external_path=*/std::nullopt, + options.IndexFileInDataFileDir(), mem_pool_)); return path_factory; } @@ -151,6 +152,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateFactoryWithNoPartition) { dir->Str(), schema, {}, "default", /*identifier=*/"mock_format", /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); ASSERT_OK_AND_ASSIGN(auto data_file_path_factory, path_factory->CreateDataFilePathFactory(BinaryRow::EmptyRow(), 123)); @@ -208,6 +210,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateDataFilePathFactoryWithPartition) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); CheckPartition(20211224, 18, *path_factory, "/dt=20211224/hr=18"); CheckPartition(20211224, std::nullopt, *path_factory, "/dt=20211224/hr=default"); @@ -235,6 +238,7 @@ TEST_F(FileStorePathFactoryTest, TestGetHierarchicalPartitionPath) { /*identifier=*/"mock_format", /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); { @@ -272,6 +276,7 @@ TEST_F(FileStorePathFactoryTest, TestToBinaryRowAndToPartitionString) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); { std::map partition_map; @@ -369,6 +374,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); auto partition = BinaryRowGenerator::GenerateRow({true, 10}, mem_pool_.get()); ASSERT_OK_AND_ASSIGN( @@ -394,6 +400,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{"/tmp/external-path"}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, mem_pool_)); auto partition = BinaryRowGenerator::GenerateRow({true, 10}, mem_pool_.get()); ASSERT_OK_AND_ASSIGN( @@ -419,6 +426,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/true, mem_pool_)); auto partition = BinaryRowGenerator::GenerateRow({true, 10}, mem_pool_.get()); ASSERT_OK_AND_ASSIGN( @@ -442,6 +450,7 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{"/tmp/external-path"}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/true, mem_pool_)); auto partition = BinaryRowGenerator::GenerateRow({true, 10}, mem_pool_.get()); ASSERT_OK_AND_ASSIGN( @@ -451,5 +460,38 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { "/tmp/external-path/f0=true/f1=10/bucket-2/index-" + file_store_path_factory->uuid_ + "-0"); } + { + // test with global index external path + auto dir = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + arrow::FieldVector fields = { + arrow::field("f0", arrow::boolean()), arrow::field("f1", arrow::int32()), + arrow::field("f2", arrow::int64()), arrow::field("f3", arrow::int16())}; + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr file_store_path_factory, + FileStorePathFactory::Create(dir->Str(), schema, {"f0", "f1"}, "default", + /*identifier=*/"mock_format", + /*data_file_prefix=*/"data-", + /*legacy_partition_name_enabled=*/true, + /*external_paths=*/std::vector(), + /*global_index_external_path=*/{"/tmp/external-path"}, + /*index_file_in_data_file_dir=*/false, mem_pool_)); + auto partition = BinaryRowGenerator::GenerateRow({true, 10}, mem_pool_.get()); + ASSERT_OK_AND_ASSIGN( + auto index_path_factory, + file_store_path_factory->CreateIndexFileFactory(partition, /*bucket=*/2)); + ASSERT_EQ(index_path_factory->ToPath("bitmap.index"), + "/tmp/external-path/index/bitmap.index"); + ASSERT_TRUE(index_path_factory->IsExternalPath()); + + auto index_file_meta = std::make_shared( + /*index_type=*/"bitmap", /*file_name=*/"bitmap.index", /*file_size=*/10, + /*row_count=*/5, /*dv_ranges=*/std::nullopt, + /*external_path=*/"/tmp/external-path/index/bitmap.index", + /*global_index_meta=*/std::nullopt); + ASSERT_EQ(index_path_factory->ToPath(index_file_meta), + "/tmp/external-path/index/bitmap.index"); + } } } // namespace paimon::test diff --git a/src/paimon/core/utils/index_file_path_factories_test.cpp b/src/paimon/core/utils/index_file_path_factories_test.cpp index 38d0d676..6a9636a3 100644 --- a/src/paimon/core/utils/index_file_path_factories_test.cpp +++ b/src/paimon/core/utils/index_file_path_factories_test.cpp @@ -41,6 +41,7 @@ TEST(IndexFilePathFactoriesTest, TestSimple) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/false, pool)); auto uuid = file_store_path_factory->uuid_; auto factories = std::make_shared(file_store_path_factory); @@ -88,6 +89,7 @@ TEST(IndexFilePathFactoriesTest, TestWithExternalPath) { /*data_file_prefix=*/"data-", /*legacy_partition_name_enabled=*/true, /*external_paths=*/{"/tmp/external-path"}, + /*global_index_external_path=*/std::nullopt, /*index_file_in_data_file_dir=*/true, pool)); auto uuid = file_store_path_factory->uuid_; auto factories = std::make_shared(file_store_path_factory); diff --git a/src/paimon/global_index/lumina/lumina_global_index.cpp b/src/paimon/global_index/lumina/lumina_global_index.cpp index 528f9e53..1c5b1580 100644 --- a/src/paimon/global_index/lumina/lumina_global_index.cpp +++ b/src/paimon/global_index/lumina/lumina_global_index.cpp @@ -124,7 +124,7 @@ Result> LuminaGlobalIndex::CreateReader( auto searcher = std::make_unique<::lumina::api::LuminaSearcher>(std::move(lumina_searcher)); // get input stream and open index PAIMON_ASSIGN_OR_RAISE(std::shared_ptr in, - file_manager->GetInputStream(io_meta.file_name)); + file_manager->GetInputStream(io_meta.file_path)); auto lumina_file_reader = std::make_unique(in); PAIMON_RETURN_NOT_OK_FROM_LUMINA( searcher->Open(std::move(lumina_file_reader), ::lumina::api::IOOptions())); @@ -264,7 +264,8 @@ Result> LuminaIndexWriter::Finish() { PAIMON_RETURN_NOT_OK_FROM_LUMINA(builder.Dump(std::move(file_writer), io_options_)); // prepare GlobalIndexIOMeta PAIMON_ASSIGN_OR_RAISE(int64_t file_size, file_manager_->GetFileSize(index_file_name)); - GlobalIndexIOMeta meta(index_file_name, file_size, /*range_end=*/count_ - 1, + GlobalIndexIOMeta meta(file_manager_->ToPath(index_file_name), file_size, + /*range_end=*/count_ - 1, /*metadata=*/nullptr); return std::vector({meta}); } diff --git a/src/paimon/global_index/lumina/lumina_global_index_test.cpp b/src/paimon/global_index/lumina/lumina_global_index_test.cpp index 8bfadd54..c9bb691b 100644 --- a/src/paimon/global_index/lumina/lumina_global_index_test.cpp +++ b/src/paimon/global_index/lumina/lumina_global_index_test.cpp @@ -86,8 +86,9 @@ class LuminaGlobalIndexTest : public ::testing::Test { PAIMON_ASSIGN_OR_RAISE(auto result_metas, global_writer->Finish()); // check meta EXPECT_EQ(result_metas.size(), 1); - EXPECT_TRUE(StringUtils::StartsWith(result_metas[0].file_name, "lumina-global-index-")); - EXPECT_TRUE(StringUtils::EndsWith(result_metas[0].file_name, ".index")); + auto file_name = PathUtil::GetName(result_metas[0].file_path); + EXPECT_TRUE(StringUtils::StartsWith(file_name, "lumina-global-index-")); + EXPECT_TRUE(StringUtils::EndsWith(file_name, ".index")); EXPECT_EQ(result_metas[0].range_end, expected_range.to); EXPECT_FALSE(result_metas[0].metadata); return result_metas[0]; @@ -260,70 +261,60 @@ TEST_F(LuminaGlobalIndexTest, TestInvalidInputs) { ASSERT_NOK_WITH_MSG(CreateGlobalIndexReader(index_root, data_type_, options, fake_meta), "convert key lumina.index.dimension, value xxx to unsigned int failed"); } + // invalid inputs in write { - // invalid inputs in write - { - auto data_type = arrow::int32(); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), - "arrow schema must be struct type when create LuminaIndexWriter"); - } - { - auto data_type = arrow::struct_({arrow::field("f1", arrow::list(arrow::float32()))}); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), - "field f0 not exist in arrow schema when create LuminaIndexWriter"); - } - { - auto data_type = arrow::struct_({arrow::field("f0", arrow::float32())}); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), - "field type must be list[float] when create LuminaIndexWriter"); - } - { - auto data_type = arrow::struct_({arrow::field("f0", arrow::list(arrow::float64()))}); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), - "field type must be list[float] when create LuminaIndexWriter"); - } - { - std::shared_ptr array = - arrow::ipc::internal::json::ArrayFromJSON(data_type_, - R"([ + auto data_type = arrow::int32(); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), + "arrow schema must be struct type when create LuminaIndexWriter"); + } + { + auto data_type = arrow::struct_({arrow::field("f1", arrow::list(arrow::float32()))}); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), + "field f0 not exist in arrow schema when create LuminaIndexWriter"); + } + { + auto data_type = arrow::struct_({arrow::field("f0", arrow::float32())}); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), + "field type must be list[float] when create LuminaIndexWriter"); + } + { + auto data_type = arrow::struct_({arrow::field("f0", arrow::list(arrow::float64()))}); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type, options_, array_, Range(0, 3)), + "field type must be list[float] when create LuminaIndexWriter"); + } + { + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, + R"([ [[0.0, 0.0, 0.0, 0.0]], null ])") - .ValueOrDie(); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), - "arrow_array in LuminaIndexWriter is invalid, must not null"); - } - { - std::shared_ptr array = - arrow::ipc::internal::json::ArrayFromJSON(data_type_, - R"([ + .ValueOrDie(); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), + "arrow_array in LuminaIndexWriter is invalid, must not null"); + } + { + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, + R"([ [[0.0, 0.0, 0.0, 0.0]], [[0.0, 1.0, 0.0, null]] ])") - .ValueOrDie(); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), - "field value array in LuminaIndexWriter is invalid, must not null"); - } - { - std::shared_ptr array = - arrow::ipc::internal::json::ArrayFromJSON(data_type_, - R"([ + .ValueOrDie(); + ASSERT_NOK_WITH_MSG(WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), + "field value array in LuminaIndexWriter is invalid, must not null"); + } + { + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, + R"([ [[0.0, 0.0, 0.0, 0.0]], [[0.0, 1.0, 0.0]] ])") - .ValueOrDie(); - ASSERT_NOK_WITH_MSG( - WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), - "invalid input array in LuminaIndexWriter, length of field array [2] multiplied " - "dimension [4] must match length of field value array [7]"); - } + .ValueOrDie(); + ASSERT_NOK_WITH_MSG( + WriteGlobalIndex(index_root, data_type_, options_, array, Range(0, 2)), + "invalid input array in LuminaIndexWriter, length of field array [2] multiplied " + "dimension [4] must match length of field value array [7]"); } + { // invalid inputs in read auto test_root_dir = paimon::test::UniqueTestDirectory::Create(); @@ -360,7 +351,7 @@ TEST_F(LuminaGlobalIndexTest, TestInvalidInputs) { } { auto fake_meta = meta; - fake_meta.file_name = "non-exist-file"; + fake_meta.file_path = "non-exist-file"; ASSERT_NOK_WITH_MSG( CreateGlobalIndexReader(index_root, data_type_, options_, fake_meta), "non-exist-file\' not exists"); diff --git a/src/paimon/testing/utils/data_generator.cpp b/src/paimon/testing/utils/data_generator.cpp index 1cbd6665..5aeaaa58 100644 --- a/src/paimon/testing/utils/data_generator.cpp +++ b/src/paimon/testing/utils/data_generator.cpp @@ -234,13 +234,15 @@ Result>> DataGenerator::SplitArrayByPar std::vector> row_kinds_holder; auto schema = DataField::ConvertDataFieldsToArrowSchema(fields); - PAIMON_ASSIGN_OR_RAISE(auto path_factory, - FileStorePathFactory::Create( - /*root=*/"/tmp", schema, partition_keys, - /*default_part_value=*/"__DEFAULT_PARTITION__", - /*identifier=*/"orc", /*data_file_prefix=*/"data-", - /*legacy_partition_name_enabled=*/true, std::vector(), - /*index_file_in_data_file_dir=*/false, memory_pool_)); + PAIMON_ASSIGN_OR_RAISE( + auto path_factory, + FileStorePathFactory::Create( + /*root=*/"/tmp", schema, partition_keys, + /*default_part_value=*/"__DEFAULT_PARTITION__", + /*identifier=*/"orc", /*data_file_prefix=*/"data-", + /*legacy_partition_name_enabled=*/true, /*external_paths=*/std::vector(), + /*global_index_external_path=*/std::nullopt, + /*index_file_in_data_file_dir=*/false, memory_pool_)); for (const auto& binary_row : binary_rows) { PAIMON_ASSIGN_OR_RAISE(BinaryRow partition_row, diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index cda5e3b7..712cfe17 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -265,7 +265,8 @@ TEST_P(GlobalIndexTest, TestWriteLuminaIndex) { /*extra_field_ids=*/std::nullopt, /*index_meta=*/nullptr); auto expected_index_file_meta = std::make_shared("lumina", /*file_name=*/"fake_index_file", /*file_size=*/10, - /*row_count=*/4, expected_global_index_meta); + /*row_count=*/4, /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, expected_global_index_meta); DataIncrement expected_data_increment({expected_index_file_meta}); auto expected_commit_message = std::make_shared( /*partition=*/BinaryRow::EmptyRow(), /*bucket=*/0, /*total_buckets=*/std::nullopt, @@ -309,7 +310,8 @@ TEST_P(GlobalIndexTest, TestWriteIndex) { /*extra_field_ids=*/std::nullopt, /*index_meta=*/nullptr); auto expected_index_file_meta = std::make_shared("bitmap", /*file_name=*/"fake_index_file", /*file_size=*/10, - /*row_count=*/8, expected_global_index_meta); + /*row_count=*/8, /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, expected_global_index_meta); DataIncrement expected_data_increment({expected_index_file_meta}); auto expected_commit_message = std::make_shared( /*partition=*/BinaryRow::EmptyRow(), /*bucket=*/0, /*total_buckets=*/std::nullopt, @@ -394,7 +396,8 @@ TEST_P(GlobalIndexTest, TestWriteIndexWithPartition) { /*extra_field_ids=*/std::nullopt, /*index_meta=*/nullptr); auto expected_index_file_meta = std::make_shared( "bitmap", /*file_name=*/"fake_index_file", /*file_size=*/10, - /*row_count=*/expected_range.Count(), expected_global_index_meta); + /*row_count=*/expected_range.Count(), /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, expected_global_index_meta); DataIncrement expected_data_increment({expected_index_file_meta}); auto expected_commit_message = std::make_shared( /*partition=*/expected_partition_row, @@ -1986,6 +1989,81 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { ASSERT_EQ(index_readers.size(), 0); } +TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; + std::map lumina_options = { + {"lumina.dimension", "4"}, + {"lumina.indextype", "bruteforce"}, + {"lumina.distance.metric", "l2"}, + {"lumina.encoding.type", "encoding.rawf32"}, + {"lumina.search.threadcount", "10"}}; + auto schema = arrow::schema(fields); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + auto src_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], +["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], +["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], +["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], +["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], +["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], +["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] + ])") + .ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // write and commit bitmap global index + auto external_dir1 = UniqueTestDirectory::Create("local"); + ASSERT_OK( + WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", + /*options=*/{{"global-index.external-path", "FILE://" + external_dir1->Str()}}, + Range(0, 8))); + + auto external_dir2 = UniqueTestDirectory::Create("local"); + auto lumina_options_with_external_path = lumina_options; + lumina_options_with_external_path["global-index.external-path"] = + "FILE://" + external_dir2->Str(); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", + /*options=*/lumina_options_with_external_path, Range(0, 8))); + + auto read_cols = write_cols; + read_cols.push_back("_INDEX_SCORE"); + auto result_fields = fields; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); + + // test scan and read + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Alice", 5)); + auto vector_search = std::make_shared( + "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, + /*predicate=*/nullptr); + ASSERT_OK_AND_ASSIGN( + auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_options)); + + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] + ])") + .ValueOrDie(); + ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); +} + TEST_P(GlobalIndexTest, TestIOException) { if (GetParam() == "lance") { return; From 7ad9abf2609b8dd3dac458777feda23350bea830 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Mon, 12 Jan 2026 16:37:09 +0800 Subject: [PATCH 2/7] fix --- test/inte/global_index_test.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index 712cfe17..f0c4456e 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -1993,12 +1993,11 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { arrow::FieldVector fields = { arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_options = { - {"lumina.dimension", "4"}, - {"lumina.indextype", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "encoding.rawf32"}, - {"lumina.search.threadcount", "10"}}; + std::map lumina_options = {{"lumina.index.dimension", "4"}, + {"lumina.index.type", "bruteforce"}, + {"lumina.distance.metric", "l2"}, + {"lumina.encoding.type", "rawf32"}, + {"lumina.search.thread_count", "10"}}; auto schema = arrow::schema(fields); std::map options = {{Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, From 1268448efc9a86f7d784907d09df6537b5e6dadc Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Mon, 12 Jan 2026 16:49:53 +0800 Subject: [PATCH 3/7] fix2 From 2a0c49fa5741e86f0f68993ba6e3b9af61867833 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Mon, 12 Jan 2026 16:52:44 +0800 Subject: [PATCH 4/7] fix3 --- src/paimon/core/operation/raw_file_split_read_test.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/paimon/core/operation/raw_file_split_read_test.cpp b/src/paimon/core/operation/raw_file_split_read_test.cpp index 0f0d8bbb..6131347f 100644 --- a/src/paimon/core/operation/raw_file_split_read_test.cpp +++ b/src/paimon/core/operation/raw_file_split_read_test.cpp @@ -495,7 +495,9 @@ TEST_F(RawFileSplitReadTest, TestMatch) { split_read->Match(data_split, /*force_keep_delete=*/false)); ASSERT_FALSE(match_result); } - { ASSERT_NOK(split_read->Match(nullptr, /*force_keep_delete=*/false)); } + { + ASSERT_NOK(split_read->Match(nullptr, /*force_keep_delete=*/false)); + } } } // namespace paimon::test From 4d24776e57d1845ff4ecccd9291135460f33b308 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Tue, 13 Jan 2026 14:06:53 +0800 Subject: [PATCH 5/7] fix2 --- include/paimon/defs.h | 4 ++-- src/paimon/core/utils/file_store_path_factory.cpp | 3 ++- src/paimon/core/utils/file_store_path_factory_test.cpp | 8 +++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/paimon/defs.h b/include/paimon/defs.h index 98763238..59a0ab5a 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -284,8 +284,8 @@ struct PAIMON_EXPORT Options { static const char BLOB_AS_DESCRIPTOR[]; /// "global-index.enabled" - Whether to enable global index for scan. Default value is "true". static const char GLOBAL_INDEX_ENABLED[]; - /// "global-index.external-path" - The external path where the global index will be - /// written. + /// "global-index.external-path" - Global index root directory, if not set, the global index + /// files will be stored under the /index. static const char GLOBAL_INDEX_EXTERNAL_PATH[]; }; diff --git a/src/paimon/core/utils/file_store_path_factory.cpp b/src/paimon/core/utils/file_store_path_factory.cpp index fcce4a11..a3d1a73a 100644 --- a/src/paimon/core/utils/file_store_path_factory.cpp +++ b/src/paimon/core/utils/file_store_path_factory.cpp @@ -157,6 +157,7 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac std::string NewPath() const override { return factory_->NewIndexFile(); } + std::string ToPath(const std::shared_ptr& file) const override { const auto& external_path = file->ExternalPath(); if (external_path) { @@ -168,7 +169,7 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac std::string ToPath(const std::string& file_name) const override { const auto& external_path = factory_->GetGlobalIndexExternalPath(); if (external_path) { - return PathUtil::JoinPath(factory_->IndexPath(external_path.value()), file_name); + return PathUtil::JoinPath(external_path.value(), file_name); } return PathUtil::JoinPath(factory_->IndexPath(factory_->RootPath()), file_name); } diff --git a/src/paimon/core/utils/file_store_path_factory_test.cpp b/src/paimon/core/utils/file_store_path_factory_test.cpp index ed2bd716..bc8825bd 100644 --- a/src/paimon/core/utils/file_store_path_factory_test.cpp +++ b/src/paimon/core/utils/file_store_path_factory_test.cpp @@ -481,17 +481,15 @@ TEST_F(FileStorePathFactoryTest, TestCreateIndexFileFactory) { ASSERT_OK_AND_ASSIGN( auto index_path_factory, file_store_path_factory->CreateIndexFileFactory(partition, /*bucket=*/2)); - ASSERT_EQ(index_path_factory->ToPath("bitmap.index"), - "/tmp/external-path/index/bitmap.index"); + ASSERT_EQ(index_path_factory->ToPath("bitmap.index"), "/tmp/external-path/bitmap.index"); ASSERT_TRUE(index_path_factory->IsExternalPath()); auto index_file_meta = std::make_shared( /*index_type=*/"bitmap", /*file_name=*/"bitmap.index", /*file_size=*/10, /*row_count=*/5, /*dv_ranges=*/std::nullopt, - /*external_path=*/"/tmp/external-path/index/bitmap.index", + /*external_path=*/"/tmp/external-path/bitmap.index", /*global_index_meta=*/std::nullopt); - ASSERT_EQ(index_path_factory->ToPath(index_file_meta), - "/tmp/external-path/index/bitmap.index"); + ASSERT_EQ(index_path_factory->ToPath(index_file_meta), "/tmp/external-path/bitmap.index"); } } } // namespace paimon::test From 822a436c5e1a276034e3e73e51e0ec8c862b400d Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Tue, 13 Jan 2026 14:13:43 +0800 Subject: [PATCH 6/7] fix3 --- src/paimon/core/utils/file_store_path_factory.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/paimon/core/utils/file_store_path_factory.cpp b/src/paimon/core/utils/file_store_path_factory.cpp index a3d1a73a..b807c2b8 100644 --- a/src/paimon/core/utils/file_store_path_factory.cpp +++ b/src/paimon/core/utils/file_store_path_factory.cpp @@ -157,7 +157,6 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac std::string NewPath() const override { return factory_->NewIndexFile(); } - std::string ToPath(const std::shared_ptr& file) const override { const auto& external_path = file->ExternalPath(); if (external_path) { @@ -165,7 +164,6 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac } return PathUtil::JoinPath(factory_->IndexPath(factory_->RootPath()), file->FileName()); } - std::string ToPath(const std::string& file_name) const override { const auto& external_path = factory_->GetGlobalIndexExternalPath(); if (external_path) { @@ -173,7 +171,6 @@ std::unique_ptr FileStorePathFactory::CreateGlobalIndexFileFac } return PathUtil::JoinPath(factory_->IndexPath(factory_->RootPath()), file_name); } - bool IsExternalPath() const override { return factory_->GetGlobalIndexExternalPath() != std::nullopt; } From cc13a353f2633a7d86120e83ab8497651c0e86c2 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Tue, 13 Jan 2026 14:32:59 +0800 Subject: [PATCH 7/7] fix4 --- include/paimon/defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/paimon/defs.h b/include/paimon/defs.h index 59a0ab5a..c862bdd7 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -285,7 +285,7 @@ struct PAIMON_EXPORT Options { /// "global-index.enabled" - Whether to enable global index for scan. Default value is "true". static const char GLOBAL_INDEX_ENABLED[]; /// "global-index.external-path" - Global index root directory, if not set, the global index - /// files will be stored under the /index. + /// files will be stored under the index directory. static const char GLOBAL_INDEX_EXTERNAL_PATH[]; };