diff --git a/include/paimon/catalog/catalog.h b/include/paimon/catalog/catalog.h index 693c3d18..19e74004 100644 --- a/include/paimon/catalog/catalog.h +++ b/include/paimon/catalog/catalog.h @@ -23,6 +23,7 @@ #include "paimon/catalog/identifier.h" #include "paimon/result.h" +#include "paimon/schema/schema.h" #include "paimon/status.h" #include "paimon/type_fwd.h" #include "paimon/visibility.h" @@ -93,6 +94,16 @@ class PAIMON_EXPORT Catalog { /// @return A result containing a vector of table names in the specified database, or an error /// status. virtual Result> ListTables(const std::string& db_name) const = 0; + + /// Loads the latest schema of a specified table. + /// + /// @note System tables will not be supported. + /// + /// @param identifier The identifier (database and table name) of the table to load. + /// @return A result containing table schema if the table exists, or std::nullopt if it + /// doesn't, or an error status on failure. + virtual Result>> LoadTableSchema( + const Identifier& identifier) const = 0; }; } // namespace paimon diff --git a/include/paimon/schema/schema.h b/include/paimon/schema/schema.h new file mode 100644 index 00000000..55a38b19 --- /dev/null +++ b/include/paimon/schema/schema.h @@ -0,0 +1,81 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "paimon/result.h" +#include "paimon/visibility.h" + +struct ArrowSchema; + +namespace paimon { + +/// This interface provides access to TableSchema-related information. +class PAIMON_EXPORT Schema { + public: + virtual ~Schema() = default; + + /// Get the Arrow C schema representation of this table schema. + /// @return A result containing an ArrowSchema, or an error status if conversion fails. + virtual Result> GetArrowSchema() const = 0; + + /// Get the names of all fields in the table schema. + /// @return A vector of field names. + virtual std::vector FieldNames() const = 0; + + /// Get the unique identifier of this table schema. + /// @return The schema ID + virtual int64_t Id() const = 0; + + /// Get the list of primary key field names. + /// @return A reference to the vector of primary key names; empty if no primary keys are + /// defined. + virtual const std::vector& PrimaryKeys() const = 0; + + /// Get the list of partition key field names. + /// @return A reference to the vector of partition key names; empty if the table is not + /// partitioned. + virtual const std::vector& PartitionKeys() const = 0; + + /// Get the list of bucket key field names used for bucketing. + /// @return A reference to the vector of bucket key names. + virtual const std::vector& BucketKeys() const = 0; + + /// Get the number of buckets configured for this table. + /// @return The number of buckets. + virtual int32_t NumBuckets() const = 0; + + /// Get the highest field ID assigned in this schema. + /// @return The maximum field ID. + virtual int32_t HighestFieldId() const = 0; + + /// Get the table-level options associated with this schema. + /// @return A reference to the map of option key-value pairs (e.g., file format, filesystem). + virtual const std::map& Options() const = 0; + + /// Get an optional comment describing the table. + /// @return The table comment if set, or std::nullopt otherwise. + virtual std::optional Comment() const = 0; +}; + +} // namespace paimon diff --git a/src/paimon/core/catalog/file_system_catalog.cpp b/src/paimon/core/catalog/file_system_catalog.cpp index 01b203d7..174b5b32 100644 --- a/src/paimon/core/catalog/file_system_catalog.cpp +++ b/src/paimon/core/catalog/file_system_catalog.cpp @@ -27,6 +27,7 @@ #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/string_utils.h" +#include "paimon/core/schema/schema_impl.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/fs/file_system.h" #include "paimon/logging.h" @@ -211,4 +212,19 @@ Result FileSystemCatalog::TableExistsInFileSystem(const std::string& table } } +Result>> FileSystemCatalog::LoadTableSchema( + const Identifier& identifier) const { + if (IsSystemTable(identifier)) { + return Status::NotImplemented("do not support loading schema for system table."); + } + SchemaManager schema_manager(fs_, NewDataTablePath(warehouse_, identifier)); + PAIMON_ASSIGN_OR_RAISE(std::optional> latest_schema, + schema_manager.Latest()); + if (latest_schema.has_value()) { + std::shared_ptr schema = std::make_shared(*latest_schema); + return std::optional>(schema); + } + return std::optional>(); +} + } // namespace paimon diff --git a/src/paimon/core/catalog/file_system_catalog.h b/src/paimon/core/catalog/file_system_catalog.h index 1170a66a..976d13c5 100644 --- a/src/paimon/core/catalog/file_system_catalog.h +++ b/src/paimon/core/catalog/file_system_catalog.h @@ -49,6 +49,8 @@ class FileSystemCatalog : public Catalog { Result> ListDatabases() const override; Result> ListTables(const std::string& database_names) const override; + Result>> LoadTableSchema( + const Identifier& identifier) const override; private: static std::string NewDatabasePath(const std::string& warehouse, const std::string& db_name); diff --git a/src/paimon/core/catalog/file_system_catalog_test.cpp b/src/paimon/core/catalog/file_system_catalog_test.cpp index a0846115..c83b70c5 100644 --- a/src/paimon/core/catalog/file_system_catalog_test.cpp +++ b/src/paimon/core/catalog/file_system_catalog_test.cpp @@ -180,6 +180,12 @@ TEST(FileSystemCatalogTest, TestCreateTableWithBlob) { ASSERT_OK_AND_ASSIGN(std::vector table_names, catalog.ListTables("db1")); ASSERT_EQ(1, table_names.size()); ASSERT_EQ(table_names[0], "tbl1"); + ASSERT_OK_AND_ASSIGN(std::optional> table_schema, + catalog.LoadTableSchema(Identifier("db1", "tbl1"))); + ASSERT_TRUE(table_schema.has_value()); + ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema()); + auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie(); + ASSERT_TRUE(typed_schema.Equals(loaded_schema)); ArrowSchemaRelease(&schema); } @@ -309,4 +315,54 @@ TEST(FileSystemCatalogTest, TestInvalidList) { "do not support listing tables for system database."); } +TEST(FileSystemCatalogTest, TestValidateTableSchema) { + std::map options; + options[Options::FILE_SYSTEM] = "local"; + options[Options::FILE_FORMAT] = "orc"; + ASSERT_OK_AND_ASSIGN(auto core_options, CoreOptions::FromMap(options)); + auto dir = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + FileSystemCatalog catalog(core_options.GetFileSystem(), dir->Str()); + ASSERT_OK(catalog.CreateDatabase("db1", options, /*ignore_if_exists=*/true)); + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", arrow::int32()), + arrow::field("f2", arrow::int32()), + arrow::field("f3", arrow::float64()), + }; + arrow::Schema typed_schema(fields); + ::ArrowSchema schema; + ASSERT_TRUE(arrow::ExportSchema(typed_schema, &schema).ok()); + ASSERT_OK(catalog.CreateTable(Identifier("db1", "tbl1"), &schema, {"f1"}, {}, options, + /*ignore_if_exists=*/false)); + + ASSERT_OK_AND_ASSIGN(std::optional> table_schema, + catalog.LoadTableSchema(Identifier("db0", "tbl0"))); + ASSERT_FALSE(table_schema.has_value()); + ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1"))); + ASSERT_TRUE(table_schema.has_value()); + ASSERT_EQ(0, (*table_schema)->Id()); + ASSERT_EQ(3, (*table_schema)->HighestFieldId()); + ASSERT_EQ(1, (*table_schema)->PartitionKeys().size()); + ASSERT_EQ(0, (*table_schema)->PrimaryKeys().size()); + ASSERT_EQ(-1, (*table_schema)->NumBuckets()); + ASSERT_FALSE((*table_schema)->Comment().has_value()); + std::vector field_names = (*table_schema)->FieldNames(); + std::vector expected_field_names = {"f0", "f1", "f2", "f3"}; + ASSERT_EQ(field_names, expected_field_names); + + ASSERT_OK_AND_ASSIGN(auto arrow_schema, (*table_schema)->GetArrowSchema()); + auto loaded_schema = arrow::ImportSchema(arrow_schema.get()).ValueOrDie(); + ASSERT_TRUE(typed_schema.Equals(loaded_schema)); + + ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFactory::Get("local", dir->Str(), {})); + ASSERT_OK(fs->Delete(PathUtil::JoinPath(dir->Str(), "db1.db/tbl1/schema/schema-0"))); + ASSERT_OK_AND_ASSIGN(table_schema, catalog.LoadTableSchema(Identifier("db1", "tbl1"))); + ASSERT_FALSE(table_schema.has_value()); + + ASSERT_NOK_WITH_MSG(catalog.LoadTableSchema(Identifier("db1", "tbl$11")), + "do not support loading schema for system table."); + ArrowSchemaRelease(&schema); +} + } // namespace paimon::test diff --git a/src/paimon/core/schema/schema_impl.h b/src/paimon/core/schema/schema_impl.h new file mode 100644 index 00000000..16d3227d --- /dev/null +++ b/src/paimon/core/schema/schema_impl.h @@ -0,0 +1,67 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/core/schema/table_schema.h" + +namespace paimon { + +class SchemaImpl : public Schema { + public: + explicit SchemaImpl(const std::shared_ptr& table_schema) + : table_schema_(table_schema) {} + Result> GetArrowSchema() const override { + return table_schema_->GetArrowSchema(); + } + std::vector FieldNames() const override { + return table_schema_->FieldNames(); + } + int64_t Id() const override { + return table_schema_->Id(); + } + const std::vector& PrimaryKeys() const override { + return table_schema_->PrimaryKeys(); + } + const std::vector& PartitionKeys() const override { + return table_schema_->PartitionKeys(); + } + const std::vector& BucketKeys() const override { + return table_schema_->BucketKeys(); + } + int32_t NumBuckets() const override { + return table_schema_->NumBuckets(); + } + int32_t HighestFieldId() const override { + return table_schema_->HighestFieldId(); + } + const std::map& Options() const override { + return table_schema_->Options(); + } + std::optional Comment() const override { + return table_schema_->Comment(); + } + + private: + std::shared_ptr table_schema_; +}; + +} // namespace paimon diff --git a/src/paimon/core/schema/table_schema.cpp b/src/paimon/core/schema/table_schema.cpp index 6195067c..bb92fd29 100644 --- a/src/paimon/core/schema/table_schema.cpp +++ b/src/paimon/core/schema/table_schema.cpp @@ -22,6 +22,7 @@ #include #include "arrow/api.h" +#include "arrow/c/bridge.h" #include "arrow/util/checked_cast.h" #include "fmt/format.h" #include "paimon/common/utils/arrow/status_utils.h" @@ -333,4 +334,11 @@ bool TableSchema::CrossPartitionUpdate() const { return !ObjectUtils::ContainsAll(primary_keys_, partition_keys_); } +Result> TableSchema::GetArrowSchema() const { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields_); + auto arrow_schema = std::make_unique<::ArrowSchema>(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*schema, arrow_schema.get())); + return arrow_schema; +} + } // namespace paimon diff --git a/src/paimon/core/schema/table_schema.h b/src/paimon/core/schema/table_schema.h index cbbfe5fe..d75c1e86 100644 --- a/src/paimon/core/schema/table_schema.h +++ b/src/paimon/core/schema/table_schema.h @@ -92,12 +92,14 @@ class TableSchema : public Jsonizable { Result> TrimmedPrimaryKeys() const; - std::optional Commit() const { + std::optional Comment() const { return comment_; } bool CrossPartitionUpdate() const; + Result> GetArrowSchema() const; + private: JSONIZABLE_FRIEND_AND_DEFAULT_CTOR(TableSchema);