Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions include/paimon/global_index/global_index_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,15 @@ class PAIMON_EXPORT GlobalIndexScan {
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
const Range& range) = 0;

/// Returns the set of row ID ranges covered by this global index.
/// Returns row ID ranges covered by this global index (sorted and non-overlapping
/// ranges).
///
/// Each `Range` represents a contiguous segment of row IDs for which global index
/// data exists. This allows the query engine to parallelize scanning and be aware
/// of ranges that are not covered by any global index.
///
/// @return A `Result` containing a set of non-overlapping `Range` objects.
virtual Result<std::set<Range>> GetRowRangeList() = 0;
/// @return A `Result` containing sorted and non-overlapping `Range` objects.
virtual Result<std::vector<Range>> GetRowRangeList() = 0;
};

} // namespace paimon
45 changes: 45 additions & 0 deletions include/paimon/global_index/indexed_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright 2025-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/table/source/data_split.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

namespace paimon {
/// Indexed split for global index reading operation.
class PAIMON_EXPORT IndexedSplit : public Split {
public:
/// @returns The underlying physical data split containing actual data file details.
virtual std::shared_ptr<DataSplit> GetDataSplit() const = 0;

/// @returns A list of row intervals [start, end] indicating which rows
/// are relevant (e.g., passed predicate pushdown).
virtual const std::vector<Range>& RowRanges() const = 0;

/// @returns A score for **each individual row** included in `RowRanges()`,
/// in the order they appear when traversing the ranges.
virtual const std::vector<float>& Scores() const = 0;
};
} // namespace paimon
8 changes: 4 additions & 4 deletions include/paimon/global_index/row_range_global_index_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#include <map>
#include <string>

#include "paimon/global_index/indexed_split.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/data_split.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

Expand All @@ -35,8 +35,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
/// @param table_path Path to the table root directory where index files are stored.
/// @param field_name Name of the indexed column (must be present in the table schema).
/// @param index_type Type of global index to build (e.g., "bitmap", "lumina").
/// @param split The data split (e.g., Parquet file) containing the actual data.
/// @param range Row ID range [from, to] for data to build index.
/// @param index_split The indexed split containing the actual data (e.g., Parquet file) and
// row id range [from, to] for data to build index.
/// The range must be fully contained within the data covered
/// by the given `split`.
/// @param options Index-specific configuration (e.g., false positive rate for bloom
Expand All @@ -47,7 +47,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
/// or an error if indexing fails (e.g., unsupported type, I/O error).
static Result<std::shared_ptr<CommitMessage>> WriteIndex(
const std::string& table_path, const std::string& field_name, const std::string& index_type,
const std::shared_ptr<DataSplit>& split, const Range& range,
const std::shared_ptr<IndexedSplit>& indexed_split,
const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
};

Expand Down
22 changes: 2 additions & 20 deletions include/paimon/read_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include "paimon/predicate/predicate.h"
#include "paimon/result.h"
#include "paimon/type_fwd.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

namespace paimon {
Expand All @@ -43,8 +42,8 @@ class PAIMON_EXPORT ReadContext {
public:
ReadContext(const std::string& path, const std::string& branch,
const std::vector<std::string>& read_schema,
const std::shared_ptr<Predicate>& predicate, const std::vector<Range>& row_ranges,
bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count,
const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
bool enable_prefetch, uint32_t prefetch_batch_count,
uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch,
uint32_t row_to_batch_thread_number, const std::optional<std::string>& table_schema,
const std::shared_ptr<MemoryPool>& memory_pool,
Expand Down Expand Up @@ -77,10 +76,6 @@ class PAIMON_EXPORT ReadContext {
return predicate_;
}

const std::vector<Range>& GetRowRanges() const {
return row_ranges_;
}

bool EnablePredicateFilter() const {
return enable_predicate_filter_;
}
Expand Down Expand Up @@ -114,7 +109,6 @@ class PAIMON_EXPORT ReadContext {
std::string branch_;
std::vector<std::string> read_schema_;
std::shared_ptr<Predicate> predicate_;
std::vector<Range> row_ranges_;
bool enable_predicate_filter_;
bool enable_prefetch_;
uint32_t prefetch_batch_count_;
Expand Down Expand Up @@ -273,18 +267,6 @@ class PAIMON_EXPORT ReadContextBuilder {
ReadContextBuilder& WithFileSystemSchemeToIdentifierMap(
const std::map<std::string, std::string>& fs_scheme_to_identifier_map);

/// Set specific row ranges to read for targeted data access.
///
/// This is primarily used in data evolution scenarios where only specific rows
/// need to be read. File ranges that do not intersect with the specified row ranges
/// will be filtered out, improving performance by avoiding unnecessary I/O.
///
/// @param row_ranges Vector of specific row ranges to read.
/// @return Reference to this builder for method chaining.
/// @note If not set, all rows in the selected files will be returned.
/// @note This is commonly used in data evolution mode for selective reading.
ReadContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);

/// Build and return a `ReadContext` instance with input validation.
/// @return Result containing the constructed `ReadContext` or an error status.
Result<std::unique_ptr<ReadContext>> Finish();
Expand Down
34 changes: 4 additions & 30 deletions include/paimon/table/source/data_split.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,42 +26,15 @@
#include "paimon/data/timestamp.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// Input splits for read operation. Needed by most batch computation engines. Support Serialize and
/// Deserialize, compatible with java version.
class PAIMON_EXPORT DataSplit {
/// Input data split for reading operation. Needed by most batch computation engines.
class PAIMON_EXPORT DataSplit : public Split {
public:
virtual ~DataSplit() = default;

/// Deserialize a `DataSplit` from a binary buffer.
///
/// Creates a `DataSplit` instance from its serialized binary representation.
/// This is typically used in distributed computing scenarios where splits
/// are transmitted between different nodes or processes.
///
/// @param buffer Const pointer to the binary data containing the serialized `DataSplit`.
/// @param length Size of the buffer in bytes.
/// @param pool Memory pool for allocating objects during deserialization.
/// @return Result containing the deserialized `DataSplit` or an error status.
static Result<std::shared_ptr<DataSplit>> Deserialize(const char* buffer, size_t length,
const std::shared_ptr<MemoryPool>& pool);

/// Serialize a `DataSplit` to a binary string.
///
/// Converts a `DataSplit` instance to its binary representation for storage
/// or transmission. The serialized data can later be deserialized using
/// the Deserialize method.
///
/// @param data_split The `DataSplit` instance to serialize.
/// @param pool Memory pool for allocating temporary objects during serialization.
/// @return Result containing the serialized binary data as a string or an error status.
static Result<std::string> Serialize(const std::shared_ptr<DataSplit>& data_split,
const std::shared_ptr<MemoryPool>& pool);

/// Metadata structure for simple data files.
///
/// Contains essential information about a data file including its location,
Expand Down Expand Up @@ -97,6 +70,7 @@ class PAIMON_EXPORT DataSplit {
std::optional<int64_t> delete_row_count;

bool operator==(const SimpleDataFileMeta& other) const;

std::string ToString() const;
};

Expand Down
4 changes: 2 additions & 2 deletions include/paimon/table/source/plan.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
#include <optional>
#include <vector>

#include "paimon/table/source/data_split.h"
#include "paimon/table/source/split.h"

namespace paimon {
/// %Result plan of this `TableScan`.
class PAIMON_EXPORT Plan {
public:
virtual ~Plan() = default;
/// %Result splits.
virtual const std::vector<std::shared_ptr<DataSplit>>& Splits() const = 0;
virtual const std::vector<std::shared_ptr<Split>>& Splits() const = 0;
/// Snapshot id of this plan, return `std::nullopt` if the table is empty.
virtual std::optional<int64_t> SnapshotId() const = 0;
};
Expand Down
66 changes: 66 additions & 0 deletions include/paimon/table/source/split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright 2025-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// An input split for reading operation. Needed by most batch computation engines. Support
/// Serialize and Deserialize, compatible with java version.
/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit`
/// (for reads leveraging global indexes).
class PAIMON_EXPORT Split {
public:
virtual ~Split() = default;

/// Deserialize a `Split` from a binary buffer.
///
/// Creates a `Split` instance from its serialized binary representation.
/// This is typically used in distributed computing scenarios where splits
/// are transmitted between different nodes or processes.
///
/// @param buffer Const pointer to the binary data containing the serialized `Split`.
/// @param length Size of the buffer in bytes.
/// @param pool Memory pool for allocating objects during deserialization.
/// @return Result containing the deserialized `Split` or an error status.
static Result<std::shared_ptr<Split>> Deserialize(const char* buffer, size_t length,
const std::shared_ptr<MemoryPool>& pool);

/// Serialize a `Split` to a binary string.
///
/// Converts a `Split` instance to its binary representation for storage
/// or transmission. The serialized data can later be deserialized using
/// the Deserialize method.
///
/// @param split The `Split` instance to serialize.
/// @param pool Memory pool for allocating temporary objects during serialization.
/// @return Result containing the serialized binary data as a string or an error status.
static Result<std::string> Serialize(const std::shared_ptr<Split>& split,
const std::shared_ptr<MemoryPool>& pool);
};
} // namespace paimon
17 changes: 8 additions & 9 deletions include/paimon/table/source/table_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,14 @@
#include "paimon/read_context.h"
#include "paimon/reader/batch_reader.h"
#include "paimon/result.h"
#include "paimon/table/source/data_split.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class DataSplit;
class MemoryPool;
class ReadContext;

/// Given a `DataSplit` or a list of `DataSplit`, generate a reader for batch reading.
/// Given a `Split` or a list of `Split`, generate a reader for batch reading.
class PAIMON_EXPORT TableRead {
public:
virtual ~TableRead() = default;
Expand All @@ -46,21 +45,21 @@ class PAIMON_EXPORT TableRead {
/// Creates a `BatchReader` instance for reading data.
///
/// This method creates a BatchReader that will be responsible for reading data from the
/// provided data splits.
/// provided splits.
///
/// @param data_splits A vector of shared pointers to `DataSplit` instances representing the
/// @param splits A vector of shared pointers to `Split` instances representing the
/// data to be read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::vector<std::shared_ptr<DataSplit>>& data_splits);
const std::vector<std::shared_ptr<Split>>& splits);

/// Creates a `BatchReader` instance for a single data split.
/// Creates a `BatchReader` instance for a single split.
///
/// @param data_split A shared pointer to the `DataSplit` instance that defines the data to be
/// @param split A shared pointer to the `Split` instance that defines the data to be
/// read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::shared_ptr<DataSplit>& data_split) = 0;
const std::shared_ptr<Split>& split) = 0;

protected:
explicit TableRead(const std::shared_ptr<MemoryPool>& memory_pool);
Expand Down
15 changes: 15 additions & 0 deletions include/paimon/utils/range.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once
#include <optional>
#include <string>
#include <vector>

#include "paimon/visibility.h"

Expand All @@ -28,9 +29,23 @@ struct PAIMON_EXPORT Range {
/// Returns the number of integers in the range [from, to].
int64_t Count() const;

/// Computes the intersection of two ranges.
static std::optional<Range> Intersection(const Range& left, const Range& right);

/// Checks whether two ranges have any overlap.
static bool HasIntersection(const Range& left, const Range& right);

/// Sorts a list of ranges by `from`, then merges overlapping or adjacent ranges.
/// @param ranges Input vector of ranges to merge.
/// @param adjacent If true, also merges ranges that are adjacent (e.g., [1,3] and [4,5] →
/// [1,5]).
/// If false, only merges strictly overlapping ranges.
/// @return A new vector of non-overlapping, sorted ranges.
static std::vector<Range> SortAndMergeOverlap(const std::vector<Range>& ranges, bool adjacent);

/// Computes the set intersection of two collections of disjoint, sorted ranges.
static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);

bool operator==(const Range& other) const;
bool operator<(const Range& other) const;

Expand Down
Loading
Loading