alibaba · lszskye · Dec 17, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/include/paimon/global_index/global_index_scan.h b/include/paimon/global_index/global_index_scan.h
@@ -63,14 +63,15 @@ class PAIMON_EXPORT GlobalIndexScan {
     virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
         const Range& range) = 0;
 
-    /// Returns the set of row ID ranges covered by this global index.
+    /// Returns row ID ranges covered by this global index (sorted and non-overlapping
+    /// ranges).
     ///
     /// Each `Range` represents a contiguous segment of row IDs for which global index
     /// data exists. This allows the query engine to parallelize scanning and be aware
     /// of ranges that are not covered by any global index.
     ///
-    /// @return A `Result` containing a set of non-overlapping `Range` objects.
-    virtual Result<std::set<Range>> GetRowRangeList() = 0;
+    /// @return A `Result` containing sorted and non-overlapping `Range` objects.
+    virtual Result<std::vector<Range>> GetRowRangeList() = 0;
 };
 
 }  // namespace paimon
diff --git a/include/paimon/global_index/indexed_split.h b/include/paimon/global_index/indexed_split.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2025-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "paimon/table/source/data_split.h"
+#include "paimon/utils/range.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+/// Indexed split for global index reading operation.
+class PAIMON_EXPORT IndexedSplit : public Split {
+ public:
+    /// @returns The underlying physical data split containing actual data file details.
+    virtual std::shared_ptr<DataSplit> GetDataSplit() const = 0;
+
+    /// @returns A list of row intervals [start, end] indicating which rows
+    ///          are relevant (e.g., passed predicate pushdown).
+    virtual const std::vector<Range>& RowRanges() const = 0;
+
+    /// @returns A score for **each individual row** included in `RowRanges()`,
+    ///          in the order they appear when traversing the ranges.
+    virtual const std::vector<float>& Scores() const = 0;
+};
+}  // namespace paimon
diff --git a/include/paimon/global_index/row_range_global_index_writer.h b/include/paimon/global_index/row_range_global_index_writer.h
@@ -18,9 +18,9 @@
 #include <map>
 #include <string>
 
+#include "paimon/global_index/indexed_split.h"
 #include "paimon/memory/memory_pool.h"
 #include "paimon/result.h"
-#include "paimon/table/source/data_split.h"
 #include "paimon/utils/range.h"
 #include "paimon/visibility.h"
 
@@ -35,8 +35,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
     /// @param table_path   Path to the table root directory where index files are stored.
     /// @param field_name   Name of the indexed column (must be present in the table schema).
     /// @param index_type   Type of global index to build (e.g., "bitmap", "lumina").
-    /// @param split        The data split (e.g., Parquet file) containing the actual data.
-    /// @param range        Row ID range [from, to] for data to build index.
+    /// @param index_split  The indexed split containing the actual data (e.g., Parquet file) and
+    //                      row id range [from, to] for data to build index.
     ///                     The range must be fully contained within the data covered
     ///                     by the given `split`.
     /// @param options      Index-specific configuration (e.g., false positive rate for bloom
@@ -47,7 +47,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
     ///         or an error if indexing fails (e.g., unsupported type, I/O error).
     static Result<std::shared_ptr<CommitMessage>> WriteIndex(
         const std::string& table_path, const std::string& field_name, const std::string& index_type,
-        const std::shared_ptr<DataSplit>& split, const Range& range,
+        const std::shared_ptr<IndexedSplit>& indexed_split,
         const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
 };
 

diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h
@@ -26,7 +26,6 @@
 #include "paimon/predicate/predicate.h"
 #include "paimon/result.h"
 #include "paimon/type_fwd.h"
-#include "paimon/utils/range.h"
 #include "paimon/visibility.h"
 
 namespace paimon {
@@ -43,8 +42,8 @@ class PAIMON_EXPORT ReadContext {
  public:
     ReadContext(const std::string& path, const std::string& branch,
                 const std::vector<std::string>& read_schema,
-                const std::shared_ptr<Predicate>& predicate, const std::vector<Range>& row_ranges,
-                bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count,
+                const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
+                bool enable_prefetch, uint32_t prefetch_batch_count,
                 uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch,
                 uint32_t row_to_batch_thread_number, const std::optional<std::string>& table_schema,
                 const std::shared_ptr<MemoryPool>& memory_pool,
@@ -77,10 +76,6 @@ class PAIMON_EXPORT ReadContext {
         return predicate_;
     }
 
-    const std::vector<Range>& GetRowRanges() const {
-        return row_ranges_;
-    }
-
     bool EnablePredicateFilter() const {
         return enable_predicate_filter_;
     }
@@ -114,7 +109,6 @@ class PAIMON_EXPORT ReadContext {
     std::string branch_;
     std::vector<std::string> read_schema_;
     std::shared_ptr<Predicate> predicate_;
-    std::vector<Range> row_ranges_;
     bool enable_predicate_filter_;
     bool enable_prefetch_;
     uint32_t prefetch_batch_count_;
@@ -273,18 +267,6 @@ class PAIMON_EXPORT ReadContextBuilder {
     ReadContextBuilder& WithFileSystemSchemeToIdentifierMap(
         const std::map<std::string, std::string>& fs_scheme_to_identifier_map);
 
-    /// Set specific row ranges to read for targeted data access.
-    ///
-    /// This is primarily used in data evolution scenarios where only specific rows
-    /// need to be read. File ranges that do not intersect with the specified row ranges
-    /// will be filtered out, improving performance by avoiding unnecessary I/O.
-    ///
-    /// @param row_ranges Vector of specific row ranges to read.
-    /// @return Reference to this builder for method chaining.
-    /// @note If not set, all rows in the selected files will be returned.
-    /// @note This is commonly used in data evolution mode for selective reading.
-    ReadContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);
-
     /// Build and return a `ReadContext` instance with input validation.
     /// @return Result containing the constructed `ReadContext` or an error status.
     Result<std::unique_ptr<ReadContext>> Finish();

diff --git a/include/paimon/table/source/data_split.h b/include/paimon/table/source/data_split.h
@@ -26,42 +26,15 @@
 #include "paimon/data/timestamp.h"
 #include "paimon/memory/memory_pool.h"
 #include "paimon/result.h"
+#include "paimon/table/source/split.h"
 #include "paimon/visibility.h"
 
 namespace paimon {
 class MemoryPool;
 
-/// Input splits for read operation. Needed by most batch computation engines. Support Serialize and
-/// Deserialize, compatible with java version.
-class PAIMON_EXPORT DataSplit {
+/// Input data split for reading operation. Needed by most batch computation engines.
+class PAIMON_EXPORT DataSplit : public Split {
  public:
-    virtual ~DataSplit() = default;
-
-    /// Deserialize a `DataSplit` from a binary buffer.
-    ///
-    /// Creates a `DataSplit` instance from its serialized binary representation.
-    /// This is typically used in distributed computing scenarios where splits
-    /// are transmitted between different nodes or processes.
-    ///
-    /// @param buffer Const pointer to the binary data containing the serialized `DataSplit`.
-    /// @param length Size of the buffer in bytes.
-    /// @param pool Memory pool for allocating objects during deserialization.
-    /// @return Result containing the deserialized `DataSplit` or an error status.
-    static Result<std::shared_ptr<DataSplit>> Deserialize(const char* buffer, size_t length,
-                                                          const std::shared_ptr<MemoryPool>& pool);
-
-    /// Serialize a `DataSplit` to a binary string.
-    ///
-    /// Converts a `DataSplit` instance to its binary representation for storage
-    /// or transmission. The serialized data can later be deserialized using
-    /// the Deserialize method.
-    ///
-    /// @param data_split The `DataSplit` instance to serialize.
-    /// @param pool Memory pool for allocating temporary objects during serialization.
-    /// @return Result containing the serialized binary data as a string or an error status.
-    static Result<std::string> Serialize(const std::shared_ptr<DataSplit>& data_split,
-                                         const std::shared_ptr<MemoryPool>& pool);
-
     /// Metadata structure for simple data files.
     ///
     /// Contains essential information about a data file including its location,
@@ -97,6 +70,7 @@ class PAIMON_EXPORT DataSplit {
         std::optional<int64_t> delete_row_count;
 
         bool operator==(const SimpleDataFileMeta& other) const;
+
         std::string ToString() const;
     };
 

diff --git a/include/paimon/table/source/plan.h b/include/paimon/table/source/plan.h
@@ -20,15 +20,15 @@
 #include <optional>
 #include <vector>
 
-#include "paimon/table/source/data_split.h"
+#include "paimon/table/source/split.h"
 
 namespace paimon {
 /// %Result plan of this `TableScan`.
 class PAIMON_EXPORT Plan {
  public:
     virtual ~Plan() = default;
     /// %Result splits.
-    virtual const std::vector<std::shared_ptr<DataSplit>>& Splits() const = 0;
+    virtual const std::vector<std::shared_ptr<Split>>& Splits() const = 0;
     /// Snapshot id of this plan, return `std::nullopt` if the table is empty.
     virtual std::optional<int64_t> SnapshotId() const = 0;
 };

diff --git a/include/paimon/table/source/split.h b/include/paimon/table/source/split.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2025-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "paimon/memory/memory_pool.h"
+#include "paimon/result.h"
+#include "paimon/visibility.h"
+
+namespace paimon {
+class MemoryPool;
+
+/// An input split for reading operation. Needed by most batch computation engines. Support
+/// Serialize and Deserialize, compatible with java version.
+/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit`
+/// (for reads leveraging global indexes).
+class PAIMON_EXPORT Split {
+ public:
+    virtual ~Split() = default;
+
+    /// Deserialize a `Split` from a binary buffer.
+    ///
+    /// Creates a `Split` instance from its serialized binary representation.
+    /// This is typically used in distributed computing scenarios where splits
+    /// are transmitted between different nodes or processes.
+    ///
+    /// @param buffer Const pointer to the binary data containing the serialized `Split`.
+    /// @param length Size of the buffer in bytes.
+    /// @param pool Memory pool for allocating objects during deserialization.
+    /// @return Result containing the deserialized `Split` or an error status.
+    static Result<std::shared_ptr<Split>> Deserialize(const char* buffer, size_t length,
+                                                      const std::shared_ptr<MemoryPool>& pool);
+
+    /// Serialize a `Split` to a binary string.
+    ///
+    /// Converts a `Split` instance to its binary representation for storage
+    /// or transmission. The serialized data can later be deserialized using
+    /// the Deserialize method.
+    ///
+    /// @param split The `Split` instance to serialize.
+    /// @param pool Memory pool for allocating temporary objects during serialization.
+    /// @return Result containing the serialized binary data as a string or an error status.
+    static Result<std::string> Serialize(const std::shared_ptr<Split>& split,
+                                         const std::shared_ptr<MemoryPool>& pool);
+};
+}  // namespace paimon
diff --git a/include/paimon/table/source/table_read.h b/include/paimon/table/source/table_read.h
@@ -24,15 +24,14 @@
 #include "paimon/read_context.h"
 #include "paimon/reader/batch_reader.h"
 #include "paimon/result.h"
-#include "paimon/table/source/data_split.h"
+#include "paimon/table/source/split.h"
 #include "paimon/visibility.h"
 
 namespace paimon {
-class DataSplit;
 class MemoryPool;
 class ReadContext;
 
-/// Given a `DataSplit` or a list of `DataSplit`, generate a reader for batch reading.
+/// Given a `Split` or a list of `Split`, generate a reader for batch reading.
 class PAIMON_EXPORT TableRead {
  public:
     virtual ~TableRead() = default;
@@ -46,21 +45,21 @@ class PAIMON_EXPORT TableRead {
     /// Creates a `BatchReader` instance for reading data.
     ///
     /// This method creates a BatchReader that will be responsible for reading data from the
-    /// provided data splits.
+    /// provided splits.
     ///
-    /// @param data_splits A vector of shared pointers to `DataSplit` instances representing the
+    /// @param splits A vector of shared pointers to `Split` instances representing the
     ///                    data to be read.
     /// @return A Result containing a unique pointer to the `BatchReader` instance.
     virtual Result<std::unique_ptr<BatchReader>> CreateReader(
-        const std::vector<std::shared_ptr<DataSplit>>& data_splits);
+        const std::vector<std::shared_ptr<Split>>& splits);
 
-    /// Creates a `BatchReader` instance for a single data split.
+    /// Creates a `BatchReader` instance for a single split.
     ///
-    /// @param data_split A shared pointer to the `DataSplit` instance that defines the data to be
+    /// @param split A shared pointer to the `Split` instance that defines the data to be
     ///                   read.
     /// @return A Result containing a unique pointer to the `BatchReader` instance.
     virtual Result<std::unique_ptr<BatchReader>> CreateReader(
-        const std::shared_ptr<DataSplit>& data_split) = 0;
+        const std::shared_ptr<Split>& split) = 0;
 
  protected:
     explicit TableRead(const std::shared_ptr<MemoryPool>& memory_pool);

diff --git a/include/paimon/utils/range.h b/include/paimon/utils/range.h
@@ -17,6 +17,7 @@
 #pragma once
 #include <optional>
 #include <string>
+#include <vector>
 
 #include "paimon/visibility.h"
 
@@ -28,9 +29,23 @@ struct PAIMON_EXPORT Range {
     /// Returns the number of integers in the range [from, to].
     int64_t Count() const;
 
+    /// Computes the intersection of two ranges.
     static std::optional<Range> Intersection(const Range& left, const Range& right);
+
+    /// Checks whether two ranges have any overlap.
     static bool HasIntersection(const Range& left, const Range& right);
 
+    /// Sorts a list of ranges by `from`, then merges overlapping or adjacent ranges.
+    /// @param ranges Input vector of ranges to merge.
+    /// @param adjacent If true, also merges ranges that are adjacent (e.g., [1,3] and [4,5] →
+    /// [1,5]).
+    ///                 If false, only merges strictly overlapping ranges.
+    /// @return A new vector of non-overlapping, sorted ranges.
+    static std::vector<Range> SortAndMergeOverlap(const std::vector<Range>& ranges, bool adjacent);
+
+    /// Computes the set intersection of two collections of disjoint, sorted ranges.
+    static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);
+
     bool operator==(const Range& other) const;
     bool operator<(const Range& other) const;