Skip to content

Commit b7914e4

Browse files
authored
Merge branch 'main' into add-compact-strategy
2 parents e296bf7 + 94b2003 commit b7914e4

File tree

10 files changed

+352
-23
lines changed

10 files changed

+352
-23
lines changed

cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ else()
119119
set_urls(LUCENE_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_LUCENE_PKG_NAME}")
120120
else()
121121
set_urls(LUCENE_SOURCE_URL
122-
"${THIRDPARTY_MIRROR_URL}https://github.com/luceneplusplus/LucenePlusPlus/archive/refs/tags/${PAIMON_LUCENE_PKG_NAME}"
122+
"${THIRDPARTY_MIRROR_URL}https://github.com/luceneplusplus/LucenePlusPlus/archive/refs/tags/rel_${PAIMON_LUCENE_BUILD_VERSION}.tar.gz"
123123
)
124124
endif()
125125
endif()
@@ -131,7 +131,7 @@ else()
131131
set_urls(LIMONP_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_LIMONP_PKG_NAME}")
132132
else()
133133
set_urls(LIMONP_SOURCE_URL
134-
"${THIRDPARTY_MIRROR_URL}https://github.com/yanyiwu/limonp/archive/refs/tags/${PAIMON_LIMONP_PKG_NAME}"
134+
"${THIRDPARTY_MIRROR_URL}https://github.com/yanyiwu/limonp/archive/refs/tags/v${PAIMON_LIMONP_BUILD_VERSION}.tar.gz"
135135
)
136136
endif()
137137
endif()
@@ -143,7 +143,7 @@ else()
143143
set_urls(JIEBA_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_JIEBA_PKG_NAME}")
144144
else()
145145
set_urls(JIEBA_SOURCE_URL
146-
"${THIRDPARTY_MIRROR_URL}https://github.com/yanyiwu/cppjieba/archive/refs/tags/${PAIMON_JIEBA_PKG_NAME}"
146+
"${THIRDPARTY_MIRROR_URL}https://github.com/yanyiwu/cppjieba/archive/refs/tags/${PAIMON_JIEBA_BUILD_VERSION}.tar.gz"
147147
)
148148
endif()
149149
endif()
@@ -318,13 +318,24 @@ macro(build_lucene)
318318
get_filename_component(LUCENE_ZLIB_ROOT "${LUCENE_ZLIB_INCLUDE_DIR}" DIRECTORY)
319319

320320
set(LUCENE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/lucene_ep-install")
321+
322+
set(LUCENE_CMAKE_CXX_FLAGS "-pthread")
323+
if(PAIMON_USE_CXX11_ABI)
324+
string(APPEND LUCENE_CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=1")
325+
else()
326+
string(APPEND LUCENE_CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=0")
327+
endif()
328+
321329
set(LUCENE_CMAKE_ARGS
322330
${EP_COMMON_CMAKE_ARGS}
323331
"-DLUCENE_BUILD_SHARED=OFF"
324332
"-DENABLE_TEST=OFF"
325333
"-DCMAKE_C_FLAGS=-pthread"
326-
"-DCMAKE_CXX_FLAGS=-pthread"
334+
"-DCMAKE_CXX_FLAGS=${LUCENE_CMAKE_CXX_FLAGS}"
327335
"-DCMAKE_EXE_LINKER_FLAGS=-pthread"
336+
"-DBoost_NO_BOOST_CMAKE=ON"
337+
"-DBoost_NO_SYSTEM_PATHS=ON"
338+
"-DBoost_USE_STATIC_LIBS=ON"
328339
"-DBoost_INCLUDE_DIR=${BOOST_INCLUDE_DIR}"
329340
"-DBoost_LIBRARY_DIR=${BOOST_LIBRARY_DIR}"
330341
"-DBOOST_ROOT=${BOOST_INSTALL}"
@@ -505,6 +516,13 @@ macro(build_boost)
505516
${BOOST_LIBRARY_DIR}/libboost_chrono.a
506517
${BOOST_LIBRARY_DIR}/libboost_iostreams.a)
507518

519+
set(BOOST_CXX_FLAGS "-fPIC")
520+
if(PAIMON_USE_CXX11_ABI)
521+
string(APPEND BOOST_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=1")
522+
else()
523+
string(APPEND BOOST_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=0")
524+
endif()
525+
508526
externalproject_add(boost_ep
509527
URL "${THIRDPARTY_DIR}/boost/${PAIMON_BOOST_PKG_NAME}"
510528
URL_HASH "SHA256=${PAIMON_BOOST_BUILD_SHA256_CHECKSUM}"
@@ -515,7 +533,7 @@ macro(build_boost)
515533
--prefix=${BOOST_INSTALL}
516534
--libdir=${BOOST_LIBRARY_DIR} link=static
517535
runtime-link=shared threading=multi variant=release
518-
cxxflags=-fPIC install
536+
cxxflags=${BOOST_CXX_FLAGS} install
519537
INSTALL_COMMAND bash -c
520538
"mkdir -p ${BOOST_INSTALL}/include/boost && cp -r ${BOOST_PREFIX}/src/boost_ep/libs/*/include/boost/* ${BOOST_INSTALL}/include/boost && cp -r ${BOOST_PREFIX}/src/boost_ep/libs/*/*/include/boost/* ${BOOST_INSTALL}/include/boost"
521539
BUILD_BYPRODUCTS ${BOOST_BYPRODUCTS}

src/paimon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ set(PAIMON_COMMON_SRCS
2929
common/data/columnar/columnar_array.cpp
3030
common/data/columnar/columnar_map.cpp
3131
common/data/columnar/columnar_row.cpp
32+
common/data/columnar/columnar_row_ref.cpp
3233
common/data/decimal.cpp
3334
common/data/internal_row.cpp
3435
common/data/record_batch.cpp

src/paimon/common/data/columnar/columnar_array.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@
2626
#include "arrow/util/checked_cast.h"
2727
#include "arrow/util/decimal.h"
2828
#include "fmt/format.h"
29+
#include "paimon/common/data/columnar/columnar_batch_context.h"
2930
#include "paimon/common/data/columnar/columnar_map.h"
30-
#include "paimon/common/data/columnar/columnar_row.h"
31+
#include "paimon/common/data/columnar/columnar_row_ref.h"
3132
#include "paimon/common/utils/date_time_utils.h"
3233

3334
namespace paimon {
@@ -84,7 +85,8 @@ std::shared_ptr<InternalMap> ColumnarArray::GetMap(int32_t pos) const {
8485
std::shared_ptr<InternalRow> ColumnarArray::GetRow(int32_t pos, int32_t num_fields) const {
8586
auto struct_array = arrow::internal::checked_cast<const arrow::StructArray*>(array_);
8687
assert(struct_array);
87-
return std::make_shared<ColumnarRow>(struct_array->fields(), pool_, offset_ + pos);
88+
auto row_ctx = std::make_shared<ColumnarBatchContext>(nullptr, struct_array->fields(), pool_);
89+
return std::make_shared<ColumnarRowRef>(std::move(row_ctx), offset_ + pos);
8890
}
8991

9092
Result<std::vector<char>> ColumnarArray::ToBooleanArray() const {
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <memory>
20+
#include <vector>
21+
22+
#include "arrow/array/array_base.h"
23+
24+
namespace arrow {
25+
class StructArray;
26+
} // namespace arrow
27+
28+
namespace paimon {
29+
class MemoryPool;
30+
31+
struct ColumnarBatchContext {
32+
ColumnarBatchContext(const std::shared_ptr<arrow::StructArray>& struct_array_in,
33+
const arrow::ArrayVector& array_vec_holder_in,
34+
const std::shared_ptr<MemoryPool>& pool_in)
35+
: struct_array(struct_array_in), pool(pool_in), array_vec_holder(array_vec_holder_in) {
36+
array_ptrs.reserve(array_vec_holder.size());
37+
for (const auto& array : array_vec_holder) {
38+
array_ptrs.push_back(array.get());
39+
}
40+
}
41+
42+
std::shared_ptr<arrow::StructArray> struct_array;
43+
std::shared_ptr<MemoryPool> pool;
44+
arrow::ArrayVector array_vec_holder;
45+
std::vector<const arrow::Array*> array_ptrs;
46+
};
47+
} // namespace paimon
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "paimon/common/data/columnar/columnar_row_ref.h"
18+
19+
#include <cassert>
20+
21+
#include "arrow/array/array_decimal.h"
22+
#include "arrow/array/array_nested.h"
23+
#include "arrow/array/array_primitive.h"
24+
#include "arrow/type_traits.h"
25+
#include "arrow/util/checked_cast.h"
26+
#include "arrow/util/decimal.h"
27+
#include "paimon/common/data/columnar/columnar_array.h"
28+
#include "paimon/common/data/columnar/columnar_map.h"
29+
#include "paimon/common/utils/date_time_utils.h"
30+
31+
namespace paimon {
32+
/// Reads the Decimal128 cell of column `pos` at this row and wraps it as a Decimal.
///
/// @param pos column index into the context's column pointers; not bounds-checked.
/// @param precision forwarded unchanged to the Decimal constructor.
/// @param scale forwarded unchanged to the Decimal constructor.
/// @return Decimal built from the 128-bit value stored at (pos, row_id_).
Decimal ColumnarRowRef::GetDecimal(int32_t pos, int32_t precision, int32_t scale) const {
    using ArrayType = typename arrow::TypeTraits<arrow::Decimal128Type>::ArrayType;
    const auto* decimal_array =
        arrow::internal::checked_cast<const ArrayType*>(ctx_->array_ptrs[pos]);
    assert(decimal_array);
    const arrow::Decimal128 raw(decimal_array->GetValue(row_id_));
    // Recombine the two 64-bit halves arithmetically instead of with `high << 64 | low`:
    // left-shifting a negative signed value (i.e. any negative decimal) is undefined behaviour
    // before C++20. high * 2^64 + low yields the same bit pattern and, assuming
    // Decimal::int128_t is a signed 128-bit integer, stays within [-2^127, 2^127 - 1].
    const Decimal::int128_t two_pow_64 = static_cast<Decimal::int128_t>(1) << 64;
    const Decimal::int128_t unscaled =
        static_cast<Decimal::int128_t>(raw.high_bits()) * two_pow_64 +
        static_cast<Decimal::int128_t>(raw.low_bits());
    return Decimal(precision, scale, unscaled);
}
40+
41+
/// Reads the timestamp cell of column `pos` at this row.
///
/// The unit of the stored value is taken from the Arrow array's own TimestampType; the
/// `precision` argument is not consulted.
///
/// @param pos column index into the context's column pointers; not bounds-checked.
/// @param precision unused, see above.
/// @return Timestamp built from the two values produced by DateTimeUtils::TimestampConverter.
Timestamp ColumnarRowRef::GetTimestamp(int32_t pos, int32_t precision) const {
    using ArrayType = typename arrow::TypeTraits<arrow::TimestampType>::ArrayType;
    const auto* ts_array = arrow::internal::checked_cast<const ArrayType*>(ctx_->array_ptrs[pos]);
    assert(ts_array);
    // ORC stores timestamps as nanoseconds, so the conversion has to follow the unit recorded
    // in the Arrow array type rather than the precision passed by the caller.
    auto arrow_ts_type =
        arrow::internal::checked_pointer_cast<arrow::TimestampType>(ts_array->type());
    DateTimeUtils::TimeType source_unit = DateTimeUtils::GetTimeTypeFromArrowType(arrow_ts_type);
    int64_t raw_value = ts_array->Value(row_id_);
    auto [millis_part, nanos_part] =
        DateTimeUtils::TimestampConverter(raw_value, source_unit,
                                          DateTimeUtils::TimeType::MILLISECOND,
                                          DateTimeUtils::TimeType::NANOSECOND);
    return Timestamp(millis_part, nanos_part);
}
55+
56+
std::shared_ptr<InternalRow> ColumnarRowRef::GetRow(int32_t pos, int32_t num_fields) const {
57+
auto struct_array =
58+
arrow::internal::checked_cast<const arrow::StructArray*>(ctx_->array_ptrs[pos]);
59+
assert(struct_array);
60+
auto nested_ctx =
61+
std::make_shared<ColumnarBatchContext>(nullptr, struct_array->fields(), ctx_->pool);
62+
return std::make_shared<ColumnarRowRef>(std::move(nested_ctx), row_id_);
63+
}
64+
65+
std::shared_ptr<InternalArray> ColumnarRowRef::GetArray(int32_t pos) const {
66+
auto list_array = arrow::internal::checked_cast<const arrow::ListArray*>(ctx_->array_ptrs[pos]);
67+
assert(list_array);
68+
int32_t offset = list_array->value_offset(row_id_);
69+
int32_t length = list_array->value_length(row_id_);
70+
return std::make_shared<ColumnarArray>(list_array->values(), ctx_->pool, offset, length);
71+
}
72+
73+
std::shared_ptr<InternalMap> ColumnarRowRef::GetMap(int32_t pos) const {
74+
auto map_array = arrow::internal::checked_cast<const arrow::MapArray*>(ctx_->array_ptrs[pos]);
75+
assert(map_array);
76+
int32_t offset = map_array->value_offset(row_id_);
77+
int32_t length = map_array->value_length(row_id_);
78+
return std::make_shared<ColumnarMap>(map_array->keys(), map_array->items(), ctx_->pool, offset,
79+
length);
80+
}
81+
82+
} // namespace paimon
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cstdint>
20+
#include <memory>
21+
#include <string>
22+
#include <string_view>
23+
24+
#include "fmt/format.h"
25+
#include "paimon/common/data/binary_string.h"
26+
#include "paimon/common/data/columnar/columnar_batch_context.h"
27+
#include "paimon/common/data/columnar/columnar_utils.h"
28+
#include "paimon/common/data/internal_array.h"
29+
#include "paimon/common/data/internal_map.h"
30+
#include "paimon/common/data/internal_row.h"
31+
#include "paimon/common/types/row_kind.h"
32+
#include "paimon/data/decimal.h"
33+
#include "paimon/data/timestamp.h"
34+
#include "paimon/result.h"
35+
36+
namespace paimon {
37+
class Bytes;
38+
39+
/// Columnar row view which shares batch-level context to reduce per-row overhead.
40+
class ColumnarRowRef : public InternalRow {
41+
public:
42+
ColumnarRowRef(std::shared_ptr<ColumnarBatchContext> ctx, int64_t row_id)
43+
: ctx_(std::move(ctx)), row_id_(row_id) {}
44+
45+
Result<const RowKind*> GetRowKind() const override {
46+
return row_kind_;
47+
}
48+
49+
void SetRowKind(const RowKind* kind) override {
50+
row_kind_ = kind;
51+
}
52+
53+
int32_t GetFieldCount() const override {
54+
return static_cast<int32_t>(ctx_->array_ptrs.size());
55+
}
56+
57+
bool IsNullAt(int32_t pos) const override {
58+
return ctx_->array_ptrs[pos]->IsNull(row_id_);
59+
}
60+
61+
bool GetBoolean(int32_t pos) const override {
62+
return ColumnarUtils::GetGenericValue<arrow::BooleanType, bool>(ctx_->array_ptrs[pos],
63+
row_id_);
64+
}
65+
66+
char GetByte(int32_t pos) const override {
67+
return ColumnarUtils::GetGenericValue<arrow::Int8Type, char>(ctx_->array_ptrs[pos],
68+
row_id_);
69+
}
70+
71+
int16_t GetShort(int32_t pos) const override {
72+
return ColumnarUtils::GetGenericValue<arrow::Int16Type, int16_t>(ctx_->array_ptrs[pos],
73+
row_id_);
74+
}
75+
76+
int32_t GetInt(int32_t pos) const override {
77+
return ColumnarUtils::GetGenericValue<arrow::Int32Type, int32_t>(ctx_->array_ptrs[pos],
78+
row_id_);
79+
}
80+
81+
int32_t GetDate(int32_t pos) const override {
82+
return ColumnarUtils::GetGenericValue<arrow::Date32Type, int32_t>(ctx_->array_ptrs[pos],
83+
row_id_);
84+
}
85+
86+
int64_t GetLong(int32_t pos) const override {
87+
return ColumnarUtils::GetGenericValue<arrow::Int64Type, int64_t>(ctx_->array_ptrs[pos],
88+
row_id_);
89+
}
90+
91+
float GetFloat(int32_t pos) const override {
92+
return ColumnarUtils::GetGenericValue<arrow::FloatType, float>(ctx_->array_ptrs[pos],
93+
row_id_);
94+
}
95+
96+
double GetDouble(int32_t pos) const override {
97+
return ColumnarUtils::GetGenericValue<arrow::DoubleType, double>(ctx_->array_ptrs[pos],
98+
row_id_);
99+
}
100+
101+
BinaryString GetString(int32_t pos) const override {
102+
auto bytes = ColumnarUtils::GetBytes<arrow::StringType>(ctx_->array_ptrs[pos], row_id_,
103+
ctx_->pool.get());
104+
return BinaryString::FromBytes(bytes);
105+
}
106+
107+
std::string_view GetStringView(int32_t pos) const override {
108+
return ColumnarUtils::GetView(ctx_->array_ptrs[pos], row_id_);
109+
}
110+
111+
Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override;
112+
113+
Timestamp GetTimestamp(int32_t pos, int32_t precision) const override;
114+
115+
std::shared_ptr<Bytes> GetBinary(int32_t pos) const override {
116+
return ColumnarUtils::GetBytes<arrow::BinaryType>(ctx_->array_ptrs[pos], row_id_,
117+
ctx_->pool.get());
118+
}
119+
120+
std::shared_ptr<InternalRow> GetRow(int32_t pos, int32_t num_fields) const override;
121+
122+
std::shared_ptr<InternalArray> GetArray(int32_t pos) const override;
123+
124+
std::shared_ptr<InternalMap> GetMap(int32_t pos) const override;
125+
126+
std::string ToString() const override {
127+
return fmt::format("ColumnarRowRef, row_id {}", row_id_);
128+
}
129+
130+
private:
131+
std::shared_ptr<ColumnarBatchContext> ctx_;
132+
const RowKind* row_kind_ = RowKind::Insert();
133+
int64_t row_id_;
134+
};
135+
} // namespace paimon

0 commit comments

Comments
 (0)