Skip to content

Commit e635cc2

Browse files
authored
apacheGH-41813: [C++] Fix avx2 gather offset larger than 2GB in CompareColumnsToRows (apache#42188)
### Rationale for this change AVX2 intrinsics `_mm256_i32gather_epi32`/`_mm256_i32gather_epi64` are used in `CompareColumnsToRows` API, and treat the `vindex` as signed integer. In our row table implementation, we use `uint32_t` to represent the offset within the row table. When a offset is larger than (`0x80000000`, or `2GB`), the aforementioned intrinsics will treat it as negative offset and gather the data from undesired address. More details please see apache#41813 (comment). Considering there is no unsigned-32bit-offset or 64bit-offset counterparts of those intrinsics in AVX2, this issue can be simply mitigated by translating the base address and the offset: ``` new_base = base + 0x80000000; new_offset = offset - 0x80000000; ``` ### What changes are included in this PR? Fix and UT that reproduces the issue. ### Are these changes tested? UT included. ### Are there any user-facing changes? None. * GitHub Issue: apache#41813 Authored-by: Ruoxi Sun <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 3e7ae53 commit e635cc2

File tree

2 files changed

+179
-5
lines changed

2 files changed

+179
-5
lines changed

cpp/src/arrow/compute/row/compare_internal_avx2.cc

+41-5
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2(
236236
irow_right =
237237
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(left_to_right_map) + i);
238238
}
239+
// TODO: Need to test if this gather is OK when irow_right is larger than
240+
// 0x80000000u.
239241
__m256i offset_right =
240242
_mm256_i32gather_epi32((const int*)offsets_right, irow_right, 4);
241243
offset_right = _mm256_add_epi32(offset_right, _mm256_set1_epi32(offset_within_row));
@@ -251,6 +253,40 @@ uint32_t KeyCompare::CompareBinaryColumnToRowHelper_avx2(
251253
}
252254
}
253255

256+
namespace {
257+
258+
// Intrinsics `_mm256_i32gather_epi32/64` treat the `vindex` as signed integer, and we
259+
// are using `uint32_t` to represent the offset, in range of [0, 4G), within the row
260+
// table. When the offset is larger than `0x80000000` (2GB), those intrinsics will treat
261+
// it as negative offset and gather the data from undesired address. To avoid this issue,
262+
// we normalize the addresses by translating `base` `0x80000000` higher, and `offset`
263+
// `0x80000000` lower. This way, the offset is always in range of [-2G, 2G) and those
264+
// intrinsics are safe.
265+
266+
constexpr uint64_t kTwoGB = 0x80000000ull;
267+
268+
template <uint32_t kScale>
269+
inline __m256i UnsignedOffsetSafeGather32(int const* base, __m256i offset) {
270+
int const* normalized_base = base + kTwoGB / sizeof(int);
271+
__m256i normalized_offset =
272+
_mm256_sub_epi32(offset, _mm256_set1_epi32(static_cast<int>(kTwoGB / kScale)));
273+
return _mm256_i32gather_epi32(normalized_base, normalized_offset,
274+
static_cast<int>(kScale));
275+
}
276+
277+
template <uint32_t kScale>
278+
inline __m256i UnsignedOffsetSafeGather64(arrow::util::int64_for_gather_t const* base,
279+
__m128i offset) {
280+
arrow::util::int64_for_gather_t const* normalized_base =
281+
base + kTwoGB / sizeof(arrow::util::int64_for_gather_t);
282+
__m128i normalized_offset =
283+
_mm_sub_epi32(offset, _mm_set1_epi32(static_cast<int>(kTwoGB / kScale)));
284+
return _mm256_i32gather_epi64(normalized_base, normalized_offset,
285+
static_cast<int>(kScale));
286+
}
287+
288+
} // namespace
289+
254290
template <int column_width>
255291
inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* right_base,
256292
__m256i irow_left, __m256i offset_right,
@@ -281,7 +317,7 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r
281317
ARROW_DCHECK(false);
282318
}
283319

284-
__m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1);
320+
__m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right);
285321
if (column_width != sizeof(uint32_t)) {
286322
constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff;
287323
right = _mm256_and_si256(right, _mm256_set1_epi32(mask));
@@ -330,7 +366,7 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas
330366
ARROW_DCHECK(false);
331367
}
332368

333-
__m256i right = _mm256_i32gather_epi32((const int*)right_base, offset_right, 1);
369+
__m256i right = UnsignedOffsetSafeGather32<1>((int const*)right_base, offset_right);
334370
if (column_width != sizeof(uint32_t)) {
335371
constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff;
336372
right = _mm256_and_si256(right, _mm256_set1_epi32(mask));
@@ -367,9 +403,9 @@ inline uint64_t Compare8_64bit_avx2(const uint8_t* left_base, const uint8_t* rig
367403
auto right_base_i64 =
368404
reinterpret_cast<const arrow::util::int64_for_gather_t*>(right_base);
369405
__m256i right_lo =
370-
_mm256_i32gather_epi64(right_base_i64, _mm256_castsi256_si128(offset_right), 1);
371-
__m256i right_hi = _mm256_i32gather_epi64(right_base_i64,
372-
_mm256_extracti128_si256(offset_right, 1), 1);
406+
UnsignedOffsetSafeGather64<1>(right_base_i64, _mm256_castsi256_si128(offset_right));
407+
__m256i right_hi = UnsignedOffsetSafeGather64<1>(
408+
right_base_i64, _mm256_extracti128_si256(offset_right, 1));
373409
uint32_t result_lo = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_lo, right_lo));
374410
uint32_t result_hi = _mm256_movemask_epi8(_mm256_cmpeq_epi64(left_hi, right_hi));
375411
return result_lo | (static_cast<uint64_t>(result_hi) << 32);

cpp/src/arrow/compute/row/compare_test.cc

+138
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
#include <numeric>
1919

2020
#include "arrow/compute/row/compare_internal.h"
21+
#include "arrow/testing/generator.h"
2122
#include "arrow/testing/gtest_util.h"
2223
#include "arrow/testing/random.h"
24+
#include "arrow/util/bitmap_ops.h"
2325

2426
namespace arrow {
2527
namespace compute {
@@ -164,5 +166,141 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) {
164166
}
165167
}
166168

169+
#ifndef ARROW_VALGRIND
170+
// Compare columns to rows at offsets over 2GB within a row table.
171+
// Certain AVX2 instructions may behave unexpectedly causing troubles like GH-41813.
172+
TEST(KeyCompare, CompareColumnsToRowsLarge) {
173+
if constexpr (sizeof(void*) == 4) {
174+
GTEST_SKIP() << "Test only works on 64-bit platforms";
175+
}
176+
177+
// The idea of this case is to create a row table using several fixed length columns and
178+
// one var length column (so the row is hence var length and has offset buffer), with
179+
// the overall data size exceeding 2GB. Then compare each row with itself.
180+
constexpr int64_t two_gb = 2ll * 1024ll * 1024ll * 1024ll;
181+
// The compare function requires the row id of the left column to be uint16_t, hence the
182+
// number of rows.
183+
constexpr int64_t num_rows = std::numeric_limits<uint16_t>::max() + 1;
184+
const std::vector<std::shared_ptr<DataType>> fixed_length_types{uint64(), uint32()};
185+
// The var length column should be a little smaller than 2GB to workaround the capacity
186+
// limitation in the var length builder.
187+
constexpr int32_t var_length = two_gb / num_rows - 1;
188+
auto row_size = std::accumulate(fixed_length_types.begin(), fixed_length_types.end(),
189+
static_cast<int64_t>(var_length),
190+
[](int64_t acc, const std::shared_ptr<DataType>& type) {
191+
return acc + type->byte_width();
192+
});
193+
// The overall size should be larger than 2GB.
194+
ASSERT_GT(row_size * num_rows, two_gb);
195+
196+
MemoryPool* pool = default_memory_pool();
197+
198+
// The left side columns.
199+
std::vector<KeyColumnArray> columns_left;
200+
ExecBatch batch_left;
201+
{
202+
std::vector<Datum> values;
203+
204+
// Several fixed length arrays containing random content.
205+
for (const auto& type : fixed_length_types) {
206+
ASSERT_OK_AND_ASSIGN(auto value, ::arrow::gen::Random(type)->Generate(num_rows));
207+
values.push_back(std::move(value));
208+
}
209+
// A var length array containing 'X' repeated var_length times.
210+
ASSERT_OK_AND_ASSIGN(auto value_var_length,
211+
::arrow::gen::Constant(
212+
std::make_shared<BinaryScalar>(std::string(var_length, 'X')))
213+
->Generate(num_rows));
214+
values.push_back(std::move(value_var_length));
215+
216+
batch_left = ExecBatch(std::move(values), num_rows);
217+
ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
218+
}
219+
220+
// The right side row table.
221+
RowTableImpl row_table_right;
222+
{
223+
// Encode the row table with the left columns.
224+
std::vector<KeyColumnMetadata> column_metadatas;
225+
ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas));
226+
RowTableMetadata table_metadata;
227+
table_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t),
228+
sizeof(uint64_t));
229+
ASSERT_OK(row_table_right.Init(pool, table_metadata));
230+
std::vector<uint16_t> row_ids(num_rows);
231+
std::iota(row_ids.begin(), row_ids.end(), 0);
232+
RowTableEncoder row_encoder;
233+
row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
234+
row_encoder.PrepareEncodeSelected(0, num_rows, columns_left);
235+
ASSERT_OK(row_encoder.EncodeSelected(
236+
&row_table_right, static_cast<uint32_t>(num_rows), row_ids.data()));
237+
238+
// The row table must contain an offset buffer.
239+
ASSERT_NE(row_table_right.offsets(), NULLPTR);
240+
// The whole point of this test.
241+
ASSERT_GT(row_table_right.offsets()[num_rows - 1], two_gb);
242+
}
243+
244+
// The rows to compare.
245+
std::vector<uint32_t> row_ids_to_compare(num_rows);
246+
std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(), 0);
247+
248+
TempVectorStack stack;
249+
ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows)));
250+
LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};
251+
252+
{
253+
// No selection, output no match row ids.
254+
uint32_t num_rows_no_match;
255+
std::vector<uint16_t> row_ids_out(num_rows);
256+
KeyCompare::CompareColumnsToRows(num_rows, /*sel_left_maybe_null=*/NULLPTR,
257+
row_ids_to_compare.data(), &ctx, &num_rows_no_match,
258+
row_ids_out.data(), columns_left, row_table_right,
259+
/*are_cols_in_encoding_order=*/true,
260+
/*out_match_bitvector_maybe_null=*/NULLPTR);
261+
ASSERT_EQ(num_rows_no_match, 0);
262+
}
263+
264+
{
265+
// No selection, output match bit vector.
266+
std::vector<uint8_t> match_bitvector(BytesForBits(num_rows));
267+
KeyCompare::CompareColumnsToRows(
268+
num_rows, /*sel_left_maybe_null=*/NULLPTR, row_ids_to_compare.data(), &ctx,
269+
/*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
270+
row_table_right,
271+
/*are_cols_in_encoding_order=*/true, match_bitvector.data());
272+
ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows),
273+
num_rows);
274+
}
275+
276+
std::vector<uint16_t> selection_left(num_rows);
277+
std::iota(selection_left.begin(), selection_left.end(), 0);
278+
279+
{
280+
// With selection, output no match row ids.
281+
uint32_t num_rows_no_match;
282+
std::vector<uint16_t> row_ids_out(num_rows);
283+
KeyCompare::CompareColumnsToRows(num_rows, selection_left.data(),
284+
row_ids_to_compare.data(), &ctx, &num_rows_no_match,
285+
row_ids_out.data(), columns_left, row_table_right,
286+
/*are_cols_in_encoding_order=*/true,
287+
/*out_match_bitvector_maybe_null=*/NULLPTR);
288+
ASSERT_EQ(num_rows_no_match, 0);
289+
}
290+
291+
{
292+
// With selection, output match bit vector.
293+
std::vector<uint8_t> match_bitvector(BytesForBits(num_rows));
294+
KeyCompare::CompareColumnsToRows(
295+
num_rows, selection_left.data(), row_ids_to_compare.data(), &ctx,
296+
/*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, columns_left,
297+
row_table_right,
298+
/*are_cols_in_encoding_order=*/true, match_bitvector.data());
299+
ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, num_rows),
300+
num_rows);
301+
}
302+
}
303+
#endif // ARROW_VALGRIND
304+
167305
} // namespace compute
168306
} // namespace arrow

0 commit comments

Comments
 (0)