Skip to content

Commit

Permalink
fix: add build-side join keys to memory accounting
Browse files Browse the repository at this point in the history
  • Loading branch information
korowa committed Jan 19, 2025
1 parent 0283077 commit 154926b
Showing 1 changed file with 48 additions and 10 deletions.
58 changes: 48 additions & 10 deletions datafusion/physical-plan/src/joins/hash_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ use arrow::array::{
Array, ArrayRef, BooleanArray, BooleanBufferBuilder, UInt32Array, UInt64Array,
};
use arrow::compute::kernels::cmp::{eq, not_distinct};
use arrow::compute::{and, concat_batches, take, FilterBuilder};
use arrow::compute::{and, concat, concat_batches, take, FilterBuilder};
use arrow::datatypes::{Schema, SchemaRef};
use arrow::record_batch::RecordBatch;
use arrow::util::bit_util;
use arrow_array::cast::downcast_array;
use arrow_array::new_empty_array;
use arrow_schema::ArrowError;
use datafusion_common::utils::memory::estimate_memory_size;
use datafusion_common::{
Expand Down Expand Up @@ -914,6 +915,52 @@ async fn collect_left_input(
})
.await?;

let batches_iter = batches.iter().rev();

let left_values = if batches.is_empty() {
on_left
.iter()
.map(|expr| Ok(new_empty_array(&expr.data_type(&schema)?)))
.collect::<Result<Vec<_>>>()?
} else {
on_left
.iter()
.map(|expr| {
let mut key_reservation = 0;
let join_key_arrays = batches_iter
.clone()
.map(|batch| {
let array: Arc<dyn Array> =
expr.evaluate(batch).unwrap().into_array(batch.num_rows())?;
let array_size = array.get_array_memory_size();
reservation.try_grow(array_size)?;
key_reservation += array_size;
Ok(array)
})
.collect::<Result<Vec<_>>>()?;

// `concat` function is non-consuming, so reserving x2 memory
// required for collected join key arrays (assuming worst case scenario)
reservation.try_grow(key_reservation)?;
let concatenated = concat(
join_key_arrays
.iter()
.map(AsRef::as_ref)
.collect::<Vec<_>>()
.as_ref(),
)?;

// Resizing reservation to its original size + concatenated array size
let build_size_mem = reservation.size() - key_reservation * 2
+ concatenated.get_array_memory_size();
reservation.resize(build_size_mem);
metrics.build_mem_used.set(build_size_mem);

Ok(concatenated)
})
.collect::<Result<Vec<_>>>()?
};

// Estimation of memory size, required for hashtable, prior to allocation.
// Final result can be verified using `RawTable.allocation_info()`
let fixed_size = size_of::<JoinHashMap>();
Expand All @@ -928,7 +975,6 @@ async fn collect_left_input(
let mut offset = 0;

// Updating hashmap starting from the last batch
let batches_iter = batches.iter().rev();
for batch in batches_iter.clone() {
hashes_buffer.clear();
hashes_buffer.resize(batch.num_rows(), 0);
Expand Down Expand Up @@ -960,14 +1006,6 @@ async fn collect_left_input(
BooleanBufferBuilder::new(0)
};

let left_values = on_left
.iter()
.map(|c| {
c.evaluate(&single_batch)?
.into_array(single_batch.num_rows())
})
.collect::<Result<Vec<_>>>()?;

let data = JoinLeftData::new(
hashmap,
single_batch,
Expand Down

0 comments on commit 154926b

Please sign in to comment.