Skip to content

Commit 71bbfb8

Browse files
authored
Merge pull request #1023 from ttencate/perf/packed_array_extend
Speed up creating and extending packed arrays from iterators up to 63×
2 parents 7dad97c + b7178ee commit 71bbfb8

File tree

5 files changed

+313
-18
lines changed

5 files changed

+313
-18
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* Copyright (c) godot-rust; Bromeon and contributors.
3+
* This Source Code Form is subject to the terms of the Mozilla Public
4+
* License, v. 2.0. If a copy of the MPL was not distributed with this
5+
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
6+
*/
7+
8+
use std::mem::MaybeUninit;
9+
use std::ptr;
10+
11+
/// A fixed-size buffer that does not do any allocations, and can hold up to `N` elements of type `T`.
12+
///
13+
/// This is used to implement `Packed*Array::extend()` in an efficient way, because it forms a middle ground between
14+
/// repeated `push()` calls (slow) and first collecting the entire `Iterator` into a `Vec` (faster, but takes more memory).
15+
///
16+
/// Note that `N` must not be 0 for the buffer to be useful. This is checked at compile time.
17+
pub struct ExtendBuffer<T, const N: usize> {
18+
buf: [MaybeUninit<T>; N],
19+
len: usize,
20+
}
21+
22+
impl<T, const N: usize> Default for ExtendBuffer<T, N> {
23+
fn default() -> Self {
24+
Self {
25+
buf: [const { MaybeUninit::uninit() }; N],
26+
len: 0,
27+
}
28+
}
29+
}
30+
31+
impl<T, const N: usize> ExtendBuffer<T, N> {
32+
/// Appends the given value to the buffer.
33+
///
34+
/// # Panics
35+
/// If the buffer is full.
36+
pub fn push(&mut self, value: T) {
37+
self.buf[self.len].write(value);
38+
self.len += 1;
39+
}
40+
41+
/// Returns `true` iff the buffer is full.
42+
pub fn is_full(&self) -> bool {
43+
self.len == N
44+
}
45+
46+
/// Returns a slice of all initialized elements in the buffer, and sets the length of the buffer back to 0.
47+
///
48+
/// It is the caller's responsibility to ensure that all elements in the returned slice get dropped!
49+
pub fn drain_as_mut_slice(&mut self) -> &mut [T] {
50+
// Prevent panic in self.buf[0] below.
51+
if N == 0 {
52+
return &mut [];
53+
}
54+
debug_assert!(self.len <= N);
55+
56+
let len = self.len;
57+
self.len = 0;
58+
59+
// MaybeUninit::slice_assume_init_ref could be used here instead, but it's experimental.
60+
//
61+
// SAFETY:
62+
// - The pointer is non-null, valid and aligned.
63+
// - `len` elements are always initialized.
64+
// - The memory is not accessed through any other pointer, because we hold a `&mut` reference to `self`.
65+
// - `len * mem::size_of::<T>()` is no larger than `isize::MAX`, otherwise the `buf` slice could not have existed either.
66+
unsafe { std::slice::from_raw_parts_mut(self.buf[0].as_mut_ptr(), len) }
67+
}
68+
}
69+
70+
impl<T, const N: usize> Drop for ExtendBuffer<T, N> {
71+
fn drop(&mut self) {
72+
// Prevent panic in self.buf[0] below.
73+
if N == 0 {
74+
return;
75+
}
76+
debug_assert!(self.len <= N);
77+
78+
// SAFETY: `slice_from_raw_parts_mut` by itself is not unsafe, but to make the resulting slice safe to use:
79+
// - `self.buf[0]` is a valid pointer, exactly `self.len` elements are initialized.
80+
// - The pointer is not aliased since we have an exclusive `&mut self`.
81+
let slice = ptr::slice_from_raw_parts_mut(self.buf[0].as_mut_ptr(), self.len);
82+
83+
// SAFETY: the value is valid because the `slice_from_raw_parts_mut` requirements are met,
84+
// and there is no other way to access the value.
85+
unsafe {
86+
ptr::drop_in_place(slice);
87+
}
88+
}
89+
}
90+
91+
#[test]
92+
fn test_extend_buffer_drop() {
93+
// We use an `Rc` to test the buffer's `drop` behavior.
94+
use std::rc::Rc;
95+
96+
let mut buf = ExtendBuffer::<Rc<i32>, 1>::default();
97+
let value = Rc::new(42);
98+
buf.push(Rc::clone(&value));
99+
100+
// The buffer contains one strong reference, this function contains another.
101+
assert_eq!(Rc::strong_count(&value), 2);
102+
103+
let slice = buf.drain_as_mut_slice();
104+
105+
// The strong reference has been returned in the slice, but not dropped.
106+
assert_eq!(Rc::strong_count(&value), 2);
107+
108+
// SAFETY:
109+
// - The slice returned by `drain_as_mut_slice` is valid, and therefore so is its first element.
110+
// - There is no way to access parts of `slice[0]` while `drop_in_place` is executing.
111+
unsafe {
112+
ptr::drop_in_place(&mut slice[0]);
113+
}
114+
115+
// The reference held by the slice has now been dropped.
116+
assert_eq!(Rc::strong_count(&value), 1);
117+
118+
drop(buf);
119+
120+
// The buffer has not dropped another reference.
121+
assert_eq!(Rc::strong_count(&value), 1);
122+
}

godot-core/src/builtin/collections/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
mod array;
99
mod dictionary;
10+
mod extend_buffer;
1011
mod packed_array;
1112

1213
// Re-export in godot::builtin.

godot-core/src/builtin/collections/packed_array.rs

+123-16
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111

1212
use godot_ffi as sys;
1313

14+
use crate::builtin::collections::extend_buffer::ExtendBuffer;
1415
use crate::builtin::*;
1516
use crate::meta::{AsArg, ToGodot};
17+
use std::mem::size_of;
1618
use std::{fmt, ops, ptr};
1719
use sys::types::*;
1820
use sys::{ffi_methods, interface_fn, GodotFfi};
@@ -380,17 +382,18 @@ macro_rules! impl_packed_array {
380382
array
381383
}
382384

383-
/// Drops all elements in `self` and replaces them with data from an array of values.
385+
/// Drops all elements in `self` starting from `dst` and replaces them with data from an array of values.
386+
/// `dst` must be a valid index, even if `len` is zero.
384387
///
385388
/// # Safety
386389
///
387-
/// * Pointer must be valid slice of data with `len` size.
388-
/// * Pointer must not point to `self` data.
389-
/// * Length must be equal to `self.len()`.
390+
/// * `src` must be valid slice of data with `len` size.
391+
/// * `src` must not point to `self` data.
392+
/// * `len` must be equal to `self.len() - dst`.
390393
/// * Source data must not be dropped later.
391-
unsafe fn move_from_slice(&mut self, src: *const $Element, len: usize) {
392-
let ptr = self.ptr_mut(0);
393-
debug_assert_eq!(len, self.len(), "length precondition violated");
394+
unsafe fn move_from_slice(&mut self, src: *const $Element, dst: usize, len: usize) {
395+
let ptr = self.ptr_mut(dst);
396+
debug_assert_eq!(len, self.len() - dst, "length precondition violated");
394397
// Drops all elements in place. Drop impl must not panic.
395398
ptr::drop_in_place(ptr::slice_from_raw_parts_mut(ptr, len));
396399
// Copy is okay since all elements are dropped.
@@ -457,7 +460,7 @@ macro_rules! impl_packed_array {
457460

458461
// SAFETY: The packed array contains exactly N elements and the source array will be forgotten.
459462
unsafe {
460-
packed_array.move_from_slice(arr.as_ptr(), N);
463+
packed_array.move_from_slice(arr.as_ptr(), 0, N);
461464
}
462465
packed_array
463466
}
@@ -476,13 +479,21 @@ macro_rules! impl_packed_array {
476479
// The vector is forcibly set to empty, so its contents are forgotten.
477480
unsafe {
478481
vec.set_len(0);
479-
array.move_from_slice(vec.as_ptr(), len);
482+
array.move_from_slice(vec.as_ptr(), 0, len);
480483
}
481484
array
482485
}
483486
}
484487

485488
#[doc = concat!("Creates a `", stringify!($PackedArray), "` from an iterator.")]
489+
///
490+
/// # Performance note
491+
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns
492+
/// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding
493+
/// them all efficiently as a batch.
494+
///
495+
/// # Panics
496+
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol).
486497
impl FromIterator<$Element> for $PackedArray {
487498
fn from_iter<I: IntoIterator<Item = $Element>>(iter: I) -> Self {
488499
let mut array = $PackedArray::default();
@@ -491,16 +502,103 @@ macro_rules! impl_packed_array {
491502
}
492503
}
493504

494-
#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator")]
505+
#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator.")]
506+
///
507+
/// # Performance note
508+
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If the iterator returns
509+
/// more than that number of elements, it falls back to reading elements into a fixed-size buffer before adding
510+
/// them all efficiently as a batch.
511+
///
512+
/// # Panics
513+
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach of the `Iterator` protocol).
495514
impl Extend<$Element> for $PackedArray {
496515
fn extend<I: IntoIterator<Item = $Element>>(&mut self, iter: I) {
497-
// Unfortunately the GDExtension API does not offer the equivalent of `Vec::reserve`.
498-
// Otherwise we could use it to pre-allocate based on `iter.size_hint()`.
516+
// This function is complicated, but with good reason. The problem is that we don't know the length of
517+
// the `Iterator` ahead of time; all we get is its `size_hint()`.
518+
//
519+
// There are at least two categories of iterators that are common in the wild, for which we'd want good performance:
520+
//
521+
// 1. The length is known: `size_hint()` returns the exact size, e.g. just iterating over a `Vec` or `BTreeSet`.
522+
// 2. The length is unknown: `size_hint()` returns 0, e.g. `Filter`, `FlatMap`, `FromFn`.
523+
//
524+
// A number of implementations are possible, which were benchmarked for 1000 elements of type `i32`:
525+
//
526+
// - Simply call `push()` in a loop:
527+
// 6.1 µs whether or not the length is known.
528+
// - First `collect()` the `Iterator` into a `Vec`, call `self.resize()` to make room, then move out of the `Vec`:
529+
// 0.78 µs if the length is known, 1.62 µs if the length is unknown.
530+
// It also requires additional temporary memory to hold all elements.
531+
// - The strategy implemented below:
532+
// 0.097 µs if the length is known, 0.49 µs if the length is unknown.
499533
//
500-
// A faster implementation using `resize()` and direct pointer writes might still be
501-
// possible.
502-
for item in iter.into_iter() {
503-
self.push(meta::ParamType::owned_to_arg(item));
534+
// The implementation of `Vec` in the standard library deals with this by repeatedly `reserve()`ing
535+
// whatever `size_hint()` returned, but we don't want to do that because the Godot API call to
536+
// `self.resize()` is relatively slow.
537+
538+
let mut iter = iter.into_iter();
539+
// Cache the length to avoid repeated Godot API calls.
540+
let mut len = self.len();
541+
542+
// Fast part.
543+
//
544+
// Use `Iterator::size_hint()` to pre-allocate the minimum number of elements in the iterator, then
545+
// write directly to the resulting slice. We can do this because `size_hint()` is required by the
546+
// `Iterator` contract to return correct bounds. Note that any bugs in it must not result in UB.
547+
let (size_hint_min, _size_hint_max) = iter.size_hint();
548+
if size_hint_min > 0 {
549+
let capacity = len + size_hint_min;
550+
self.resize(capacity);
551+
for out_ref in &mut self.as_mut_slice()[len..] {
552+
*out_ref = iter.next().expect("iterator returned fewer than size_hint().0 elements");
553+
}
554+
len = capacity;
555+
}
556+
557+
// Slower part.
558+
//
559+
// While the iterator is still not finished, gather elements into a fixed-size buffer, then add them all
560+
// at once.
561+
//
562+
// Why not call `self.resize()` with fixed-size increments, like 32 elements at a time? Well, we might
563+
// end up over-allocating, and then need to trim the array length back at the end. Because Godot
564+
// allocates memory in steps of powers of two, this might end up with an array backing storage that is
565+
// twice as large as it needs to be. By first gathering elements into a buffer, we can tell Godot to
566+
// allocate exactly as much as we need, and no more.
567+
//
568+
// Note that we can't get by with simple memcpys, because `PackedStringArray` contains `GString`, which
569+
// does not implement `Copy`.
570+
//
571+
// Buffer size: 2 kB is enough for the performance win, without needlessly blowing up the stack size.
572+
// (A cursory check shows that most/all platforms use a stack size of at least 1 MB.)
573+
const BUFFER_SIZE_BYTES: usize = 2048;
574+
const BUFFER_CAPACITY: usize = const_max(
575+
1,
576+
BUFFER_SIZE_BYTES / size_of::<$Element>(),
577+
);
578+
let mut buf = ExtendBuffer::<_, BUFFER_CAPACITY>::default();
579+
while let Some(item) = iter.next() {
580+
buf.push(item);
581+
while !buf.is_full() {
582+
if let Some(item) = iter.next() {
583+
buf.push(item);
584+
} else {
585+
break;
586+
}
587+
}
588+
589+
let buf_slice = buf.drain_as_mut_slice();
590+
let capacity = len + buf_slice.len();
591+
592+
// Assumption: resize does not panic. Otherwise we would leak memory here.
593+
self.resize(capacity);
594+
595+
// SAFETY: Dropping the first `buf_slice.len()` items is safe, because those are exactly the ones we initialized.
596+
// Writing output is safe because we just allocated `buf_slice.len()` new elements after index `len`.
597+
unsafe {
598+
self.move_from_slice(buf_slice.as_ptr(), len, buf_slice.len());
599+
}
600+
601+
len = capacity;
504602
}
505603
}
506604
}
@@ -1071,3 +1169,12 @@ fn populated_or_err(array: PackedByteArray) -> Result<PackedByteArray, ()> {
10711169
Ok(array)
10721170
}
10731171
}
1172+
1173+
/// Helper because `usize::max()` is not const.
1174+
const fn const_max(a: usize, b: usize) -> usize {
1175+
if a > b {
1176+
a
1177+
} else {
1178+
b
1179+
}
1180+
}

itest/rust/src/benchmarks/mod.rs

+21-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
use std::hint::black_box;
1111

1212
use godot::builtin::inner::InnerRect2i;
13-
use godot::builtin::{GString, Rect2i, StringName, Vector2i};
13+
use godot::builtin::{GString, PackedInt32Array, Rect2i, StringName, Vector2i};
1414
use godot::classes::{Node3D, Os, RefCounted};
1515
use godot::obj::{Gd, InstanceId, NewAlloc, NewGd};
1616
use godot::register::GodotClass;
@@ -93,6 +93,26 @@ fn utilities_ffi_call() -> f64 {
9393
godot::global::pow(base, exponent)
9494
}
9595

96+
#[bench(repeat = 25)]
97+
fn packed_array_from_iter_known_size() -> PackedInt32Array {
98+
// Create an iterator whose `size_hint()` returns `(len, Some(len))`.
99+
PackedInt32Array::from_iter(0..100)
100+
}
101+
102+
#[bench(repeat = 25)]
103+
fn packed_array_from_iter_unknown_size() -> PackedInt32Array {
104+
// Create an iterator whose `size_hint()` returns `(0, None)`.
105+
let mut item = 0;
106+
PackedInt32Array::from_iter(std::iter::from_fn(|| {
107+
item += 1;
108+
if item <= 100 {
109+
Some(item)
110+
} else {
111+
None
112+
}
113+
}))
114+
}
115+
96116
// ----------------------------------------------------------------------------------------------------------------------------------------------
97117
// Helpers for benchmarks above
98118

0 commit comments

Comments
 (0)