Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use vectorcall (where possible) when calling Python functions #4456

Merged
merged 4 commits into from
Aug 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions newsfragments/4456.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve performance of calls to Python by using the vectorcall calling convention where possible.
146 changes: 145 additions & 1 deletion pyo3-benches/benches/bench_call.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ use std::hint::black_box;

use codspeed_criterion_compat::{criterion_group, criterion_main, Bencher, Criterion};

use pyo3::prelude::*;
use pyo3::ffi::c_str;
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;

macro_rules! test_module {
($py:ident, $code:literal) => {
Expand All @@ -26,6 +27,62 @@ fn bench_call_0(b: &mut Bencher<'_>) {
})
}

/// Benchmarks `call1` on a three-parameter Python function `foo(a, b, c)`,
/// passing a pre-converted tuple of three positional arguments.
fn bench_call_1(b: &mut Bencher<'_>) {
    Python::with_gil(|py| {
        let module = test_module!(py, "def foo(a, b, c): pass");
        let callee = module.getattr("foo").unwrap();

        // Convert every argument up front so the timed closure only pays for
        // cloning the tuple and performing the call itself.
        let int_arg = <_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py);
        let str_arg = <_ as IntoPy<PyObject>>::into_py("s", py).into_bound(py);
        let float_arg = <_ as IntoPy<PyObject>>::into_py(1.23, py).into_bound(py);
        let positional = (int_arg, str_arg, float_arg);

        b.iter(|| {
            // 1000 calls per measured iteration.
            for _ in 0..1000 {
                black_box(&callee).call1(positional.clone()).unwrap();
            }
        });
    })
}

/// Benchmarks `call` on a five-parameter Python function `foo(a, b, c, d, e)`,
/// passing three positional arguments plus `d` and `e` as keyword arguments.
fn bench_call(b: &mut Bencher<'_>) {
    Python::with_gil(|py| {
        let module = test_module!(py, "def foo(a, b, c, d, e): pass");
        let callee = module.getattr("foo").unwrap();

        // Positional arguments and the kwargs dict are built once, outside the
        // timed closure.
        let int_arg = <_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py);
        let str_arg = <_ as IntoPy<PyObject>>::into_py("s", py).into_bound(py);
        let float_arg = <_ as IntoPy<PyObject>>::into_py(1.23, py).into_bound(py);
        let positional = (int_arg, str_arg, float_arg);
        let keyword = [("d", 1), ("e", 42)].into_py_dict(py);

        b.iter(|| {
            // 1000 calls per measured iteration.
            for _ in 0..1000 {
                let outcome = black_box(&callee).call(positional.clone(), Some(&keyword));
                outcome.unwrap();
            }
        });
    })
}

/// Benchmarks `call1` on a one-parameter Python function `foo(a)`, passing a
/// single-element tuple built from a pre-converted argument.
fn bench_call_one_arg(b: &mut Bencher<'_>) {
    Python::with_gil(|py| {
        let module = test_module!(py, "def foo(a): pass");
        let callee = module.getattr("foo").unwrap();

        // The argument is converted once; only the clone and the 1-tuple
        // construction happen inside the timed closure.
        let only_arg = <_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py);

        b.iter(|| {
            // 1000 calls per measured iteration.
            for _ in 0..1000 {
                let one_tuple = (only_arg.clone(),);
                black_box(&callee).call1(one_tuple).unwrap();
            }
        });
    })
}

fn bench_call_method_0(b: &mut Bencher<'_>) {
Python::with_gil(|py| {
let module = test_module!(
Expand All @@ -47,9 +104,96 @@ class Foo:
})
}

/// Benchmarks `call_method1` for a method `Foo.foo(self, a, b, c)`, passing a
/// pre-converted tuple of three positional arguments.
fn bench_call_method_1(b: &mut Bencher<'_>) {
Python::with_gil(|py| {
let module = test_module!(
py,
"
class Foo:
def foo(self, a, b, c):
pass
"
);

// Despite its name, `foo_module` holds an instance of `Foo`: the class is
// looked up on the module and then called with no arguments.
let foo_module = &module.getattr("Foo").unwrap().call0().unwrap();
// The arguments are converted once, outside the timed closure.
let args = (
<_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py),
<_ as IntoPy<PyObject>>::into_py("s", py).into_bound(py),
<_ as IntoPy<PyObject>>::into_py(1.23, py).into_bound(py),
);

b.iter(|| {
// 1000 method calls per measured iteration; the argument tuple is cloned
// for each call.
for _ in 0..1000 {
black_box(foo_module)
.call_method1("foo", args.clone())
.unwrap();
}
});
})
}

/// Benchmarks `call_method` for a method `Foo.foo(self, a, b, c, d, e)`,
/// passing three positional arguments plus `d` and `e` as keyword arguments.
fn bench_call_method(b: &mut Bencher<'_>) {
Python::with_gil(|py| {
let module = test_module!(
py,
"
class Foo:
def foo(self, a, b, c, d, e):
pass
"
);

// Despite its name, `foo_module` holds an instance of `Foo`: the class is
// looked up on the module and then called with no arguments.
let foo_module = &module.getattr("Foo").unwrap().call0().unwrap();
// Positional arguments and the kwargs dict are built once, outside the
// timed closure.
let args = (
<_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py),
<_ as IntoPy<PyObject>>::into_py("s", py).into_bound(py),
<_ as IntoPy<PyObject>>::into_py(1.23, py).into_bound(py),
);
let kwargs = [("d", 1), ("e", 42)].into_py_dict(py);

b.iter(|| {
// 1000 method calls per measured iteration; the argument tuple is cloned
// for each call while the kwargs dict is passed by reference.
for _ in 0..1000 {
black_box(foo_module)
.call_method("foo", args.clone(), Some(&kwargs))
.unwrap();
}
});
})
}

/// Benchmarks `call_method1` for a method `Foo.foo(self, a)`, passing a
/// single-element tuple built from a pre-converted argument.
fn bench_call_method_one_arg(b: &mut Bencher<'_>) {
Python::with_gil(|py| {
let module = test_module!(
py,
"
class Foo:
def foo(self, a):
pass
"
);

// Despite its name, `foo_module` holds an instance of `Foo`: the class is
// looked up on the module and then called with no arguments.
let foo_module = &module.getattr("Foo").unwrap().call0().unwrap();
// The argument is converted once, outside the timed closure.
let arg = <_ as IntoPy<PyObject>>::into_py(1, py).into_bound(py);

b.iter(|| {
// 1000 method calls per measured iteration; a fresh 1-tuple is built from a
// clone of `arg` for each call.
for _ in 0..1000 {
black_box(foo_module)
.call_method1("foo", (arg.clone(),))
.unwrap();
}
});
})
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("call_0", bench_call_0);
c.bench_function("call_1", bench_call_1);
c.bench_function("call", bench_call);
c.bench_function("call_one_arg", bench_call_one_arg);
c.bench_function("call_method_0", bench_call_method_0);
c.bench_function("call_method_1", bench_call_method_1);
c.bench_function("call_method", bench_call_method);
c.bench_function("call_method_one_arg", bench_call_method_one_arg);
}

criterion_group!(benches, criterion_benchmark);
Expand Down
2 changes: 1 addition & 1 deletion pyo3-ffi/src/cpython/abstract_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ extern "C" {
}

#[cfg(Py_3_8)]
const PY_VECTORCALL_ARGUMENTS_OFFSET: size_t =
pub const PY_VECTORCALL_ARGUMENTS_OFFSET: size_t =
1 << (8 * std::mem::size_of::<size_t>() as size_t - 1);

#[cfg(Py_3_8)]
Expand Down
136 changes: 135 additions & 1 deletion src/conversion.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
//! Defines conversions between Rust and Python types.
use crate::err::PyResult;
use crate::ffi_ptr_ext::FfiPtrExt;
#[cfg(feature = "experimental-inspect")]
use crate::inspect::types::TypeInfo;
use crate::pyclass::boolean_struct::False;
use crate::types::any::PyAnyMethods;
use crate::types::PyTuple;
use crate::types::{PyDict, PyString, PyTuple};
use crate::{
ffi, Borrowed, Bound, BoundObject, Py, PyAny, PyClass, PyErr, PyObject, PyRef, PyRefMut, Python,
};
Expand Down Expand Up @@ -172,6 +173,93 @@ pub trait IntoPy<T>: Sized {
fn type_output() -> TypeInfo {
TypeInfo::Any
}

// The following methods are helpers to use the vectorcall API where possible.
// They are overridden on tuples to perform a vectorcall.
// Be careful when you're implementing these: they can never refer to `Bound` call methods,
// as those refer to these methods, so this will create an infinite recursion.
#[doc(hidden)]
#[inline]
fn __py_call_vectorcall1<'py>(
Comment on lines +177 to +183
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than adding to this trait, we should look at its upcoming replacement IntoPyObject and consider how to slot these methods on there or a companion trait. We need to migrate the IntoPy<Py<PyTuple>> bound on the .call functions anyway, so this is a good time to bring this up cc @Icxolu.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what you expect the new trait(s) to look like, but it shouldn't be hard to migrate. I believe it is out of scope for this PR though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I think IntoPyObject has a worse API (in some respects): it cannot convert one Rust type to multiple Python types, which can especially hurt calls (for example, because it prevents supporting calls with arrays or Vec without an inefficient conversion).

This has advantages - fewer type annotations - but I think these traits can coexist (with calls using IntoPy).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, thanks for the ping David! I have to say upfront I'm not really familiar with these different calling conventions.

Also, I think IntoPyObject has a worse API (in some part): it cannot convert one Rust type to multiple Python types,

Together with fallibility, I would consider that one of the two major advantages of the new API. During the experimentation phase we concluded that there is generally a clear Python target type for any Rust type. The additional complexity would make this less ergonomic to use overall while bringing not much benefit in general.

This has advantages - less type annotation, but I think these traits can coexist (with calls using IntoPy).

IMO we should not keep IntoPy around. It has clear problems regarding fallible conversions. Also, there should really be one trait responsible for converting Rust values into Python objects. Everything else is way harder to explain and to maintain. For example, the implementations could get out of sync and the same value in Rust would be converted differently depending on which API it is given to. (This can already happen with ToPyObject and IntoPy currently, and I think we should get rid of it and not introduce a new form here.)

If I understood correctly, the problem is that we also want to convert arrays, Vecs, ... to a PyTuple while they normally convert into a PyList. I think we can support that special casing with IntoPyObject as well, using another method that converts Self into a PyTuple "args" object. A quick sketch below with my limited understanding.

pub trait IntoPyObject<'py>: Sized {
    ....
    
    #[doc(hidden)]
    /// Turn `Self` into callable args, can be specialized for tuples, array, ...
    fn into_args(self, py: Python<'py>, _: private::Token) -> PyResult<Bound<'py, PyTuple>>
    where
        PyErr: From<Self::Error>,
    {
        (self,).into_pyobject(py) // for tuples this can then be `self.into_pyobject(py)`
    }

    #[doc(hidden)]
    /// Call `function` with `obj` as `arg`; can use specialized calling conventions
    fn vectorcall(
        obj: Self,
        py: Python<'py>,
        function: Borrowed<'_, 'py, PyAny>,
        token: private::Token,
    ) -> PyResult<Bound<'py, PyAny>>
    where
        PyErr: From<Self::Error>,
    {
        #[inline]
        fn inner<'py>(
            py: Python<'py>,
            function: Borrowed<'_, 'py, PyAny>,
            args: Bound<'py, PyTuple>,
        ) -> PyResult<Bound<'py, PyAny>> {
            use crate::ffi_ptr_ext::FfiPtrExt;
            unsafe {
                ffi::PyObject_Call(function.as_ptr(), args.as_ptr(), std::ptr::null_mut())
                    .assume_owned_or_err(py)
            }
        }
        // make this use `into_args`
        inner(py, function, obj.into_args(py, token)?.into_bound())
    }

}

If I got something wrong, or overlooked something, let me know, but in general I think it should be possible to support this with IntoPyObject as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it is indeed possible to support this with IntoPyObject.

If we are already making a breaking change, I think a better path than adding methods on IntoPyObject is to use another trait for calls, say PyCallArgs. This has the following advantages:

  • Assuming we seal PyCallArgs, this will allow us to easily enable future possibilities, even ones that we cannot predict, around performance and beyond.
  • If you take IntoPyObject, you have to check you actually got a tuple. The overhead can be mitigated for known-tuples by specializing methods on them, but it is still not the best API since it does not prevent non-tuples at compile time and doesn't even signal the user their code is going to fail.

Anyway, this is unrelated to this PR. We can land it now, and I expect any changes around calling can be adjusted fairly trivially.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An additional reason I find the separate-trait approach tempting is that it enables a more convenient and more performant approach to kwargs, even without waiting for a pycall! macro - if we choose this path, then instead of taking kwargs: Option<PyDict> we can take a generic type that can convert to a dict.

For example,

fn call<Args, Kwargs>(&self, args: Args, kwargs: Kwargs)
where
    (Args, Kwargs): PyCallArgs
{ ... }

That already means people can more nicely use kwargs with syntax like call((arg1, arg2, ...), [("a", 1), ("b", 2), ...]). But in addition, we may specialize the impls to use the vectorcall API directly instead of converting to a PyDict.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Icxolu, what do you think of doing that (i.e. release 0.23 now as an interim towards a complete switchover for 0.24)?

Generally I'm open to that. I guess that depends a little on how we want to structure/explain the migration. I guess the current state is fairly minimal in the amount of actual breakage. My proposal for the trait bounds migration would have been to provide the blanket impl<'a, 'py, T> IntoPyObject<'py> for &'a T where T: ToPyObject {}, since the vast majority of the APIs are generic over ToPyObject. I would hope that that would still keep breakage low, but it's probably going to be higher than now. So if you prefer we can definitely delay that to 0.24.

On a different note, there is still a bit of Bound API cleanup left that I think we should finish before 0.23, and I think we can also put #4449 in 0.23.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My proposal for the trait bounds migration would have been to provide impl<'a, 'py, T> IntoPyObject<'py> for &'a T where T: ToPyObject {}

Hmm, interesting. So I played around with this (and ideas like a blanket-impl of ToPyObject from IntoPyObject, i.e. the reverse direction). TBH, neither felt great. For example implementing IntoPyObject for &'a T where T: ToPyObject will only help when users pass references for their custom types. Having the blanket might just be more confusion.

Having looked at that more, I think that in 0.23 we should just go for it and migrate all trait bounds without a blanket and commit to the bigger breakage. While it's a big (ish) breakage, I think it's actually the easiest state for users to understand, and I think we can make the migration easier for users by adding the derive proposed in #4458. (They might then just be able to switch to the derive and delete code in a lot of cases).

That said, I think we need to cut a 0.22.3 release to resolve #4452 and ship #4396, so I am open to the idea of merging this PR as-is and cherry-picking it as a perf enhancement in 0.22.3. @ChayimFriedman2, if we did that, would you be willing to help work on the follow-up to move this off IntoPy and onto new traits?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ChayimFriedman2, if we did that, would you be willing to help work on the follow-up to move this off IntoPy and onto new traits?

Yes. Ping me when you need my help.

I'm currently working on a pycall!() draft, which will be the most performant, most capable and most convenient way to call a Python method. Let's see where this takes us (it is still worth landing this PR because it benefits users who haven't migrated).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example implementing IntoPyObject for &'a T where T: ToPyObject will only help when users pass references for their custom types. Having the blanket might just be more confusion.

That's true, haven't thought of that. In that case I think I tend to agree, providing any blanket will probably make it worse.

Having looked at that more, I think that in 0.23 we should just go for it and migrate all trait bounds without a blanket and commit to the bigger breakage. While it's a big (ish) breakage, I think it's actually the easiest state for users to understand, and I think we can make the migration easier for users by adding the derive proposed in #4458.

Sure thing, I'll prepare the PR with the trait bounds change and afterwards look into the derive macro.

self,
py: Python<'py>,
function: Borrowed<'_, 'py, PyAny>,
_: private::Token,
) -> PyResult<Bound<'py, PyAny>>
where
Self: IntoPy<Py<PyTuple>>,
{
#[inline]
fn inner<'py>(
py: Python<'py>,
function: Borrowed<'_, 'py, PyAny>,
args: Bound<'py, PyTuple>,
) -> PyResult<Bound<'py, PyAny>> {
unsafe {
ffi::PyObject_Call(function.as_ptr(), args.as_ptr(), std::ptr::null_mut())
.assume_owned_or_err(py)
}
}
inner(
py,
function,
<Self as IntoPy<Py<PyTuple>>>::into_py(self, py).into_bound(py),
)
}

/// Default implementation of a call with positional and optional keyword
/// arguments: converts `self` into a Python tuple and invokes `function`
/// through `PyObject_Call`.
///
/// `kwargs` is forwarded as the keyword dictionary, or as a null pointer when
/// it is `None`.
#[doc(hidden)]
#[inline]
fn __py_call_vectorcall<'py>(
    self,
    py: Python<'py>,
    function: Borrowed<'_, 'py, PyAny>,
    kwargs: Option<Borrowed<'_, '_, PyDict>>,
    _: private::Token,
) -> PyResult<Bound<'py, PyAny>>
where
    Self: IntoPy<Py<PyTuple>>,
{
    // Non-generic helper that performs the actual FFI call once the arguments
    // have been turned into a tuple.
    #[inline]
    fn call_with_tuple<'py>(
        py: Python<'py>,
        function: Borrowed<'_, 'py, PyAny>,
        positional: Bound<'py, PyTuple>,
        kwargs: Option<Borrowed<'_, '_, PyDict>>,
    ) -> PyResult<Bound<'py, PyAny>> {
        // A missing kwargs dict is passed to the C API as a null pointer.
        let kwargs_ptr = match kwargs {
            Some(dict) => dict.as_ptr(),
            None => std::ptr::null_mut(),
        };
        // The raw pointers are taken from handles that stay alive for the
        // whole duration of the call.
        unsafe {
            ffi::PyObject_Call(function.as_ptr(), positional.as_ptr(), kwargs_ptr)
                .assume_owned_or_err(py)
        }
    }

    let positional = <Self as IntoPy<Py<PyTuple>>>::into_py(self, py).into_bound(py);
    call_with_tuple(py, function, positional, kwargs)
}

/// Default implementation of a positional-only method call: looks up
/// `method_name` on `object` and calls the resulting attribute with `self`
/// as the arguments.
///
/// An error from the attribute lookup is returned as-is, without attempting
/// the call.
#[doc(hidden)]
#[inline]
fn __py_call_method_vectorcall1<'py>(
    self,
    _py: Python<'py>,
    object: Borrowed<'_, 'py, PyAny>,
    method_name: Borrowed<'_, 'py, PyString>,
    _: private::Token,
) -> PyResult<Bound<'py, PyAny>>
where
    Self: IntoPy<Py<PyTuple>>,
{
    let method = object.getattr(method_name)?;
    // Don't `self.into_py()`! This will lose the optimization of vectorcall.
    method.call1(self)
}
}

/// Defines a conversion from a Rust type to a Python object, which may fail.
Expand Down Expand Up @@ -502,6 +590,52 @@ impl IntoPy<Py<PyTuple>> for () {
/// Converts the unit value into an empty Python tuple.
fn into_py(self, py: Python<'_>) -> Py<PyTuple> {
PyTuple::empty(py).unbind()
}

/// Calls `function` with no arguments via `PyObject_CallNoArgs`; no argument
/// tuple is constructed for the call.
#[inline]
fn __py_call_vectorcall1<'py>(
self,
py: Python<'py>,
function: Borrowed<'_, 'py, PyAny>,
_: private::Token,
) -> PyResult<Bound<'py, PyAny>> {
// The raw pointer is taken from `function`, which stays borrowed for the
// whole call.
unsafe { ffi::compat::PyObject_CallNoArgs(function.as_ptr()).assume_owned_or_err(py) }
}

/// Calls `function` with no positional arguments and optional keyword
/// arguments.
///
/// With `kwargs` present, the call goes through `PyObject_Call` with an empty
/// tuple; without them it goes through `PyObject_CallNoArgs`.
#[inline]
fn __py_call_vectorcall<'py>(
    self,
    py: Python<'py>,
    function: Borrowed<'_, 'py, PyAny>,
    kwargs: Option<Borrowed<'_, '_, PyDict>>,
    _: private::Token,
) -> PyResult<Bound<'py, PyAny>> {
    if let Some(dict) = kwargs {
        // `PyObject_Call` is given an (empty) positional tuple; keep it alive
        // in a local until the call has returned.
        let no_positional = PyTuple::empty(py);
        unsafe {
            ffi::PyObject_Call(function.as_ptr(), no_positional.as_ptr(), dict.as_ptr())
                .assume_owned_or_err(py)
        }
    } else {
        unsafe { ffi::compat::PyObject_CallNoArgs(function.as_ptr()).assume_owned_or_err(py) }
    }
}

/// Calls the method named `method_name` on `object` with no arguments via
/// `PyObject_CallMethodNoArgs`, without a separate attribute lookup or an
/// argument tuple on the Rust side.
#[inline]
// NOTE(review): no underscore-prefixed binding is used in this body as shown;
// confirm whether this `allow` is still needed.
#[allow(clippy::used_underscore_binding)]
fn __py_call_method_vectorcall1<'py>(
self,
py: Python<'py>,
object: Borrowed<'_, 'py, PyAny>,
method_name: Borrowed<'_, 'py, PyString>,
_: private::Token,
) -> PyResult<Bound<'py, PyAny>> {
// Both raw pointers are taken from handles that stay borrowed for the whole
// call.
unsafe {
ffi::compat::PyObject_CallMethodNoArgs(object.as_ptr(), method_name.as_ptr())
.assume_owned_or_err(py)
}
}
}

impl<'py> IntoPyObject<'py> for () {
Expand Down
9 changes: 7 additions & 2 deletions src/conversions/chrono.rs
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ fn timezone_utc(py: Python<'_>) -> Bound<'_, PyAny> {
#[cfg(test)]
mod tests {
use super::*;
use crate::types::PyTuple;
use crate::{types::PyTuple, BoundObject};
use std::{cmp::Ordering, panic};

#[test]
Expand Down Expand Up @@ -1333,7 +1333,12 @@ mod tests {
.unwrap()
.getattr(name)
.unwrap()
.call1(args)
.call1(
args.into_pyobject(py)
.map_err(Into::into)
.unwrap()
.into_bound(),
)
.unwrap()
}

Expand Down
Loading
Loading