Avoid heap allocation for function calls with a small number of args (#5824)

* Avoid heap allocation for function calls with a small number of arguments

We don't have access to llvm::SmallVector or similar, but given the
limited subset of the `std::vector` API that
`function_call::args{,_convert}` need and the "reserve-then-fill"
usage pattern, it is relatively straightforward to implement custom
containers that get the job done.

Seems to improve the time to call the collatz function in
pybind/pybind11_benchmark significantly; numbers are a little noisy
but there's a clear improvement from "about 60 ns per call" to "about
45 ns per call" on my machine (M4 Max Mac), as measured with
`timeit.repeat('collatz(4)', 'from pybind11_benchmark import
collatz')`.

* clang-tidy

* more clang-tidy

* clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE

* forgot to increase inline size after removing std::variant

* constexpr arg_vector_small_size, use move instead of swap to hopefully clarify second_pass_convert

* rename test_embed to test_low_level

* rename test_low_level to test_with_catch

* Be careful to NOINLINE slow paths

* rename array/vector members to iarray/hvector. Move comment per request. Add static_asserts for our untagged union implementation per request.

* drop is_standard_layout assertions; see https://github.com/pybind/pybind11/pull/5824#issuecomment-3308616072
This commit is contained in:
Scott Wolchok
2025-09-19 13:44:40 -07:00
committed by GitHub
parent 326b10637a
commit 30748f863f
16 changed files with 532 additions and 18 deletions

View File

@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
endif()
set(PYBIND11_HEADERS
include/pybind11/detail/argument_vector.h
include/pybind11/detail/class.h
include/pybind11/detail/common.h
include/pybind11/detail/cpp_conduit.h

View File

@@ -10,6 +10,7 @@
#pragma once
#include "detail/argument_vector.h"
#include "detail/common.h"
#include "detail/descr.h"
#include "detail/native_enum_data.h"
@@ -2037,6 +2038,10 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
// forward declaration (definition in attr.h)
struct function_record;
/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
/// (16 pointers) in size.)
constexpr std::size_t arg_vector_small_size = 6;
/// Internal data associated with a single function call
struct function_call {
function_call(const function_record &f, handle p); // Implementation in attr.h
@@ -2045,10 +2050,10 @@ struct function_call {
const function_record &func;
/// Arguments passed to the function:
std::vector<handle> args;
argument_vector<arg_vector_small_size> args;
/// The `convert` value the arguments should be loaded with
std::vector<bool> args_convert;
args_convert_vector<arg_vector_small_size> args_convert;
/// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
/// present, are also in `args` but without a reference).

View File

@@ -0,0 +1,330 @@
/*
pybind11/detail/argument_vector.h: small_vector-like containers to
avoid heap allocation of arguments during function call dispatch.
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/
#pragma once
#include <pybind11/pytypes.h>
#include "common.h"
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>
#include <utility>
#include <vector>
PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
PYBIND11_WARNING_DISABLE_MSVC(4127)
PYBIND11_NAMESPACE_BEGIN(detail)
// Shared implementation utility for our small_vector-like containers.
// We support C++11 and C++14, so we cannot use
// std::variant. Union with the tag packed next to the inline
// array's size is smaller anyway, allowing 1 extra handle of
// inline storage for free. Compare the layouts (1 line per
// size_t/void*, assuming a 64-bit machine):
// With variant, total is N + 2 for N >= 2:
// - variant tag (cannot be packed with the array size)
// - array size (or first pointer of 3 in std::vector)
// - N pointers of inline storage (or 2 remaining pointers of std::vector)
// Custom union, total is N + 1 for N >= 3:
// - variant tag & array size if applicable
// - N pointers of inline storage (or 3 pointers of std::vector)
//
// NOTE: this is a low-level representational convenience; the two
// use cases of this union are materially different and in particular
// have different semantics for inline_array::size. All that is being
// shared is the memory management behavior.
// Storage shared by the small_vector-like containers in this header. It holds
// either an inline std::array plus an element count (`iarray`) or a
// std::vector (`hvector`). Both alternatives start with a bool tag at offset
// 0, and that tag identifies which alternative is currently live. The type is
// move-only.
template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
union inline_array_or_vector {
    // Inline alternative: tag, count field, and InlineSize element slots.
    struct inline_array {
        bool is_inline = true;
        std::uint32_t size = 0;
        std::array<ArrayT, InlineSize> arr;
    };

    // Heap alternative: tag followed by a std::vector.
    struct heap_vector {
        bool is_inline = false;
        std::vector<VectorT> vec;

        heap_vector() = default;
        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
    };

    inline_array iarray;
    heap_vector hvector;

    // The inline alternative is moved with memcpy and is never explicitly
    // destroyed, hence these requirements on the element type.
    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
                  "ArrayT must be trivially move constructible");
    static_assert(std::is_trivially_destructible<ArrayT>::value,
                  "ArrayT must be trivially destructible");

    // Starts out as an empty inline array.
    inline_array_or_vector() : iarray() {}

    ~inline_array_or_vector() {
        // Only the heap alternative owns anything that needs cleanup.
        if (is_inline()) {
            return;
        }
        hvector.~heap_vector();
    }

    // Copying is disabled.
    inline_array_or_vector(const inline_array_or_vector &) = delete;
    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;

    // Move construction: adopt whichever alternative `rhs` currently holds.
    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
        if (!rhs.is_inline()) {
            new (&hvector) heap_vector(std::move(rhs.hvector));
        } else {
            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
        }
        assert(is_inline() == rhs.is_inline());
    }

    // Move assignment: handles all four inline/heap combinations.
    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
        if (this == &rhs) {
            return *this;
        }
        const bool self_inline = is_inline();
        if (rhs.is_inline()) {
            // Release our vector (if any) before overwriting it with raw bytes.
            if (!self_inline) {
                hvector.~heap_vector();
            }
            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
            return *this;
        }
        if (self_inline) {
            // Nothing to destroy on our side; construct the vector in place.
            new (&hvector) heap_vector(std::move(rhs.hvector));
        } else {
            hvector = std::move(rhs.hvector);
        }
        return *this;
    }

    // Reports which alternative is live by reading the tag byte(s) at the
    // start of the union. Reading an inactive union member directly would be
    // undefined behavior; copying the object's leading bytes out with memcpy
    // is well-defined. See
    // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
    bool is_inline() const {
        static_assert(offsetof(inline_array, is_inline) == 0,
                      "untagged union implementation relies on this");
        static_assert(offsetof(heap_vector, is_inline) == 0,
                      "untagged union implementation relies on this");
        bool tag = false;
        std::memcpy(&tag, static_cast<const void *>(this), sizeof(bool));
        return tag;
    }
};
// small_vector-like container to avoid heap allocation for N or fewer
// arguments.
template <std::size_t N>
struct argument_vector {
public:
argument_vector() = default;
// Disable copy ctor and assignment.
argument_vector(const argument_vector &) = delete;
argument_vector &operator=(const argument_vector &) = delete;
argument_vector(argument_vector &&) noexcept = default;
argument_vector &operator=(argument_vector &&) noexcept = default;
std::size_t size() const {
if (is_inline()) {
return m_repr.iarray.size;
}
return m_repr.hvector.vec.size();
}
handle &operator[](std::size_t idx) {
assert(idx < size());
if (is_inline()) {
return m_repr.iarray.arr[idx];
}
return m_repr.hvector.vec[idx];
}
handle operator[](std::size_t idx) const {
assert(idx < size());
if (is_inline()) {
return m_repr.iarray.arr[idx];
}
return m_repr.hvector.vec[idx];
}
void push_back(handle x) {
if (is_inline()) {
auto &ha = m_repr.iarray;
if (ha.size == N) {
move_to_heap_vector_with_reserved_size(N + 1);
push_back_slow_path(x);
} else {
ha.arr[ha.size++] = x;
}
} else {
push_back_slow_path(x);
}
}
template <typename Arg>
void emplace_back(Arg &&x) {
push_back(handle(x));
}
void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > N) {
move_to_heap_vector_with_reserved_size(sz);
}
} else {
reserve_slow_path(sz);
}
}
private:
using repr_type = inline_array_or_vector<handle, N>;
repr_type m_repr;
PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
assert(is_inline());
auto &ha = m_repr.iarray;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
new (&m_repr.hvector) heap_vector(std::move(hv));
}
PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); }
PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); }
bool is_inline() const { return m_repr.is_inline(); }
};
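// small_vector-like container of per-argument `convert` flags. Flags are
// packed one bit each into inline size_t words, so no heap allocation is
// needed until the inline bit capacity (kRequestedInlineSize rounded up to
// a whole number of words) is exceeded.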
template <std::size_t kRequestedInlineSize>
struct args_convert_vector {
private:
public:
args_convert_vector() = default;
// Disable copy ctor and assignment.
args_convert_vector(const args_convert_vector &) = delete;
args_convert_vector &operator=(const args_convert_vector &) = delete;
args_convert_vector(args_convert_vector &&) noexcept = default;
args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
args_convert_vector(std::size_t count, bool value) {
if (count > kInlineSize) {
new (&m_repr.hvector) typename repr_type::heap_vector(count, value);
} else {
auto &inline_arr = m_repr.iarray;
inline_arr.arr.fill(value ? std::size_t(-1) : 0);
inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
}
}
std::size_t size() const {
if (is_inline()) {
return m_repr.iarray.size;
}
return m_repr.hvector.vec.size();
}
void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > kInlineSize) {
move_to_heap_vector_with_reserved_size(sz);
}
} else {
m_repr.hvector.vec.reserve(sz);
}
}
bool operator[](std::size_t idx) const {
if (is_inline()) {
return inline_index(idx);
}
assert(idx < m_repr.hvector.vec.size());
return m_repr.hvector.vec[idx];
}
void push_back(bool b) {
if (is_inline()) {
auto &ha = m_repr.iarray;
if (ha.size == kInlineSize) {
move_to_heap_vector_with_reserved_size(kInlineSize + 1);
push_back_slow_path(b);
} else {
assert(ha.size < kInlineSize);
const auto wbi = word_and_bit_index(ha.size++);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
if (b) {
ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
} else {
ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
}
assert(operator[](ha.size - 1) == b);
}
} else {
push_back_slow_path(b);
}
}
void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
private:
struct WordAndBitIndex {
std::size_t word;
std::size_t bit;
};
static WordAndBitIndex word_and_bit_index(std::size_t idx) {
return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
}
bool inline_index(std::size_t idx) const {
const auto wbi = word_and_bit_index(idx);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
return m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit);
}
PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
auto &inline_arr = m_repr.iarray;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
hv.vec.push_back(inline_index(ii));
}
new (&m_repr.hvector) heap_vector(std::move(hv));
}
PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); }
static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
static constexpr auto kInlineSize = kWords * kBitsPerWord;
using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
repr_type m_repr;
bool is_inline() const { return m_repr.is_inline(); }
};
PYBIND11_NAMESPACE_END(detail)
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)

View File

@@ -1048,13 +1048,14 @@ protected:
}
#endif
std::vector<bool> second_pass_convert;
args_convert_vector<arg_vector_small_size> second_pass_convert;
if (overloaded) {
// We're in the first no-convert pass, so swap out the conversion flags for a
// set of all-false flags. If the call fails, we'll swap the flags back in for
// the conversion-allowed call below.
second_pass_convert.resize(func.nargs, false);
call.args_convert.swap(second_pass_convert);
second_pass_convert = std::move(call.args_convert);
call.args_convert
= args_convert_vector<arg_vector_small_size>(func.nargs, false);
}
// 6. Call the function.

View File

@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
# Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
add_subdirectory(pure_cpp)
# Test embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_embed)
# Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_with_catch)
# Test CMake build using functions and targets from subdirectory or installed location
add_subdirectory(test_cmake_build)

View File

@@ -76,6 +76,7 @@ conduit_headers = {
}
detail_headers = {
"include/pybind11/detail/argument_vector.h",
"include/pybind11/detail/class.h",
"include/pybind11/detail/common.h",
"include/pybind11/detail/cpp_conduit.h",

View File

@@ -26,11 +26,11 @@ add_custom_target(
DEPENDS test_subdirectory_embed)
# Test custom export group -- PYBIND11_EXPORT_NAME
add_library(test_embed_lib ../embed.cpp)
target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
add_library(test_with_catch_lib ../embed.cpp)
target_link_libraries(test_with_catch_lib PRIVATE pybind11::embed)
install(
TARGETS test_embed_lib
TARGETS test_with_catch_lib
EXPORT test_export
ARCHIVE DESTINATION bin
LIBRARY DESTINATION lib

View File

@@ -33,10 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER)
-DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
endif()
add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
pybind11_enable_warnings(test_embed)
add_executable(test_with_catch catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
test_interpreter.cpp test_subinterpreter.cpp)
pybind11_enable_warnings(test_with_catch)
target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
target_link_libraries(test_with_catch PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
@@ -44,8 +45,8 @@ endif()
add_custom_target(
cpptest
COMMAND "$<TARGET_FILE:test_embed>"
DEPENDS test_embed
COMMAND "$<TARGET_FILE:test_with_catch>"
DEPENDS test_with_catch
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
pybind11_add_module(external_module THIN_LTO external_module.cpp)

View File

@@ -19,7 +19,7 @@ namespace py = pybind11;
int main(int argc, char *argv[]) {
// Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number:
std::string updated_pythonpath("pybind11_test_embed_PYTHONPATH_2099743835476552");
std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552");
const char *preexisting_pythonpath = getenv("PYTHONPATH");
if (preexisting_pythonpath != nullptr) {
#if defined(_WIN32)

View File

@@ -0,0 +1,80 @@
#include "pybind11/pybind11.h"
#include "catch.hpp"
namespace py = pybind11;
using args_convert_vector = py::detail::args_convert_vector<py::detail::arg_vector_small_size>;
namespace {
// Builds the shared list of sample containers: one default-constructed
// container, then a (count, false) and a (count, true) container for each
// sample count, in that order.
template <typename Container>
std::vector<Container> get_sample_vectors() {
    const std::size_t sample_counts[] = {0, 4, 5, 6, 31, 32, 33, 63, 64, 65};

    std::vector<Container> samples;
    samples.emplace_back();
    for (const std::size_t count : sample_counts) {
        samples.emplace_back(count, false);
        samples.emplace_back(count, true);
    }
    return samples;
}
// Requires that `actual` has the same length as `expected` and that every
// flag matches, element by element.
void require_vector_matches_sample(const args_convert_vector &actual,
                                   const std::vector<bool> &expected) {
    REQUIRE(actual.size() == expected.size());
    size_t ii = 0;
    while (ii < actual.size()) {
        REQUIRE(actual[ii] == expected[ii]);
        ++ii;
    }
}
// Applies the same mutation to each args_convert_vector sample and to the
// std::vector<bool> built from the same inputs, then requires that the two
// still agree.
template <typename ActualMutationFunc, typename ExpectedMutationFunc>
void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
                                ExpectedMutationFunc expected_mutation_func) {
    auto reference_samples = get_sample_vectors<std::vector<bool>>();
    auto tested_samples = get_sample_vectors<args_convert_vector>();
    const size_t sample_count = tested_samples.size();
    for (size_t idx = 0; idx != sample_count; ++idx) {
        actual_mutation_func(tested_samples[idx]);
        expected_mutation_func(reference_samples[idx]);
        require_vector_matches_sample(tested_samples[idx], reference_samples[idx]);
    }
}
} // namespace
// MUTATION_LAMBDA(capture, block) expands to TWO comma-separated lambdas with
// the same capture list and body: one taking args_convert_vector&, one taking
// std::vector<bool>&. They are meant to be passed as the two arguments of
// mutation_test_with_samples. A single generic lambda ([capture](auto& vec))
// would be nicer, but we have to work with C++11, which doesn't have generic
// lambdas.
// Note: `block` is a macro argument, so it must not contain a top-level comma.
// NOLINTBEGIN(bugprone-macro-parentheses)
#define MUTATION_LAMBDA(capture, block) \
[capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
// NOLINTEND(bugprone-macro-parentheses)
// Expands to nothing; used as the `capture` argument for an empty capture
// list. For readability, rather than having ugly empty arguments.
#define NO_CAPTURE
// Baseline: with a no-op mutation, every args_convert_vector sample must
// already match the std::vector<bool> built from the same constructor inputs.
TEST_CASE("check sample args_convert_vector contents") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
}
// Appends one flag (first false, then true) to every sample and checks the
// result against std::vector<bool>.
TEST_CASE("args_convert_vector push_back") {
for (const bool b : {false, true}) {
mutation_test_with_samples(MUTATION_LAMBDA(b, { vec.push_back(b); }));
}
}
// reserve() must never change the visible contents. The requested sizes
// include values around 32 and 64 so that, whichever word size is in use,
// some requests exceed the inline bit capacity and exercise the
// inline-to-heap move inside reserve(), not just the no-op path.
TEST_CASE("args_convert_vector reserve") {
    for (const auto sz : {0, 1, 2, 3, 31, 32, 33, 63, 64, 65}) {
        const auto ii = static_cast<std::size_t>(sz);
        mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
    }
}
// Reserve-then-fill is the usage pattern this container is built for. The
// requested sizes include values around 32 and 64 so that some reservations
// exceed the inline bit capacity and the following push_back lands in heap
// storage.
TEST_CASE("args_convert_vector reserve then push_back") {
    for (const auto sz : {0, 1, 2, 3, 31, 32, 33, 63, 64, 65}) {
        const auto ii = static_cast<std::size_t>(sz);
        for (const bool b : {false, true}) {
            // `=` capture: two variables are needed, and a comma in the
            // capture list would split the macro argument.
            mutation_test_with_samples(MUTATION_LAMBDA(=, {
                vec.reserve(ii);
                vec.push_back(b);
            }));
        }
    }
}

View File

@@ -0,0 +1,94 @@
#include "pybind11/pybind11.h"
#include "catch.hpp"
namespace py = pybind11;
// 2 is chosen because it is the smallest number (keeping tests short)
// where we can create non-empty vectors whose size is the inline size
// plus or minus 1.
using argument_vector = py::detail::argument_vector<2>;
namespace {
// Copies the handles of `v`, in order, into a fresh argument_vector after
// reserving room for all of them.
argument_vector to_argument_vector(const std::vector<py::handle> &v) {
    argument_vector converted;
    converted.reserve(v.size());
    for (auto it = v.begin(); it != v.end(); ++it) {
        converted.push_back(*it);
    }
    return converted;
}
// Reference contents for the samples: handle lists of length 0, 1, 2 and 3,
// each one extending the previous with None, False, True respectively.
std::vector<std::vector<py::handle>> get_sample_argument_vector_contents() {
    const py::handle none_handle(Py_None);
    const py::handle false_handle(Py_False);
    const py::handle true_handle(Py_True);

    std::vector<std::vector<py::handle>> contents;
    contents.push_back({});
    contents.push_back({none_handle});
    contents.push_back({none_handle, false_handle});
    contents.push_back({none_handle, false_handle, true_handle});
    return contents;
}
std::vector<argument_vector> get_sample_argument_vectors() {
std::vector<argument_vector> result;
for (const auto &vec : get_sample_argument_vector_contents()) {
result.push_back(to_argument_vector(vec));
}
return result;
}
// Requires that `actual` has the same length as `expected` and that each
// position refers to the same underlying object pointer.
void require_vector_matches_sample(const argument_vector &actual,
                                   const std::vector<py::handle> &expected) {
    REQUIRE(actual.size() == expected.size());
    size_t ii = 0;
    while (ii < actual.size()) {
        REQUIRE(actual[ii].ptr() == expected[ii].ptr());
        ++ii;
    }
}
// Applies the same mutation to each argument_vector sample and to its
// std::vector<py::handle> reference, then requires that the two still agree.
template <typename ActualMutationFunc, typename ExpectedMutationFunc>
void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
                                ExpectedMutationFunc expected_mutation_func) {
    auto reference_samples = get_sample_argument_vector_contents();
    auto tested_samples = get_sample_argument_vectors();
    const size_t sample_count = tested_samples.size();
    for (size_t idx = 0; idx != sample_count; ++idx) {
        actual_mutation_func(tested_samples[idx]);
        expected_mutation_func(reference_samples[idx]);
        require_vector_matches_sample(tested_samples[idx], reference_samples[idx]);
    }
}
} // namespace
// MUTATION_LAMBDA(capture, block) expands to TWO comma-separated lambdas with
// the same capture list and body: one taking argument_vector&, one taking
// std::vector<py::handle>&. They are meant to be passed as the two arguments
// of mutation_test_with_samples. A single generic lambda
// ([capture](auto& vec)) would be nicer, but we have to work with C++11,
// which doesn't have generic lambdas.
// Note: `block` is a macro argument, so it must not contain a top-level comma.
// NOLINTBEGIN(bugprone-macro-parentheses)
#define MUTATION_LAMBDA(capture, block) \
[capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
// NOLINTEND(bugprone-macro-parentheses)
// Expands to nothing; used as the `capture` argument for an empty capture
// list. For readability, rather than having ugly empty arguments.
#define NO_CAPTURE
// Baseline: with a no-op mutation, every argument_vector sample must already
// match the handle list it was built from.
TEST_CASE("check sample argument_vector contents") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
}
// Appends None to every sample via emplace_back and checks the result
// against std::vector<py::handle>. Samples have sizes 0..3 and the inline
// size under test is 2, so appends both below and at/above the inline
// capacity are covered.
TEST_CASE("argument_vector push_back") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { vec.emplace_back(Py_None); }));
}
// reserve() must never change the visible contents. Requested sizes 0..3
// fall below, at, and above the inline size of 2 used in this file.
TEST_CASE("argument_vector reserve") {
for (std::size_t ii = 0; ii < 4; ++ii) {
mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
}
}
// Reserve-then-fill: reserve 0..3 slots (below, at, and above the inline
// size of 2), then append True, and compare against std::vector<py::handle>.
TEST_CASE("argument_vector reserve then push_back") {
for (std::size_t ii = 0; ii < 4; ++ii) {
mutation_test_with_samples(MUTATION_LAMBDA(ii, {
vec.reserve(ii);
vec.emplace_back(Py_True);
}));
}
}

View File

@@ -94,8 +94,9 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
TEST_CASE("PYTHONPATH is used to update sys.path") {
// The setup for this TEST_CASE is in catch.cpp!
auto sys_path = py::str(py::module_::import("sys").attr("path")).cast<std::string>();
REQUIRE_THAT(sys_path,
Catch::Matchers::Contains("pybind11_test_embed_PYTHONPATH_2099743835476552"));
REQUIRE_THAT(
sys_path,
Catch::Matchers::Contains("pybind11_test_with_catch_PYTHONPATH_2099743835476552"));
}
TEST_CASE("Pass classes and data between modules defined in C++ and Python") {