Avoid heap allocation for function calls with a small number of args (#5824)

* Avoid heap allocation for function calls with a small number of arguments

We don't have access to llvm::SmallVector or similar, but given the
limited subset of the `std::vector` API that
`function_call::args{,_convert}` need and the "reserve-then-fill"
usage pattern, it is relatively straightforward to implement custom
containers that get the job done.

Seems to improve the time to call the collatz function in
pybind/pybind11_benchmark significantly; numbers are a little noisy
but there's a clear improvement from "about 60 ns per call" to "about
45 ns per call" on my machine (M4 Max Mac), as measured with
`timeit.repeat('collatz(4)', 'from pybind11_benchmark import
collatz')`.

* clang-tidy

* more clang-tidy

* clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE

* forgot to increase inline size after removing std::variant

* constexpr arg_vector_small_size, use move instead of swap to hopefully clarify second_pass_convert

* rename test_embed to test_low_level

* rename test_low_level to test_with_catch

* Be careful to NOINLINE slow paths

* rename array/vector members to iarray/hvector. Move comment per request. Add static_asserts for our untagged union implementation per request.

* drop is_standard_layout assertions; see https://github.com/pybind/pybind11/pull/5824#issuecomment-3308616072
This commit is contained in:
Scott Wolchok
2025-09-19 13:44:40 -07:00
committed by GitHub
parent 326b10637a
commit 30748f863f
16 changed files with 532 additions and 18 deletions

View File

@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
# Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
add_subdirectory(pure_cpp)
# Test embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_embed)
# Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_with_catch)
# Test CMake build using functions and targets from subdirectory or installed location
add_subdirectory(test_cmake_build)

View File

@@ -76,6 +76,7 @@ conduit_headers = {
}
detail_headers = {
"include/pybind11/detail/argument_vector.h",
"include/pybind11/detail/class.h",
"include/pybind11/detail/common.h",
"include/pybind11/detail/cpp_conduit.h",

View File

@@ -26,11 +26,11 @@ add_custom_target(
DEPENDS test_subdirectory_embed)
# Test custom export group -- PYBIND11_EXPORT_NAME
add_library(test_embed_lib ../embed.cpp)
target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
add_library(test_with_catch_lib ../embed.cpp)
target_link_libraries(test_with_catch_lib PRIVATE pybind11::embed)
install(
TARGETS test_embed_lib
TARGETS test_with_catch_lib
EXPORT test_export
ARCHIVE DESTINATION bin
LIBRARY DESTINATION lib

View File

@@ -33,10 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER)
-DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
endif()
add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
pybind11_enable_warnings(test_embed)
add_executable(test_with_catch catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
test_interpreter.cpp test_subinterpreter.cpp)
pybind11_enable_warnings(test_with_catch)
target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
target_link_libraries(test_with_catch PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
@@ -44,8 +45,8 @@ endif()
add_custom_target(
cpptest
COMMAND "$<TARGET_FILE:test_embed>"
DEPENDS test_embed
COMMAND "$<TARGET_FILE:test_with_catch>"
DEPENDS test_with_catch
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
pybind11_add_module(external_module THIN_LTO external_module.cpp)

View File

@@ -19,7 +19,7 @@ namespace py = pybind11;
int main(int argc, char *argv[]) {
// Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number:
std::string updated_pythonpath("pybind11_test_embed_PYTHONPATH_2099743835476552");
std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552");
const char *preexisting_pythonpath = getenv("PYTHONPATH");
if (preexisting_pythonpath != nullptr) {
#if defined(_WIN32)

View File

@@ -0,0 +1,80 @@
#include "pybind11/pybind11.h"
#include "catch.hpp"
namespace py = pybind11;
using args_convert_vector = py::detail::args_convert_vector<py::detail::arg_vector_small_size>;
namespace {
// Produces the sample containers shared by the tests in this file: one
// default-constructed container first, then, for every sample size, a
// container filled with `false` followed by one filled with `true`.
// `Container` must be default-constructible and constructible from
// (std::size_t count, bool value).
template <typename Container>
std::vector<Container> get_sample_vectors() {
    const int sample_sizes[] = {0, 4, 5, 6, 31, 32, 33, 63, 64, 65};
    const bool fill_values[] = {false, true};

    std::vector<Container> samples;
    samples.emplace_back();
    for (const int count : sample_sizes) {
        for (const bool fill : fill_values) {
            samples.emplace_back(static_cast<std::size_t>(count), fill);
        }
    }
    return samples;
}
// Checks, via Catch's REQUIRE, that `actual` and `expected` have the same
// size and hold equal values at every index.
void require_vector_matches_sample(const args_convert_vector &actual,
                                   const std::vector<bool> &expected) {
    REQUIRE(actual.size() == expected.size());
    for (size_t ii = 0, count = actual.size(); ii != count; ++ii) {
        REQUIRE(actual[ii] == expected[ii]);
    }
}
// Builds the same set of samples twice -- once as std::vector<bool> (the
// reference) and once as args_convert_vector (the container under test) --
// applies the matching mutation to each pair, and requires that the two
// containers still agree afterwards.
template <typename ActualMutationFunc, typename ExpectedMutationFunc>
void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
                                ExpectedMutationFunc expected_mutation_func) {
    auto reference_samples = get_sample_vectors<std::vector<bool>>();
    auto tested_samples = get_sample_vectors<args_convert_vector>();
    size_t idx = 0;
    for (auto &actual : tested_samples) {
        auto &expected = reference_samples[idx++];
        actual_mutation_func(actual);
        expected_mutation_func(expected);
        require_vector_matches_sample(actual, expected);
    }
}
} // namespace
// MUTATION_LAMBDA(capture, block) expands to TWO comma-separated lambdas
// with the same capture list and the same body: the first takes an
// args_convert_vector&, the second a std::vector<bool>&. The pair is meant
// to be passed as the two arguments of mutation_test_with_samples.
// I would like to write [capture](auto& vec) block inline, but we
// have to work with C++11, which doesn't have generic lambdas.
// `block` is pasted verbatim as a lambda body, so it cannot be parenthesized;
// hence the lint suppression.
// NOLINTBEGIN(bugprone-macro-parentheses)
#define MUTATION_LAMBDA(capture, block) \
[capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
// NOLINTEND(bugprone-macro-parentheses)
// Expands to nothing; pass it as `capture` to get an empty capture list.
// For readability, rather than having ugly empty arguments.
#define NO_CAPTURE
// Baseline check: the mutation is a no-op (it only silences the unused
// `vec` warning), so this verifies that the freshly constructed
// args_convert_vector samples match the std::vector<bool> samples.
TEST_CASE("check sample args_convert_vector contents") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
}
// Appends one value (first `false`, then `true`) to every sample and checks
// that args_convert_vector and std::vector<bool> end up with equal contents.
TEST_CASE("args_convert_vector push_back") {
    for (const bool value : {false, true}) {
        mutation_test_with_samples(MUTATION_LAMBDA(value, { vec.push_back(value); }));
    }
}
// Calls reserve() with capacities 0 through 3 on every sample and checks
// that the contents still match the std::vector<bool> reference.
TEST_CASE("args_convert_vector reserve") {
    for (std::size_t capacity = 0; capacity != 4; ++capacity) {
        mutation_test_with_samples(MUTATION_LAMBDA(capacity, { vec.reserve(capacity); }));
    }
}
// Combines the two mutations: for each capacity 0 through 3 and each bool
// value, reserve first and then append the value, checking that the result
// matches the std::vector<bool> reference.
TEST_CASE("args_convert_vector reserve then push_back") {
    for (std::size_t capacity = 0; capacity != 4; ++capacity) {
        for (const bool value : {false, true}) {
            mutation_test_with_samples(MUTATION_LAMBDA(=, {
                vec.reserve(capacity);
                vec.push_back(value);
            }));
        }
    }
}

View File

@@ -0,0 +1,94 @@
#include "pybind11/pybind11.h"
#include "catch.hpp"
namespace py = pybind11;
// 2 is chosen because it is the smallest number (keeping tests short)
// where we can create non-empty vectors whose size is the inline size
// plus or minus 1.
using argument_vector = py::detail::argument_vector<2>;
namespace {
// Copies the handles of `v`, in order, into a new argument_vector, reserving
// room for all of them before the first insertion.
argument_vector to_argument_vector(const std::vector<py::handle> &v) {
    argument_vector converted;
    converted.reserve(v.size());
    for (auto it = v.begin(); it != v.end(); ++it) {
        converted.push_back(*it);
    }
    return converted;
}
// Returns the reference contents for the samples: handle lists of length
// 0, 1, 2 and 3, built from Py_None, Py_False and Py_True in that order.
std::vector<std::vector<py::handle>> get_sample_argument_vector_contents() {
    const py::handle none_handle(Py_None);
    const py::handle false_handle(Py_False);
    const py::handle true_handle(Py_True);

    std::vector<std::vector<py::handle>> contents;
    contents.emplace_back();
    contents.push_back(std::vector<py::handle>{none_handle});
    contents.push_back(std::vector<py::handle>{none_handle, false_handle});
    contents.push_back(std::vector<py::handle>{none_handle, false_handle, true_handle});
    return contents;
}
std::vector<argument_vector> get_sample_argument_vectors() {
std::vector<argument_vector> result;
for (const auto &vec : get_sample_argument_vector_contents()) {
result.push_back(to_argument_vector(vec));
}
return result;
}
// Checks, via Catch's REQUIRE, that `actual` and `expected` have the same
// size and refer to the same underlying object pointer at every index.
void require_vector_matches_sample(const argument_vector &actual,
                                   const std::vector<py::handle> &expected) {
    REQUIRE(actual.size() == expected.size());
    for (size_t ii = 0, count = actual.size(); ii != count; ++ii) {
        REQUIRE(actual[ii].ptr() == expected[ii].ptr());
    }
}
// Builds the samples twice -- once as std::vector<py::handle> (the
// reference) and once as argument_vector (the container under test) --
// applies the matching mutation to each pair, and requires that the two
// containers still agree afterwards.
template <typename ActualMutationFunc, typename ExpectedMutationFunc>
void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
                                ExpectedMutationFunc expected_mutation_func) {
    auto reference_samples = get_sample_argument_vector_contents();
    auto tested_samples = get_sample_argument_vectors();
    size_t idx = 0;
    for (auto &actual : tested_samples) {
        auto &expected = reference_samples[idx++];
        actual_mutation_func(actual);
        expected_mutation_func(expected);
        require_vector_matches_sample(actual, expected);
    }
}
} // namespace
// MUTATION_LAMBDA(capture, block) expands to TWO comma-separated lambdas
// with the same capture list and the same body: the first takes an
// argument_vector&, the second a std::vector<py::handle>&. The pair is meant
// to be passed as the two arguments of mutation_test_with_samples.
// I would like to write [capture](auto& vec) block inline, but we
// have to work with C++11, which doesn't have generic lambdas.
// `block` is pasted verbatim as a lambda body, so it cannot be parenthesized;
// hence the lint suppression.
// NOLINTBEGIN(bugprone-macro-parentheses)
#define MUTATION_LAMBDA(capture, block) \
[capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
// NOLINTEND(bugprone-macro-parentheses)
// Expands to nothing; pass it as `capture` to get an empty capture list.
// For readability, rather than having ugly empty arguments.
#define NO_CAPTURE
// Baseline check: the mutation is a no-op (it only silences the unused
// `vec` warning), so this verifies that the freshly constructed
// argument_vector samples match the std::vector<py::handle> samples.
TEST_CASE("check sample argument_vector contents") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
}
// Appends Py_None to every sample and checks that argument_vector and
// std::vector<py::handle> still agree.
// NOTE(review): despite the test name, the mutation calls emplace_back, not
// push_back -- confirm whether push_back was meant to be exercised here.
TEST_CASE("argument_vector push_back") {
mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { vec.emplace_back(Py_None); }));
}
// Calls reserve() with capacities 0 through 3 on every sample and checks
// that the contents still match the std::vector<py::handle> reference.
TEST_CASE("argument_vector reserve") {
    for (std::size_t capacity = 0; capacity != 4; ++capacity) {
        mutation_test_with_samples(MUTATION_LAMBDA(capacity, { vec.reserve(capacity); }));
    }
}
// For each capacity 0 through 3: reserve first, then append Py_True via
// emplace_back, and check that the result matches the
// std::vector<py::handle> reference.
TEST_CASE("argument_vector reserve then push_back") {
    for (std::size_t capacity = 0; capacity != 4; ++capacity) {
        mutation_test_with_samples(MUTATION_LAMBDA(capacity, {
            vec.reserve(capacity);
            vec.emplace_back(Py_True);
        }));
    }
}

View File

@@ -94,8 +94,9 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
TEST_CASE("PYTHONPATH is used to update sys.path") {
// The setup for this TEST_CASE is in catch.cpp!
auto sys_path = py::str(py::module_::import("sys").attr("path")).cast<std::string>();
REQUIRE_THAT(sys_path,
Catch::Matchers::Contains("pybind11_test_embed_PYTHONPATH_2099743835476552"));
REQUIRE_THAT(
sys_path,
Catch::Matchers::Contains("pybind11_test_with_catch_PYTHONPATH_2099743835476552"));
}
TEST_CASE("Pass classes and data between modules defined in C++ and Python") {