Avoid heap allocation for function calls with a small number of args (#5824)

* Avoid heap allocation for function calls with a small number of arguments. We don't have access to llvm::SmallVector or similar, but given the limited subset of the `std::vector` API that `function_call::args{,_convert}` need and the "reserve-then-fill" usage pattern, it is relatively straightforward to implement custom containers that get the job done. Seems to improve the time to call the collatz function in pybind/pybind11_benchmark significantly; numbers are a little noisy but there's a clear improvement from "about 60 ns per call" to "about 45 ns per call" on my machine (M4 Max Mac), as measured with `timeit.repeat('collatz(4)', 'from pybind11_benchmark import collatz')`. * clang-tidy * more clang-tidy * clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE * forgot to increase inline size after removing std::variant * constexpr arg_vector_small_size, use move instead of swap to hopefully clarify second_pass_convert * rename test_embed to test_low_level * rename test_low_level to test_with_catch * Be careful to NOINLINE slow paths * rename array/vector members to iarray/hvector. Move comment per request. Add static_asserts for our untagged union implementation per request. * drop is_standard_layout assertions; see https://github.com/pybind/pybind11/pull/5824#issuecomment-3308616072
2026-05-14 02:03:34 +00:00 · 2025-09-19 13:44:40 -07:00
parent 326b10637a
commit 30748f863f
16 changed files with 532 additions and 18 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
 endif()

 set(PYBIND11_HEADERS
+    include/pybind11/detail/argument_vector.h
    include/pybind11/detail/class.h
    include/pybind11/detail/common.h
    include/pybind11/detail/cpp_conduit.h
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -10,6 +10,7 @@

 #pragma once

+#include "detail/argument_vector.h"
 #include "detail/common.h"
 #include "detail/descr.h"
 #include "detail/native_enum_data.h"
@@ -2037,6 +2038,10 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
 // forward declaration (definition in attr.h)
 struct function_record;

+/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
+/// (16 pointers) in size.)
+constexpr std::size_t arg_vector_small_size = 6;
+
 /// Internal data associated with a single function call
 struct function_call {
    function_call(const function_record &f, handle p); // Implementation in attr.h
@@ -2045,10 +2050,10 @@ struct function_call {
    const function_record &func;

    /// Arguments passed to the function:
-    std::vector<handle> args;
+    argument_vector<arg_vector_small_size> args;

    /// The `convert` value the arguments should be loaded with
-    std::vector<bool> args_convert;
+    args_convert_vector<arg_vector_small_size> args_convert;

    /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
    /// present, are also in `args` but without a reference).
--- a/include/pybind11/detail/argument_vector.h
+++ b/include/pybind11/detail/argument_vector.h
@@ -0,0 +1,330 @@
+/*
+    pybind11/detail/argument_vector.h: small_vector-like containers to
+    avoid heap allocation of arguments during function call dispatch.
+
+    Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Shared implementation utility for our small_vector-like containers.
+// We support C++11 and C++14, so we cannot use
+// std::variant. Union with the tag packed next to the inline
+// array's size is smaller anyway, allowing 1 extra handle of
+// inline storage for free. Compare the layouts (1 line per
+// size_t/void*, assuming a 64-bit machine):
+// With variant, total is N + 2 for N >= 2:
+// - variant tag (cannot be packed with the array size)
+// - array size (or first pointer of 3 in std::vector)
+// - N pointers of inline storage (or 2 remaining pointers of std::vector)
+// Custom union, total is N + 1 for N >= 3:
+// - variant tag & array size if applicable
+// - N pointers of inline storage (or 3 pointers of std::vector)
+//
+// NOTE: this is a low-level representational convenience; the two
+// use cases of this union are materially different and in particular
+// have different semantics for inline_array::size. All that is being
+// shared is the memory management behavior.
+template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
+union inline_array_or_vector {
+    struct inline_array {
+        bool is_inline = true; // tag at offset 0; read back by the union's is_inline()
+        std::uint32_t size = 0; // element count; its meaning is defined by the owning container
+        std::array<ArrayT, InlineSize> arr;
+    };
+    struct heap_vector {
+        bool is_inline = false; // shares offset 0 with inline_array::is_inline (asserted below)
+        std::vector<VectorT> vec;
+
+        heap_vector() = default;
+        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
+    };
+
+    inline_array iarray;
+    heap_vector hvector;
+
+    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
+                  "ArrayT must be trivially move constructible");
+    static_assert(std::is_trivially_destructible<ArrayT>::value,
+                  "ArrayT must be trivially destructible");
+
+    inline_array_or_vector() : iarray() {} // starts in inline mode with size 0
+    ~inline_array_or_vector() {
+        if (!is_inline()) { // only the heap representation owns anything to release
+            hvector.~heap_vector();
+        }
+    }
+    // Copying is disabled; only move construction and move assignment are provided.
+    inline_array_or_vector(const inline_array_or_vector &) = delete;
+    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
+
+    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
+        if (rhs.is_inline()) {
+            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray)); // copies tag, size and elements
+        } else {
+            new (&hvector) heap_vector(std::move(rhs.hvector)); // rhs keeps a moved-from vec
+        }
+        assert(is_inline() == rhs.is_inline());
+    }
+
+    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.is_inline()) {
+            if (!is_inline()) {
+                hvector.~heap_vector(); // release our vector before its bytes are overwritten
+            }
+            std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
+        } else {
+            if (is_inline()) {
+                new (&hvector) heap_vector(std::move(rhs.hvector)); // inline -> heap
+            } else {
+                hvector = std::move(rhs.hvector); // heap -> heap: ordinary move assignment
+            }
+        }
+        return *this;
+    }
+
+    bool is_inline() const {
+        // Reading the inactive member of a union directly is undefined
+        // behavior, so the tag is not read through iarray or hvector. Instead
+        // the first sizeof(bool) bytes of the object are copied out with memcpy,
+        // which is well-defined because any object may be examined as bytes. See
+        // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
+        bool result = false;
+        static_assert(offsetof(inline_array, is_inline) == 0,
+                      "untagged union implementation relies on this");
+        static_assert(offsetof(heap_vector, is_inline) == 0,
+                      "untagged union implementation relies on this");
+        std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
+        return result;
+    }
+};
+
+// small_vector-like container to avoid heap allocation for N or fewer
+// arguments.
+template <std::size_t N>
+struct argument_vector {
+public:
+    argument_vector() = default;
+
+    // Move-only type: copying is deleted, moving is defaulted and noexcept.
+    argument_vector(const argument_vector &) = delete;
+    argument_vector &operator=(const argument_vector &) = delete;
+    argument_vector(argument_vector &&) noexcept = default;
+    argument_vector &operator=(argument_vector &&) noexcept = default;
+
+    std::size_t size() const {
+        if (!is_inline()) {
+            return m_repr.hvector.vec.size();
+        }
+        return m_repr.iarray.size;
+    }
+
+    handle &operator[](std::size_t idx) {
+        assert(idx < size());
+        if (!is_inline()) {
+            return m_repr.hvector.vec[idx];
+        }
+        return m_repr.iarray.arr[idx];
+    }
+
+    handle operator[](std::size_t idx) const {
+        assert(idx < size());
+        if (!is_inline()) {
+            return m_repr.hvector.vec[idx];
+        }
+        return m_repr.iarray.arr[idx];
+    }
+
+    // Appends x, spilling the inline elements to the heap once all N inline slots are used.
+    void push_back(handle x) {
+        if (!is_inline()) {
+            push_back_slow_path(x);
+            return;
+        }
+        auto &inline_repr = m_repr.iarray;
+        if (inline_repr.size != N) {
+            inline_repr.arr[inline_repr.size++] = x;
+            return;
+        }
+        move_to_heap_vector_with_reserved_size(N + 1);
+        push_back_slow_path(x);
+    }
+
+    template <typename Arg>
+    void emplace_back(Arg &&x) {
+        push_back(handle(x));
+    }
+
+    // Switches to heap storage only when sz exceeds the inline capacity N.
+    void reserve(std::size_t sz) {
+        if (!is_inline()) {
+            reserve_slow_path(sz);
+        } else if (sz > N) {
+            move_to_heap_vector_with_reserved_size(sz);
+        }
+    }
+
+private:
+    using repr_type = inline_array_or_vector<handle, N>;
+    repr_type m_repr;
+
+    PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        using heap_vector = typename repr_type::heap_vector;
+        auto &ia = m_repr.iarray; // must be fully read before the placement new below
+        heap_vector spilled;
+        spilled.vec.reserve(reserved_size);
+        spilled.vec.insert(spilled.vec.end(), ia.arr.begin(), ia.arr.begin() + ia.size);
+        new (&m_repr.hvector) heap_vector(std::move(spilled));
+    }
+
+    PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); }
+
+    PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); }
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+// small_vector-like container of bool flags, bit-packed into size_t words while
+// inline, to avoid heap allocation for kInlineSize (>= kRequestedInlineSize) or fewer flags.
+template <std::size_t kRequestedInlineSize>
+struct args_convert_vector {
+public:
+    // While inline, flags are bit-packed into std::size_t words (kInlineSize bits in total).
+    args_convert_vector() = default;
+
+    // Disable copy ctor and assignment.
+    args_convert_vector(const args_convert_vector &) = delete;
+    args_convert_vector &operator=(const args_convert_vector &) = delete;
+    args_convert_vector(args_convert_vector &&) noexcept = default;
+    args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
+
+    // Holds `count` copies of `value`; heap-allocates only when count > kInlineSize.
+    args_convert_vector(std::size_t count, bool value) {
+        if (count > kInlineSize) {
+            new (&m_repr.hvector) typename repr_type::heap_vector(count, value);
+        } else {
+            auto &inline_arr = m_repr.iarray;
+            inline_arr.arr.fill(value ? std::size_t(-1) : 0); // every bit set, or every bit clear
+            inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
+        }
+    }
+
+    std::size_t size() const {
+        if (is_inline()) {
+            return m_repr.iarray.size; // number of flags (bits) in use, not number of words
+        }
+        return m_repr.hvector.vec.size();
+    }
+
+    // Moves to heap storage only when `sz` exceeds the inline capacity.
+    void reserve(std::size_t sz) {
+        if (!is_inline()) {
+            m_repr.hvector.vec.reserve(sz);
+        } else if (sz > kInlineSize) {
+            move_to_heap_vector_with_reserved_size(sz);
+        }
+    }
+
+    // Returns flag `idx`. The bounds check covers both representations; without it an
+    // out-of-range read in inline mode would silently return a stale bit.
+    bool operator[](std::size_t idx) const {
+        assert(idx < size());
+        return is_inline() ? inline_index(idx) : m_repr.hvector.vec[idx];
+    }
+
+    void push_back(bool b) {
+        if (is_inline()) {
+            auto &ha = m_repr.iarray;
+            if (ha.size == kInlineSize) { // inline bits exhausted: spill to the heap first
+                move_to_heap_vector_with_reserved_size(kInlineSize + 1);
+                push_back_slow_path(b);
+            } else {
+                assert(ha.size < kInlineSize);
+                const auto wbi = word_and_bit_index(ha.size++);
+                assert(wbi.word < kWords);
+                assert(wbi.bit < kBitsPerWord);
+                if (b) {
+                    ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
+                } else {
+                    ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit); // slot may hold a stale 1
+                }
+                assert(operator[](ha.size - 1) == b);
+            }
+        } else {
+            push_back_slow_path(b);
+        }
+    }
+
+    void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
+
+private:
+    struct WordAndBitIndex {
+        std::size_t word;
+        std::size_t bit;
+    };
+
+    static WordAndBitIndex word_and_bit_index(std::size_t idx) {
+        return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
+    }
+
+    bool inline_index(std::size_t idx) const {
+        const auto wbi = word_and_bit_index(idx);
+        assert(wbi.word < kWords);
+        assert(wbi.bit < kBitsPerWord);
+        return (m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit)) != 0;
+    }
+
+    PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline()); // same precondition as argument_vector's counterpart
+        auto &inline_arr = m_repr.iarray;
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector hv;
+        hv.vec.reserve(reserved_size);
+        for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
+            hv.vec.push_back(inline_index(ii)); // unpack each bit before the storage is reused
+        }
+        new (&m_repr.hvector) heap_vector(std::move(hv));
+    }
+
+    PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); }
+
+    static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
+    static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
+    static constexpr auto kInlineSize = kWords * kBitsPerWord; // requested size, rounded up
+
+    using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
+    repr_type m_repr;
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -1048,13 +1048,14 @@ protected:
                }
 #endif

-                std::vector<bool> second_pass_convert;
+                args_convert_vector<arg_vector_small_size> second_pass_convert;
                if (overloaded) {
                    // We're in the first no-convert pass, so swap out the conversion flags for a
                    // set of all-false flags.  If the call fails, we'll swap the flags back in for
                    // the conversion-allowed call below.
-                    second_pass_convert.resize(func.nargs, false);
-                    call.args_convert.swap(second_pass_convert);
+                    second_pass_convert = std::move(call.args_convert);
+                    call.args_convert
+                        = args_convert_vector<arg_vector_small_size>(func.nargs, false);
                }

                // 6. Call the function.
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
  # Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
  add_subdirectory(pure_cpp)

-  # Test embedding the interpreter. Provides the `cpptest` target.
-  add_subdirectory(test_embed)
+  # Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
+  add_subdirectory(test_with_catch)

  # Test CMake build using functions and targets from subdirectory or installed location
  add_subdirectory(test_cmake_build)
--- a/tests/extra_python_package/test_files.py
+++ b/tests/extra_python_package/test_files.py
@@ -76,6 +76,7 @@ conduit_headers = {
 }

 detail_headers = {
+    "include/pybind11/detail/argument_vector.h",
    "include/pybind11/detail/class.h",
    "include/pybind11/detail/common.h",
    "include/pybind11/detail/cpp_conduit.h",
--- a/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
+++ b/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -26,11 +26,11 @@ add_custom_target(
  DEPENDS test_subdirectory_embed)

 # Test custom export group -- PYBIND11_EXPORT_NAME
-add_library(test_embed_lib ../embed.cpp)
-target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
+add_library(test_with_catch_lib ../embed.cpp)
+target_link_libraries(test_with_catch_lib PRIVATE pybind11::embed)

 install(
-  TARGETS test_embed_lib
+  TARGETS test_with_catch_lib
  EXPORT test_export
  ARCHIVE DESTINATION bin
  LIBRARY DESTINATION lib
--- a/tests/test_with_catch/CMakeLists.txt
+++ b/tests/test_with_catch/CMakeLists.txt
@@ -33,10 +33,11 @@ if(PYBIND11_TEST_SMART_HOLDER)
    -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
 endif()

-add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
-pybind11_enable_warnings(test_embed)
+add_executable(test_with_catch catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
+                               test_interpreter.cpp test_subinterpreter.cpp)
+pybind11_enable_warnings(test_with_catch)

-target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
+target_link_libraries(test_with_catch PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)

 if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
  file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
@@ -44,8 +45,8 @@ endif()

 add_custom_target(
  cpptest
-  COMMAND "$<TARGET_FILE:test_embed>"
-  DEPENDS test_embed
+  COMMAND "$<TARGET_FILE:test_with_catch>"
+  DEPENDS test_with_catch
  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")

 pybind11_add_module(external_module THIN_LTO external_module.cpp)
--- a/tests/test_with_catch/catch.cpp
+++ b/tests/test_with_catch/catch.cpp
@@ -19,7 +19,7 @@ namespace py = pybind11;

 int main(int argc, char *argv[]) {
    // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number:
-    std::string updated_pythonpath("pybind11_test_embed_PYTHONPATH_2099743835476552");
+    std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552");
    const char *preexisting_pythonpath = getenv("PYTHONPATH");
    if (preexisting_pythonpath != nullptr) {
 #if defined(_WIN32)
--- a/tests/test_with_catch/external_module.cpp
+++ b/tests/test_with_catch/external_module.cpp
--- a/tests/test_with_catch/test_args_convert_vector.cpp
+++ b/tests/test_with_catch/test_args_convert_vector.cpp
@@ -0,0 +1,80 @@
+#include "pybind11/pybind11.h"
+#include "catch.hpp"
+
+namespace py = pybind11;
+
+using args_convert_vector = py::detail::args_convert_vector<py::detail::arg_vector_small_size>;
+
+namespace {
+template <typename Container>
+std::vector<Container> get_sample_vectors() {
+    std::vector<Container> samples;
+    samples.emplace_back(); // the default-constructed container is always sample 0
+    for (const auto count : {0, 4, 5, 6, 31, 32, 33, 63, 64, 65}) {
+        const auto length = static_cast<std::size_t>(count);
+        samples.emplace_back(length, false);
+        samples.emplace_back(length, true);
+    }
+    return samples;
+}
+
+void require_vector_matches_sample(const args_convert_vector &actual,
+                                   const std::vector<bool> &expected) {
+    REQUIRE(actual.size() == expected.size()); // compare sizes before comparing elements
+    for (size_t ii = 0; ii < actual.size(); ++ii) {
+        REQUIRE(actual[ii] == expected[ii]); // flag-by-flag comparison against the reference
+    }
+}
+
+template <typename ActualMutationFunc, typename ExpectedMutationFunc>
+void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
+                                ExpectedMutationFunc expected_mutation_func) {
+    // Both lists are built from the same recipe, so entry k of one mirrors entry k of the other.
+    auto references = get_sample_vectors<std::vector<bool>>();
+    auto candidates = get_sample_vectors<args_convert_vector>();
+    for (size_t k = 0; k != candidates.size(); ++k) {
+        args_convert_vector &candidate = candidates[k];
+        std::vector<bool> &reference = references[k];
+        actual_mutation_func(candidate);
+        expected_mutation_func(reference);
+        require_vector_matches_sample(candidate, reference);
+    }
+}
+} // namespace
+
+// I would like to write [capture](auto& vec) block inline, but we
+// have to work with C++11, which doesn't have generic lambdas.
+// NOLINTBEGIN(bugprone-macro-parentheses)
+#define MUTATION_LAMBDA(capture, block)                                                           \
+    [capture](args_convert_vector & vec) block, [capture](std::vector<bool> & vec) block
+// NOLINTEND(bugprone-macro-parentheses)
+
+// For readability, rather than having ugly empty arguments.
+#define NO_CAPTURE
+
+TEST_CASE("check sample args_convert_vector contents") { // no-op mutation on every sample
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
+}
+
+TEST_CASE("args_convert_vector push_back") {
+    for (const bool b : {false, true}) { // append each flag value to every sample
+        mutation_test_with_samples(MUTATION_LAMBDA(b, { vec.push_back(b); }));
+    }
+}
+
+TEST_CASE("args_convert_vector reserve") { // 65 and 129 exceed the one-word inline capacity
+    for (const std::size_t ii : std::initializer_list<std::size_t>{0, 3, 65, 129}) {
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
+    }
+}
+
+TEST_CASE("args_convert_vector reserve then push_back") { // 65/129 force the heap move
+    for (const std::size_t ii : std::initializer_list<std::size_t>{0, 3, 65, 129}) {
+        for (const bool b : {false, true}) {
+            mutation_test_with_samples(MUTATION_LAMBDA(=, {
+                vec.reserve(ii);
+                vec.push_back(b);
+            }));
+        }
+    }
+}
--- a/tests/test_with_catch/test_argument_vector.cpp
+++ b/tests/test_with_catch/test_argument_vector.cpp
@@ -0,0 +1,94 @@
+#include "pybind11/pybind11.h"
+#include "catch.hpp"
+
+namespace py = pybind11;
+
+// 2 is chosen because it is the smallest number (keeping tests short)
+// where we can create non-empty vectors whose size is the inline size
+// plus or minus 1.
+using argument_vector = py::detail::argument_vector<2>;
+
+namespace {
+argument_vector to_argument_vector(const std::vector<py::handle> &v) {
+    argument_vector converted;
+    converted.reserve(v.size()); // reserve first, then fill
+    for (std::size_t pos = 0; pos != v.size(); ++pos) {
+        converted.push_back(v[pos]);
+    }
+    return converted;
+}
+
+std::vector<std::vector<py::handle>> get_sample_argument_vector_contents() {
+    // Lengths 0 through 3: entry k holds the first k of None, False, True.
+    std::vector<std::vector<py::handle>> contents(4);
+    contents[1] = {py::handle(Py_None)};
+    contents[2] = {py::handle(Py_None), py::handle(Py_False)};
+    contents[3] = {py::handle(Py_None), py::handle(Py_False), py::handle(Py_True)};
+    return contents;
+}
+
+std::vector<argument_vector> get_sample_argument_vectors() {
+    std::vector<argument_vector> converted;
+    for (const std::vector<py::handle> &handles : get_sample_argument_vector_contents()) {
+        converted.emplace_back(to_argument_vector(handles));
+    }
+    return converted;
+}
+
+void require_vector_matches_sample(const argument_vector &actual,
+                                   const std::vector<py::handle> &expected) {
+    REQUIRE(actual.size() == expected.size()); // compare sizes before comparing elements
+    for (size_t ii = 0; ii < actual.size(); ++ii) {
+        REQUIRE(actual[ii].ptr() == expected[ii].ptr()); // compares the values returned by ptr()
+    }
+}
+
+template <typename ActualMutationFunc, typename ExpectedMutationFunc>
+void mutation_test_with_samples(ActualMutationFunc actual_mutation_func,
+                                ExpectedMutationFunc expected_mutation_func) {
+    // Entry k of each list describes the same contents, so they can be mutated in lockstep.
+    auto references = get_sample_argument_vector_contents();
+    auto candidates = get_sample_argument_vectors();
+    for (size_t k = 0; k != candidates.size(); ++k) {
+        argument_vector &candidate = candidates[k];
+        std::vector<py::handle> &reference = references[k];
+        actual_mutation_func(candidate);
+        expected_mutation_func(reference);
+        require_vector_matches_sample(candidate, reference);
+    }
+}
+
+} // namespace
+
+// I would like to write [capture](auto& vec) block inline, but we
+// have to work with C++11, which doesn't have generic lambdas.
+// NOLINTBEGIN(bugprone-macro-parentheses)
+#define MUTATION_LAMBDA(capture, block)                                                           \
+    [capture](argument_vector & vec) block, [capture](std::vector<py::handle> & vec) block
+// NOLINTEND(bugprone-macro-parentheses)
+
+// For readability, rather than having ugly empty arguments.
+#define NO_CAPTURE
+
+TEST_CASE("check sample argument_vector contents") { // no-op mutation on every sample
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { (void) vec; }));
+}
+
+TEST_CASE("argument_vector push_back") { // reached through emplace_back
+    mutation_test_with_samples(MUTATION_LAMBDA(NO_CAPTURE, { vec.emplace_back(Py_None); }));
+}
+
+TEST_CASE("argument_vector reserve") {
+    for (std::size_t ii = 0; ii < 4; ++ii) { // 0..3: below, at and above the inline size of 2
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, { vec.reserve(ii); }));
+    }
+}
+
+TEST_CASE("argument_vector reserve then push_back") {
+    for (std::size_t ii = 0; ii < 4; ++ii) { // 0..3: below, at and above the inline size of 2
+        mutation_test_with_samples(MUTATION_LAMBDA(ii, {
+            vec.reserve(ii);
+            vec.emplace_back(Py_True);
+        }));
+    }
+}
--- a/tests/test_with_catch/test_interpreter.cpp
+++ b/tests/test_with_catch/test_interpreter.cpp
@@ -94,8 +94,9 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
 TEST_CASE("PYTHONPATH is used to update sys.path") {
    // The setup for this TEST_CASE is in catch.cpp!
    auto sys_path = py::str(py::module_::import("sys").attr("path")).cast<std::string>();
-    REQUIRE_THAT(sys_path,
-                 Catch::Matchers::Contains("pybind11_test_embed_PYTHONPATH_2099743835476552"));
+    REQUIRE_THAT(
+        sys_path,
+        Catch::Matchers::Contains("pybind11_test_with_catch_PYTHONPATH_2099743835476552"));
 }

 TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
--- a/tests/test_with_catch/test_interpreter.py
+++ b/tests/test_with_catch/test_interpreter.py
--- a/tests/test_with_catch/test_subinterpreter.cpp
+++ b/tests/test_with_catch/test_subinterpreter.cpp
--- a/tests/test_with_catch/test_trampoline.py
+++ b/tests/test_with_catch/test_trampoline.py