mirror of
https://github.com/pybind/pybind11.git
synced 2026-04-19 22:39:09 +00:00
Use thread_local for loader_life_support to improve performance (#5830)
* Use thread_local for loader_life_support to improve performance As explained in a new code comment, `loader_life_support` needs to be `thread_local` but does not need to be isolated to a particular interpreter because any given function call is already going to only happen on a single interpreter by definiton. Performance before: - on M4 Max using pybind/pybind11_benchmark unmodified repo: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 63.8 nsec per loop ``` - Linux server: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 120 nsec per loop ``` After: - M4 Max: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 53.1 nsec per loop ``` - Linux server: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 101 nsec per loop ``` A quick profile with perf shows that pthread_setspecific and pthread_getspecific are gone. Open questions: - How do we determine whether we can safely use `thread_local`? I see concerns about old iOS versions on https://github.com/pybind/pybind11/pull/5705#issuecomment-2922858880 and https://github.com/pybind/pybind11/pull/5709; is there anything else? - Do we have a test that covers "function called in one interpreter calls a C++ function that causes a function call in another interpreter"? I think it's fine, but can it happen? - Are we happy with what we think will happen in the case where multiple extensions compiled with and without this PR interoperate? I think it's fine -- each dispatch pushes and cleans up its own state -- but a second opinion is certainly welcome. * Remove PYBIND11_CAN_USE_THREAD_LOCAL * clarify comment * Simplify loader_life_support TLS storage Replace the `fake_thread_specific_storage` struct with a direct thread-local pointer managed via a function-local static: static loader_life_support *& tls_current_frame() This retains the "stack of frames" behavior via the `parent` link. It also reduces indirection and clarifies intent. Note: this form is C++11-compatible; once pybind11 requires C++17, the helper can be simplified to: inline static thread_local loader_life_support *tls_current_frame = nullptr; * loader_life_support: avoid duplicate tls_current_frame() calls Replace repeated calls with a single local reference: auto &frame = tls_current_frame(); This ensures the thread_local initialization guard is checked only once per constructor/destructor call site, avoids potential clang-tidy complaints, and makes the code more readable. Functional behavior is unchanged. * Add REMINDER for next version bump in internals.h --------- Co-authored-by: Ralf W. Grosse-Kunstleve <rgrossekunst@nvidia.com>
This commit is contained in:
@@ -39,6 +39,7 @@
|
||||
/// further ABI-incompatible changes may be made before the ABI is officially
|
||||
/// changed to the new version.
|
||||
#ifndef PYBIND11_INTERNALS_VERSION
|
||||
// REMINDER for next version bump: remove loader_life_support_tls
|
||||
# define PYBIND11_INTERNALS_VERSION 11
|
||||
#endif
|
||||
|
||||
@@ -260,7 +261,7 @@ struct internals {
|
||||
PyObject *instance_base = nullptr;
|
||||
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined:
|
||||
thread_specific_storage<PyThreadState> tstate;
|
||||
thread_specific_storage<loader_life_support> loader_life_support_tls;
|
||||
thread_specific_storage<loader_life_support> loader_life_support_tls; // OBSOLETE (PR #5830)
|
||||
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined:
|
||||
PyInterpreterState *istate = nullptr;
|
||||
|
||||
|
||||
@@ -42,24 +42,40 @@ PYBIND11_NAMESPACE_BEGIN(detail)
|
||||
/// Adding a patient will keep it alive up until the enclosing function returns.
|
||||
class loader_life_support {
|
||||
private:
|
||||
// Thread-local top-of-stack for loader_life_support frames (linked via parent).
|
||||
// Observation: loader_life_support needs to be thread-local,
|
||||
// but we don't need to go to extra effort to keep it
|
||||
// per-interpreter (i.e., by putting it in internals) since
|
||||
// individual function calls are already isolated to a single
|
||||
// interpreter, even though they could potentially call into a
|
||||
// different interpreter later in the same call chain. This
|
||||
// saves a significant cost per function call spent in
|
||||
// loader_life_support destruction.
|
||||
// Note for future C++17 simplification:
|
||||
// inline static thread_local loader_life_support *tls_current_frame = nullptr;
|
||||
static loader_life_support *&tls_current_frame() {
|
||||
static thread_local loader_life_support *frame_ptr = nullptr;
|
||||
return frame_ptr;
|
||||
}
|
||||
|
||||
loader_life_support *parent = nullptr;
|
||||
std::unordered_set<PyObject *> keep_alive;
|
||||
|
||||
public:
|
||||
/// A new patient frame is created when a function is entered
|
||||
loader_life_support() {
|
||||
auto &stack_top = get_internals().loader_life_support_tls;
|
||||
parent = stack_top.get();
|
||||
stack_top = this;
|
||||
auto &frame = tls_current_frame();
|
||||
parent = frame;
|
||||
frame = this;
|
||||
}
|
||||
|
||||
/// ... and destroyed after it returns
|
||||
~loader_life_support() {
|
||||
auto &stack_top = get_internals().loader_life_support_tls;
|
||||
if (stack_top.get() != this) {
|
||||
auto &frame = tls_current_frame();
|
||||
if (frame != this) {
|
||||
pybind11_fail("loader_life_support: internal error");
|
||||
}
|
||||
stack_top = parent;
|
||||
frame = parent;
|
||||
for (auto *item : keep_alive) {
|
||||
Py_DECREF(item);
|
||||
}
|
||||
@@ -68,7 +84,7 @@ public:
|
||||
/// This can only be used inside a pybind11-bound function, either by `argument_loader`
|
||||
/// at argument preparation time or by `py::cast()` at execution time.
|
||||
PYBIND11_NOINLINE static void add_patient(handle h) {
|
||||
loader_life_support *frame = get_internals().loader_life_support_tls.get();
|
||||
loader_life_support *frame = tls_current_frame();
|
||||
if (!frame) {
|
||||
// NOTE: It would be nice to include the stack frames here, as this indicates
|
||||
// use of pybind11::cast<> outside the normal call framework, finding such
|
||||
|
||||
Reference in New Issue
Block a user