From 729654cf6f1b472f47acc98c3ab4890afbf3f08e Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sat, 13 Dec 2025 20:06:35 +0800 Subject: [PATCH 01/60] Add new argument to `gil_safe_call_once_and_store::call_once_and_store_result` --- include/pybind11/gil_safe_call_once.h | 29 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2abd8fc326..06fb9ef75e 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -8,8 +8,12 @@ #include #include -#ifdef Py_GIL_DISABLED +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include + +using atomic_bool = std::atomic_bool; +#else +using atomic_bool = bool; #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -53,7 +57,8 @@ class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template - gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -61,8 +66,9 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + finalize_fn_ = finalize_fn; // Store the finalizer. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. 
} @@ -83,20 +89,21 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_ && finalize_fn_ != nullptr) { + finalize_fn_(*reinterpret_cast(storage_)); + } + } private: alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; -#ifdef Py_GIL_DISABLED - std::atomic_bool -#else - bool -#endif - is_initialized_{false}; + void (*finalize_fn_)(T &) = nullptr; + // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. + atomic_bool is_initialized_{false}; }; PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From d2b76050a11ed5284903c69f9d9e01054d5754f6 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 01:03:04 +0800 Subject: [PATCH 02/60] Add per-interpreter storage for `gil_safe_call_once_and_store` --- include/pybind11/detail/internals.h | 39 +++++++++++- include/pybind11/gil_safe_call_once.h | 91 ++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 858de67525..d5c4da1acf 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,7 +39,7 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. 
#ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 11 +# define PYBIND11_INTERNALS_VERSION 12 #endif #if PYBIND11_INTERNALS_VERSION < 11 @@ -234,6 +234,34 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + void (*finalize)(T &) = nullptr; + alignas(T) char storage[sizeof(T)] = {0}; + + call_once_storage() = default; + ~call_once_storage() override { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } + memset(storage, 0, sizeof(T)); + finalize = nullptr; + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. 
@@ -283,6 +311,8 @@ struct internals { type_map native_enum_type_map; + std::unordered_map call_once_storage_map; + internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -308,7 +338,12 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() = default; + ~internals() { + for (auto &[_, storage_ptr] : call_once_storage_map) { + delete storage_ptr; + } + call_once_storage_map.clear(); + } }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 06fb9ef75e..a848404eaf 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,6 +3,7 @@ #pragma once #include "detail/common.h" +#include "detail/internals.h" #include "gil.h" #include @@ -52,6 +53,7 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // functions, which is usually the case. // // For in-depth background, see docs/advanced/deadlock.md +#ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT template class gil_safe_call_once_and_store { public: @@ -59,6 +61,7 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. 
@@ -80,10 +83,10 @@ class gil_safe_call_once_and_store { T &get_stored() { assert(is_initialized_); PYBIND11_WARNING_PUSH -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 // Needed for gcc 4.8.5 PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") -#endif +# endif return *reinterpret_cast(storage_); PYBIND11_WARNING_POP } @@ -96,6 +99,7 @@ class gil_safe_call_once_and_store { } private: + // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; void (*finalize_fn_)(T &) = nullptr; @@ -105,5 +109,88 @@ class gil_safe_call_once_and_store { // therefore `std::optional` is not a viable alternative here. atomic_bool is_initialized_{false}; }; +#else +// Subinterpreter support is enabled. +// In this case, we should store the result per-interpreter instead of globally, because +// each subinterpreter has its own separate state. The cached object may not shareable +// across interpreters (e.g., imported modules and their members). +template +class gil_safe_call_once_and_store { +public: + // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. + template + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it == storage_map.end()) { + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + auto s = new detail::call_once_storage{}; + ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. 
+ s->finalize = finalize_fn; + last_storage_ = reinterpret_cast(s->storage); + storage_map.emplace(key, s); + }; + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. + } + // Intentionally not returning `T &` to ensure the calling code is self-documenting. + return *this; + } + // This must only be called after `call_once_and_store_result()` was called. + T &get_stored() { + T *result = last_storage_; + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + assert(it != storage_map.end()); + auto *s = static_cast *>(it->second); + result = last_storage_ = reinterpret_cast(s->storage); + }); + } + assert(result != nullptr); + return *result; + } + + constexpr gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_by_atleast_one_interpreter_) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it != storage_map.end()) { + delete it->second; + storage_map.erase(it); + } + }); + } + } + +private: + // No storage needed when subinterpreter support is enabled. + // The actual storage is stored in the per-interpreter state dict in + // `internals.call_once_storage_map`. + + // Fast local cache to avoid repeated lookups when there are no multiple interpreters. + // This is only valid if there is a single interpreter. Otherwise, it is not used. + T *last_storage_ = nullptr; + // This flag is true if the value has been initialized by any interpreter (may not be the + // current one). 
+ atomic_bool is_initialized_by_atleast_one_interpreter_{false}; +}; +#endif PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From e7417606e8979f948be47d0512bfaf9a21d2953f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 10:56:19 +0800 Subject: [PATCH 03/60] Make `~gil_safe_call_once_and_store` a no-op --- include/pybind11/gil_safe_call_once.h | 57 ++++++++++----------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a848404eaf..10ba995dcc 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -60,8 +60,7 @@ class gil_safe_call_once_and_store { // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, - void (*finalize_fn)(T &) = nullptr) { - + void (*)(T &) /*unused*/ = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -69,9 +68,8 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - finalize_fn_ = finalize_fn; // Store the finalizer. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. 
} @@ -92,17 +90,15 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_ && finalize_fn_ != nullptr) { - finalize_fn_(*reinterpret_cast(storage_)); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; - void (*finalize_fn_)(T &) = nullptr; // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, @@ -124,19 +120,19 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); + auto it = storage_map.find(k); if (it == storage_map.end()) { gil_scoped_release gil_rel; // Needed to establish lock ordering. { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - auto s = new detail::call_once_storage{}; - ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. - s->finalize = finalize_fn; - last_storage_ = reinterpret_cast(s->storage); - storage_map.emplace(key, s); + auto v = new detail::call_once_storage{}; + ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. 
+ v->finalize = finalize_fn; + last_storage_ = reinterpret_cast(v->storage); + storage_map.emplace(k, v); }; } is_initialized_by_atleast_one_interpreter_ = true; @@ -153,12 +149,10 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - assert(it != storage_map.end()); - auto *s = static_cast *>(it->second); - result = last_storage_ = reinterpret_cast(s->storage); + auto *v = static_cast *>(storage_map.at(k)); + result = last_storage_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -166,19 +160,10 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_by_atleast_one_interpreter_) { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - if (it != storage_map.end()) { - delete it->second; - storage_map.erase(it); - } - }); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // No storage needed when subinterpreter support is enabled. 
From 5d1d6782b9fa7cc6f705b6adad1828ac2c66ec5a Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 12:15:30 +0800 Subject: [PATCH 04/60] Fix C++11 compatibility --- include/pybind11/detail/internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index d5c4da1acf..046e47314f 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -339,8 +339,8 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &[_, storage_ptr] : call_once_storage_map) { - delete storage_ptr; + for (auto &entry : call_once_storage_map) { + delete entry.second; } call_once_storage_map.clear(); } From 0bac82df687e2bdd919c653b9ee0d1fecd155fa5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 13:06:15 +0800 Subject: [PATCH 05/60] Improve thread-safety and add default finalizer --- include/pybind11/detail/internals.h | 11 +++-- include/pybind11/gil_safe_call_once.h | 60 +++++++++++++++++---------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 046e47314f..dd0c2af957 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -247,14 +247,17 @@ template struct call_once_storage : call_once_storage_base { void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + std::atomic_bool is_initialized{false}; call_once_storage() = default; ~call_once_storage() override { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } } - memset(storage, 0, sizeof(T)); - finalize = nullptr; }; call_once_storage(const call_once_storage &) = delete; 
call_once_storage(call_once_storage &&) = delete; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 10ba995dcc..5904f97ba4 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -54,6 +54,11 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // // For in-depth background, see docs/advanced/deadlock.md #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +// Subinterpreter support is disabled. +// In this case, we can store the result globally, because there is only a single interpreter. +// +// The life span of the stored result is the entire process lifetime. It is leaked on process +// termination to avoid destructor calls after the Python interpreter was finalized. template class gil_safe_call_once_and_store { public: @@ -107,9 +112,12 @@ class gil_safe_call_once_and_store { }; #else // Subinterpreter support is enabled. -// In this case, we should store the result per-interpreter instead of globally, because -// each subinterpreter has its own separate state. The cached object may not shareable -// across interpreters (e.g., imported modules and their members). +// In this case, we should store the result per-interpreter instead of globally, because each +// subinterpreter has its own separate state. The cached result may not shareable across +// interpreters (e.g., imported modules and their members). +// +// The life span of the stored result is the entire interpreter lifetime. An additional +// `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. 
template class gil_safe_call_once_and_store { public: @@ -117,26 +125,32 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { - detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(k); - if (it == storage_map.end()) { - gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; + if (!is_last_storage_valid()) { + // Multiple threads may enter here, because the GIL is released in the next line and + // CPython API calls in the `fn()` call below may release and reacquire the GIL. + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + gil_scoped_acquire gil_acq; + detail::with_internals([&](detail::internals &internals) { + // The concurrency control is done inside `detail::with_internals`. + // At most one thread will enter here at a time. + const void *k = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here, but only one each at a + // time. So only one thread will create the storage. Other threads will find it + // already created. + auto it = storage_map.find(k); + if (it == storage_map.end()) { auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; last_storage_ = reinterpret_cast(v->storage); + v->is_initialized = true; storage_map.emplace(k, v); - }; - } - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + } // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. 
} // Intentionally not returning `T &` to ensure the calling code is self-documenting. @@ -146,8 +160,7 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { T *result = last_storage_; - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { + if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; @@ -159,13 +172,18 @@ class gil_safe_call_once_and_store { return *result; } - constexpr gil_safe_call_once_and_store() = default; + gil_safe_call_once_and_store() = default; // The instance is a global static, so its destructor runs when the process // is terminating. Therefore, do nothing here because the Python interpreter // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: + bool is_last_storage_valid() const { + return is_initialized_by_atleast_one_interpreter_ + && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + } + // No storage needed when subinterpreter support is enabled. // The actual storage is stored in the per-interpreter state dict in // `internals.call_once_storage_map`. From be971103aad809575d22db6bcc5aa56c8215b2c4 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 16:24:06 +0800 Subject: [PATCH 06/60] Try fix thread-safety --- include/pybind11/gil_safe_call_once.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 5904f97ba4..2bedb6d665 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,6 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. 
gil_scoped_release gil_rel; // Needed to establish lock ordering. { - gil_scoped_acquire gil_acq; detail::with_internals([&](detail::internals &internals) { // The concurrency control is done inside `detail::with_internals`. // At most one thread will enter here at a time. @@ -141,10 +140,11 @@ class gil_safe_call_once_and_store { // already created. auto it = storage_map.find(k); if (it == storage_map.end()) { + gil_scoped_acquire gil_acq; auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; - last_storage_ = reinterpret_cast(v->storage); + last_storage_ptr_ = reinterpret_cast(v->storage); v->is_initialized = true; storage_map.emplace(k, v); } @@ -159,13 +159,13 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { - T *result = last_storage_; + T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ = reinterpret_cast(v->storage); + result = last_storage_ptr_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -181,7 +181,7 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + && detail::get_num_interpreters_seen() <= 1; } // No storage needed when subinterpreter support is enabled. @@ -190,7 +190,7 @@ class gil_safe_call_once_and_store { // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. 
- T *last_storage_ = nullptr; + T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). atomic_bool is_initialized_by_atleast_one_interpreter_{false}; From 3e77ce953a740fe2182af686723901bde05cc2a5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 01:15:46 +0800 Subject: [PATCH 07/60] Try fix thread-safety --- include/pybind11/detail/internals.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index dd0c2af957..b5e9d6eb7c 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,27 +602,26 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - if (get_num_interpreters_seen() > 1) { - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); - } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. 
+ auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); } - return internals_p_tls(); + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } -#endif + return internals_p_tls(); +#else if (!internals_singleton_pp_) { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } return internals_singleton_pp_; +#endif } /// Drop all the references we're currently holding. From d5b8813a66f2b66dcc7419e87d16401356033159 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 14:36:35 +0800 Subject: [PATCH 08/60] Add a warning comment --- include/pybind11/detail/internals.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b5e9d6eb7c..e22e94ffe5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,6 +602,19 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the single-interpreter case. + // + // For multi-interpreter support, the subinterpreters can be initialized concurrently, and + // the first time this function may not be called in the main interpreter. + // For example, a clean main interpreter that does not import any pybind11 module and then + // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a + // pybind11 module concurrently. + // + // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same + // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called + // later. 
+ // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is // slow, so we use the current PyThreadState to check if it is necessary. From f6d0f88bd6a29858e92da1503362bfdab2a86c39 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:16:16 +0800 Subject: [PATCH 09/60] Simplify `PYBIND11_INTERNALS_VERSION >= 12` --- include/pybind11/detail/class.h | 2 -- include/pybind11/detail/internals.h | 11 ++--------- include/pybind11/detail/type_caster_base.h | 10 ++-------- include/pybind11/gil_safe_call_once.h | 1 + include/pybind11/pybind11.h | 4 ---- 5 files changed, 5 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 21e966cfea..1cd9af0bd1 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,14 +226,12 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } -#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index e22e94ffe5..b67b9ce6d4 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -42,8 +42,8 @@ # define PYBIND11_INTERNALS_VERSION 12 #endif -#if PYBIND11_INTERNALS_VERSION < 11 -# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 12 +# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." 
#endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -273,14 +273,12 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif -#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. fast_type_map registered_types_cpp_fast; -#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -306,9 +304,6 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; -#if PYBIND11_INTERNALS_VERSION <= 11 - thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) -#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -396,7 +391,6 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; -#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -407,7 +401,6 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; -#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index b0c59e1138..21b7f0950e 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,32 +227,26 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. 
detail::type_info *type_info = nullptr; auto &internals = get_internals(); -#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; -#endif auto &types = internals.registered_types_cpp; -#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -# ifndef NDEBUG +#ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -# endif +#endif return fast_it->second; } -#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { -#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); -#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2bedb6d665..a0d74bc6f3 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -196,4 +196,5 @@ class gil_safe_call_once_and_store { atomic_bool is_initialized_by_atleast_one_interpreter_{false}; }; #endif + PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 91b38d91ed..8bd62c85c9 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,9 +1692,7 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; -#endif } PYBIND11_WARNING_PUSH @@ -2201,9 +2199,7 @@ class class_ : public detail::generic_type { type_info *const val = 
internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; -#endif } }); } From 7d8339eff5998b33c5455c3f6937756e3168d6fa Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:45:23 +0800 Subject: [PATCH 10/60] Try fix thread-safety --- include/pybind11/detail/internals.h | 4 ++-- include/pybind11/gil_safe_call_once.h | 28 +++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b67b9ce6d4..4abf7d41df 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,8 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; call_once_storage() = default; @@ -337,7 +337,7 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &entry : call_once_storage_map) { + for (const auto &entry : call_once_storage_map) { delete entry.second; } call_once_storage_map.clear(); diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a0d74bc6f3..98e7149947 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -131,22 +131,22 @@ class gil_safe_call_once_and_store { gil_scoped_release gil_rel; // Needed to establish lock ordering. { detail::with_internals([&](detail::internals &internals) { - // The concurrency control is done inside `detail::with_internals`. - // At most one thread will enter here at a time. 
- const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here, but only one each at a - // time. So only one thread will create the storage. Other threads will find it - // already created. - auto it = storage_map.find(k); - if (it == storage_map.end()) { + // There can be multiple threads going through here. + if (storage_map.find(key) == storage_map.end()) { gil_scoped_acquire gil_acq; - auto v = new detail::call_once_storage{}; - ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. - v->finalize = finalize_fn; - last_storage_ptr_ = reinterpret_cast(v->storage); - v->is_initialized = true; - storage_map.emplace(k, v); + // Only one thread will enter here at a time. + // Fast recheck to avoid double work. + if (storage_map.find(key) == storage_map.end()) { + auto value = new detail::call_once_storage{}; + // fn may release, but will reacquire, the GIL. 
+ ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + storage_map.emplace(key, value); + last_storage_ptr_ = reinterpret_cast(value->storage); + } } is_initialized_by_atleast_one_interpreter_ = true; }); From 1920f4345a61acbd444f0d3309a124ac7dee895d Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 19:48:42 +0800 Subject: [PATCH 11/60] Try fix thread-safety --- include/pybind11/detail/internals.h | 3 +- include/pybind11/gil_safe_call_once.h | 50 +++++++++++++++------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4abf7d41df..802a57e3e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,7 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {0}; + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 98e7149947..e00bbb9f06 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -129,28 +129,34 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - if (storage_map.find(key) == storage_map.end()) { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - // Fast recheck to avoid double work. 
- if (storage_map.find(key) == storage_map.end()) { - auto value = new detail::call_once_storage{}; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - storage_map.emplace(key, value); - last_storage_ptr_ = reinterpret_cast(value->storage); - } + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here. + detail::call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new detail::call_once_storage{}; + storage_map.emplace(key, value); } + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); is_initialized_by_atleast_one_interpreter_ = true; }); - } + }); // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. 
@@ -162,10 +168,10 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ptr_ = reinterpret_cast(v->storage); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); }); } assert(result != nullptr); From a6754ba40d2326c3680984b52a6f893cd89d57bd Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 15:57:09 +0800 Subject: [PATCH 12/60] Revert get_pp() --- include/pybind11/detail/internals.h | 56 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 802a57e3e5..c157bf53cb 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -596,39 +596,46 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the single-interpreter case. + // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the + // main interpreter. // // For multi-interpreter support, the subinterpreters can be initialized concurrently, and // the first time this function may not be called in the main interpreter. // For example, a clean main interpreter that does not import any pybind11 module and then // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a // pybind11 module concurrently. 
- // - // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same - // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called - // later. - - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); + if (get_num_interpreters_seen() > 1) { + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. + auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); + } + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + return internals_p_tls(); } - return internals_p_tls(); -#else - if (!internals_singleton_pp_) { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); +#endif + return get_pp_for_main_interpreter(); + } + + /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already + /// exist. May acquire the GIL. Will never return nullptr. + std::unique_ptr *get_pp_for_main_interpreter() { + // This function **assumes** that the current thread is running in the main interpreter. 
+ if (!seen_main_interpreter_) { + std::call_once(seen_main_interpreter_flag_, [&] { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); + seen_main_interpreter_ = true; + }); } return internals_singleton_pp_; -#endif } /// Drop all the references we're currently holding. @@ -705,6 +712,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; std::unique_ptr *internals_singleton_pp_; + + std::once_flag seen_main_interpreter_flag_; + std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set` From 1aed3ab1b4682ab61cd41a00284d5a6f1b63e1d1 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 16:33:14 +0800 Subject: [PATCH 13/60] Update comments --- include/pybind11/detail/internals.h | 9 +++++++-- include/pybind11/gil_safe_call_once.h | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index c157bf53cb..4ff904607a 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -627,19 +627,23 @@ class internals_pp_manager { /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already /// exist. May acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp_for_main_interpreter() { - // This function **assumes** that the current thread is running in the main interpreter. if (!seen_main_interpreter_) { + // The first call to this function **MUST** be from the main interpreter. + // Here we **ASSUME** that the current thread is running in the main interpreter. + // The caller is responsible for ensuring this. 
std::call_once(seen_main_interpreter_flag_, [&] { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); seen_main_interpreter_ = true; }); } + // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -651,6 +655,7 @@ class internals_pp_manager { } void destroy() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -711,8 +716,8 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; + // Pointer to the singleton internals for the main interpreter std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; std::atomic_bool seen_main_interpreter_{false}; }; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index e00bbb9f06..68314c6f8d 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,7 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; // There can be multiple threads going through here. 
detail::call_once_storage *value = nullptr; @@ -168,7 +168,7 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *value = static_cast *>(storage_map.at(key)); result = last_storage_ptr_ = reinterpret_cast(value->storage); From b61e902dce793e2b82b1a0f6e9ba8ffb5c875894 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:02:29 +0800 Subject: [PATCH 14/60] Move call-once storage out of internals --- include/pybind11/detail/internals.h | 41 +------- include/pybind11/gil_safe_call_once.h | 140 +++++++++++++++++++------- 2 files changed, 104 insertions(+), 77 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4ff904607a..11a2ee4c92 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -234,38 +234,6 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; -struct call_once_storage_base { - call_once_storage_base() = default; - virtual ~call_once_storage_base() = default; - call_once_storage_base(const call_once_storage_base &) = delete; - call_once_storage_base(call_once_storage_base &&) = delete; - call_once_storage_base &operator=(const call_once_storage_base &) = delete; - call_once_storage_base &operator=(call_once_storage_base &&) = delete; -}; - -template -struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {}; - std::once_flag once_flag; - void (*finalize)(T &) = nullptr; - std::atomic_bool is_initialized{false}; - - call_once_storage() = default; - ~call_once_storage() override { - if (is_initialized) { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); - } else { - reinterpret_cast(storage)->~T(); - } - } - }; - 
call_once_storage(const call_once_storage &) = delete; - call_once_storage(call_once_storage &&) = delete; - call_once_storage &operator=(const call_once_storage &) = delete; - call_once_storage &operator=(call_once_storage &&) = delete; -}; - /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. @@ -310,8 +278,6 @@ struct internals { type_map native_enum_type_map; - std::unordered_map call_once_storage_map; - internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -337,12 +303,7 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() { - for (const auto &entry : call_once_storage_map) { - delete entry.second; - } - call_once_storage_map.clear(); - } + ~internals() = default; }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 68314c6f8d..2268ca3ac7 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -115,7 +115,45 @@ class gil_safe_call_once_and_store { // In this case, we should store the result per-interpreter instead of globally, because each // subinterpreter has its own separate state. The cached result may not shareable across // interpreters (e.g., imported modules and their members). 
-// + +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; + void (*finalize)(T &) = nullptr; + std::atomic_bool is_initialized{false}; + + call_once_storage() = default; + ~call_once_storage() override { + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } + } + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + +/// Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's state +/// dict with proper destructor to ensure cleanup when the interpreter is destroyed. +using call_once_storage_map_type = std::unordered_map; + +# define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" + // The life span of the stored result is the entire interpreter lifetime. An additional // `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. template @@ -129,35 +167,33 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. 
- detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - detail::call_once_storage *value = nullptr; - { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - const auto it = storage_map.find(key); - if (it != storage_map.end()) { - value = static_cast *>(it->second); - } else { - value = new detail::call_once_storage{}; - storage_map.emplace(key, value); - } + const void *const key = reinterpret_cast(this); + // There can be multiple threads going through here. + call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + auto &storage_map = *get_or_create_call_once_storage_map(); + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new call_once_storage{}; + storage_map.emplace(key, value); } - assert(value != nullptr); - std::call_once(value->once_flag, [&] { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - last_storage_ptr_ = reinterpret_cast(value->storage); - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); + is_initialized_by_atleast_one_interpreter_ = true; }); - // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. 
+ // All threads will observe `is_initialized_by_atleast_one_interpreter_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. return *this; @@ -167,12 +203,11 @@ class gil_safe_call_once_and_store { T &get_stored() { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { - detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto *value = static_cast *>(storage_map.at(key)); - result = last_storage_ptr_ = reinterpret_cast(value->storage); - }); + gil_scoped_acquire gil_acq; + const void *const key = reinterpret_cast(this); + auto &storage_map = *get_or_create_call_once_storage_map(); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); return *result; @@ -187,12 +222,43 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1; + && detail::get_num_interpreters_seen() == 1; + } + + static call_once_storage_map_type *get_or_create_call_once_storage_map() { + error_scope err_scope; + dict state_dict = detail::get_python_state_dict(); + auto storage_map_obj = reinterpret_steal( + detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); + call_once_storage_map_type *storage_map = nullptr; + if (storage_map_obj) { + void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_call_once_storage_map() FAILED"); + throw error_already_set(); + } + storage_map = reinterpret_cast(raw_ptr); + } else { + storage_map = new call_once_storage_map_type(); + // Create capsule with destructor to clean up the storage map when the interpreter + // 
shuts down + state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] + = capsule(storage_map, [](void *ptr) noexcept { + auto *map = reinterpret_cast(ptr); + for (const auto &entry : *map) { + delete entry.second; + } + delete map; + }); + } + return storage_map; } // No storage needed when subinterpreter support is enabled. - // The actual storage is stored in the per-interpreter state dict in - // `internals.call_once_storage_map`. + // The actual storage is stored in the per-interpreter state dict via + // `get_or_create_call_once_storage_map()`. // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. From b72cd4162baf14472f37c66144aca55df7c9fa74 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 11:22:54 +0800 Subject: [PATCH 15/60] Revert internal version bump --- include/pybind11/detail/class.h | 2 ++ include/pybind11/detail/internals.h | 13 ++++++++++--- include/pybind11/detail/type_caster_base.h | 10 ++++++++-- include/pybind11/pybind11.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 1cd9af0bd1..21e966cfea 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,12 +226,14 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } +#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 11a2ee4c92..5347511538 100644 --- 
a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,11 +39,11 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 12 +# define PYBIND11_INTERNALS_VERSION 11 #endif -#if PYBIND11_INTERNALS_VERSION < 12 -# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 11 +# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -242,12 +242,14 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif +#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. 
fast_type_map registered_types_cpp_fast; +#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -273,6 +275,9 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; +#if PYBIND11_INTERNALS_VERSION <= 11 + thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) +#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -353,6 +358,7 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; +#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -363,6 +369,7 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; +#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 21b7f0950e..b0c59e1138 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,26 +227,32 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. 
detail::type_info *type_info = nullptr; auto &internals = get_internals(); +#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; +#endif auto &types = internals.registered_types_cpp; +#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -#ifndef NDEBUG +# ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -#endif +# endif return fast_it->second; } +#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { +#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); +#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 8bd62c85c9..91b38d91ed 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,7 +1692,9 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; +#endif } PYBIND11_WARNING_PUSH @@ -2199,7 +2201,9 @@ class class_ : public detail::generic_type { type_info *const val = internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; +#endif } }); } From ac02a3208d4bd377059bb97bba4df5bb8f1b3923 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:07:48 +0800 Subject: [PATCH 16/60] Cleanup outdated 
comments --- include/pybind11/detail/internals.h | 35 +++++------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 5347511538..5ccd4d18e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -564,15 +564,6 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the - // main interpreter. - // - // For multi-interpreter support, the subinterpreters can be initialized concurrently, and - // the first time this function may not be called in the main interpreter. - // For example, a clean main interpreter that does not import any pybind11 module and then - // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a - // pybind11 module concurrently. if (get_num_interpreters_seen() > 1) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is @@ -589,29 +580,15 @@ class internals_pp_manager { return internals_p_tls(); } #endif - return get_pp_for_main_interpreter(); - } - - /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already - /// exist. May acquire the GIL. Will never return nullptr. - std::unique_ptr *get_pp_for_main_interpreter() { - if (!seen_main_interpreter_) { - // The first call to this function **MUST** be from the main interpreter. - // Here we **ASSUME** that the current thread is running in the main interpreter. - // The caller is responsible for ensuring this. 
- std::call_once(seen_main_interpreter_flag_, [&] { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); - seen_main_interpreter_ = true; - }); + if (!internals_singleton_pp_) { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } - // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -623,7 +600,6 @@ class internals_pp_manager { } void destroy() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -684,10 +660,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; - // Pointer to the singleton internals for the main interpreter + // Pointer-to-pointer to the singleton internals for the first seen interpreter (may not be the + // main interpreter) std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; - std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set` From ddb6dd4c73cc84b82373ba6c2e851dbecd19ad62 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:14 -0800 Subject: [PATCH 17/60] Move atomic_bool alias into pybind11::detail namespace The `using atomic_bool = ...` declaration was at global scope, polluting the global namespace. Move it into pybind11::detail to avoid potential conflicts with user code. 
--- include/pybind11/gil_safe_call_once.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2268ca3ac7..ffd147ad0b 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -11,13 +11,17 @@ #if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include <atomic> +#endif + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +namespace detail { +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) using atomic_bool = std::atomic_bool; #else using atomic_bool = bool; #endif - -PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +} // namespace detail // Use the `gil_safe_call_once_and_store` class below instead of the naive // @@ -108,7 +112,7 @@ class gil_safe_call_once_and_store { // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. - atomic_bool is_initialized_{false}; + detail::atomic_bool is_initialized_{false}; }; #else // Subinterpreter support is enabled. @@ -265,7 +269,7 @@ class gil_safe_call_once_and_store { T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). - atomic_bool is_initialized_by_atleast_one_interpreter_{false}; + detail::atomic_bool is_initialized_by_atleast_one_interpreter_{false}; }; #endif From 3fb52dff6a90385f278473487f8b56fd48d46598 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:30 -0800 Subject: [PATCH 18/60] Add explicit #include <unordered_map> for subinterpreter support The subinterpreter branch uses std::unordered_map but relied on transitive includes. Add an explicit include for robustness.
--- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index ffd147ad0b..20166ecf08 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -12,6 +12,9 @@ #if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include <atomic> #endif +#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include <unordered_map> +#endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) From 32deca43804bde28bf863cf9e0f9d0f3fbab69f9 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:49 -0800 Subject: [PATCH 19/60] Remove extraneous semicolon after destructor definition Style fix: remove trailing semicolon after ~call_once_storage() destructor body. --- include/pybind11/gil_safe_call_once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 20166ecf08..51e7f0c931 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -148,7 +148,7 @@ struct call_once_storage : call_once_storage_base { reinterpret_cast<T *>(storage)->~T(); } } - }; + } call_once_storage(const call_once_storage &) = delete; call_once_storage(call_once_storage &&) = delete; call_once_storage &operator=(const call_once_storage &) = delete; From a4d4d734863c4d413dc361893f72700e98703a94 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:57:11 -0800 Subject: [PATCH 20/60] Add comment explaining unused finalize parameter Clarify why the finalize callback parameter is intentionally ignored when subinterpreter support is disabled: the storage is process-global and leaked to avoid destructor calls after interpreter finalization.
--- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 51e7f0c931..6ac6af4aff 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -70,6 +70,9 @@ template class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. + // Note: The second parameter (finalize callback) is intentionally unused when subinterpreter + // support is disabled. In that case, storage is process-global and intentionally leaked to + // avoid calling destructors after the Python interpreter has been finalized. template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*)(T &) /*unused*/ = nullptr) { From 7cb30ceb070f8bc8682e245b4b9446f244664c20 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:57:36 -0800 Subject: [PATCH 21/60] Add comment explaining error_scope usage Clarify why error_scope is used: to preserve any existing Python error state that might be cleared or modified by dict_getitemstringref. --- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 6ac6af4aff..083018f950 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -236,6 +236,9 @@ class gil_safe_call_once_and_store { } static call_once_storage_map_type *get_or_create_call_once_storage_map() { + // Preserve any existing Python error state. dict_getitemstringref may clear + // errors or set new ones when the key is not found; we restore the original + // error state when this scope exits. 
error_scope err_scope; dict state_dict = detail::get_python_state_dict(); auto storage_map_obj = reinterpret_steal( From 7d3413944d5d2b2c8788c8bc3c5da99ea4711b5b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:58:18 -0800 Subject: [PATCH 22/60] Improve exception safety in get_or_create_call_once_storage_map() Use std::unique_ptr to hold the newly allocated storage map until the capsule is successfully created. This prevents a memory leak if capsule creation throws an exception. --- include/pybind11/gil_safe_call_once.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 083018f950..b68dd26e65 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -13,6 +13,7 @@ # include #endif #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include # include #endif @@ -254,17 +255,22 @@ class gil_safe_call_once_and_store { } storage_map = reinterpret_cast(raw_ptr); } else { - storage_map = new call_once_storage_map_type(); + // Use unique_ptr for exception safety: if capsule creation throws, + // the map is automatically deleted. + auto storage_map_ptr = std::unique_ptr( + new call_once_storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] - = capsule(storage_map, [](void *ptr) noexcept { + = capsule(storage_map_ptr.get(), [](void *ptr) noexcept { auto *map = reinterpret_cast(ptr); for (const auto &entry : *map) { delete entry.second; } delete map; }); + // Capsule now owns the storage map, release from unique_ptr + storage_map = storage_map_ptr.release(); } return storage_map; } From 78e39452d13fa1106f60ff8d6d6be82616444f9f Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:32:42 -0800 Subject: [PATCH 23/60] Add timeout-minutes: 3 to cpptest workflow steps Add a 3-minute timeout to all C++ test (cpptest) steps across all platforms to detect hangs early. This uses GitHub Actions' built-in timeout-minutes property which works on Linux, macOS, and Windows. --- .github/workflows/ci.yml | 15 +++++++++++++++ .github/workflows/reusable-standard.yml | 1 + .github/workflows/upstream.yml | 2 ++ 3 files changed, 18 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5a200e32e..4800b9c25c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -229,6 +229,7 @@ jobs: run: cmake --build . --target pytest - name: Compiled tests + timeout-minutes: 3 run: cmake --build . --target cpptest - name: Interface test @@ -334,6 +335,7 @@ jobs: run: cmake --build --preset default --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build --preset default --target cpptest - name: Visibility test @@ -393,6 +395,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -516,6 +519,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -570,6 +574,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -652,6 +657,7 @@ jobs: cmake --build build-11 --target check - name: C++ tests C++11 + timeout-minutes: 3 run: | set +e; source /opt/intel/oneapi/setvars.sh; set -e cmake --build build-11 --target cpptest @@ -689,6 +695,7 @@ jobs: cmake --build build-17 --target check - name: C++ tests C++17 + timeout-minutes: 3 run: | set +e; source /opt/intel/oneapi/setvars.sh; set -e cmake --build build-17 --target cpptest @@ -760,6 +767,7 @@ jobs: run: cmake --build 
build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -1000,6 +1008,7 @@ jobs: run: cmake --build build --target pytest - name: C++20 tests + timeout-minutes: 3 run: cmake --build build --target cpptest -j 2 - name: Interface test C++20 @@ -1076,6 +1085,7 @@ jobs: run: cmake --build build --target pytest -j 2 - name: C++11 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build --target cpptest -j 2 - name: Interface test C++11 @@ -1100,6 +1110,7 @@ jobs: run: cmake --build build2 --target pytest -j 2 - name: C++14 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build2 --target cpptest -j 2 - name: Interface test C++14 @@ -1124,6 +1135,7 @@ jobs: run: cmake --build build3 --target pytest -j 2 - name: C++17 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build3 --target cpptest -j 2 - name: Interface test C++17 @@ -1195,6 +1207,7 @@ jobs: run: cmake --build . --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: cmake --build . --target cpptest -j 2 - name: Interface test @@ -1257,6 +1270,7 @@ jobs: run: cmake --build . --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: cmake --build . 
--target cpptest -j 2 - name: Interface test @@ -1329,6 +1343,7 @@ jobs: run: cmake --build build --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: PYTHONHOME=/clangarm64 PYTHONPATH=/clangarm64 cmake --build build --target cpptest -j 2 - name: Interface test diff --git a/.github/workflows/reusable-standard.yml b/.github/workflows/reusable-standard.yml index 96b14bdfba..56d92e2779 100644 --- a/.github/workflows/reusable-standard.yml +++ b/.github/workflows/reusable-standard.yml @@ -83,6 +83,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 15ede7a856..890ae0b6fd 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -66,6 +66,7 @@ jobs: run: cmake --build build11 --target pytest -j 2 - name: C++11 tests + timeout-minutes: 3 run: cmake --build build11 --target cpptest -j 2 - name: Interface test C++11 @@ -87,6 +88,7 @@ jobs: run: cmake --build build17 --target pytest - name: C++17 tests + timeout-minutes: 3 run: cmake --build build17 --target cpptest # Third build - C++17 mode with unstable ABI From 1014ee403f6bd65f55ec5e62a5d969dd6e286d3c Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:38:00 -0800 Subject: [PATCH 24/60] Add progress reporter for test_with_catch Catch2 runner Add a custom Catch2 streaming reporter that prints one line per test case as it starts and ends, with immediate flushing to keep CI logs current. This makes it easy to see where the embedded/interpreter tests are spending time and to pinpoint which test case is stuck when builds hang (e.g., free-threading issues). 
The reporter: - Prints "[ RUN ]" when each test starts - Prints "[ OK ]" or "[ FAILED ]" when each test ends - Prints the Python version once at the start via Py_GetVersion() - Uses StreamingReporterBase for immediate output (not buffered) - Is set as the default reporter via CATCH_CONFIG_DEFAULT_REPORTER This approach gives visibility into all tests without changing their behavior, turning otherwise opaque 90-minute CI timeouts into locatable issues in the Catch output. --- tests/test_with_catch/catch.cpp | 58 +++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 5bd8b3880e..5dbc01f677 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -13,10 +13,68 @@ PYBIND11_WARNING_DISABLE_MSVC(4996) #endif #define CATCH_CONFIG_RUNNER +#define CATCH_CONFIG_DEFAULT_REPORTER "progress" #include namespace py = pybind11; +// Simple progress reporter that prints a line per test case. +namespace { + +class ProgressReporter : public Catch::StreamingReporterBase { +public: + using StreamingReporterBase::StreamingReporterBase; + + static std::string getDescription() { return "Simple progress reporter (one line per test)"; } + + void testCaseStarting(Catch::TestCaseInfo const &testInfo) override { + print_python_version_once(); + auto &os = Catch::cout(); + os << "[ RUN ] " << testInfo.name << '\n'; + os.flush(); + } + + void testCaseEnded(Catch::TestCaseStats const &stats) override { + bool failed = stats.totals.assertions.failed > 0; + auto &os = Catch::cout(); + os << (failed ? 
"[ FAILED ] " : "[ OK ] ") << stats.testInfo.name << '\n'; + os.flush(); + } + + void noMatchingTestCases(std::string const &spec) override { + auto &os = Catch::cout(); + os << "[ NO TEST ] no matching test cases for spec: " << spec << '\n'; + os.flush(); + } + + void reportInvalidArguments(std::string const &arg) override { + auto &os = Catch::cout(); + os << "[ ERROR ] invalid Catch2 arguments: " << arg << '\n'; + os.flush(); + } + + void assertionStarting(Catch::AssertionInfo const &) override {} + + bool assertionEnded(Catch::AssertionStats const &) override { return false; } + +private: + void print_python_version_once() { + if (printed_) { + return; + } + printed_ = true; + auto &os = Catch::cout(); + os << "[ PYTHON ] " << Py_GetVersion() << '\n'; + os.flush(); + } + + bool printed_ = false; +}; + +} // namespace + +CATCH_REGISTER_REPORTER("progress", ProgressReporter) + int main(int argc, char *argv[]) { // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); From 21d0dc5e5919449a57d30729079b27f87101e7c7 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:40:50 -0800 Subject: [PATCH 25/60] clang-format auto-fix (overlooked before) --- include/pybind11/gil_safe_call_once.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index b68dd26e65..a6450cf7d6 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -257,8 +257,8 @@ class gil_safe_call_once_and_store { } else { // Use unique_ptr for exception safety: if capsule creation throws, // the map is automatically deleted. 
- auto storage_map_ptr = std::unique_ptr( - new call_once_storage_map_type()); + auto storage_map_ptr + = std::unique_ptr(new call_once_storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] From e1b1b1bdd6519bbdfb1d5bcb4669b97fd8cab4dd Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 18:59:44 -0800 Subject: [PATCH 26/60] Disable "Move Subinterpreter" test on free-threaded Python 3.14+ This test hangs in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread than it was created on. The hang was observed: - Intermittently on macOS with Python 3.14.0t - Predictably on macOS, Ubuntu, and Windows with Python 3.14.1t and 3.14.2t Root cause analysis points to an interaction between pybind11's subinterpreter creation code and CPython's free-threaded runtime, specifically around PyThreadState_Swap() after PyThreadState_DeleteCurrent(). See detailed analysis: https://github.com/pybind/pybind11/pull/5933 --- tests/test_with_catch/test_subinterpreter.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 3c7c35be19..8f30c8f441 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -90,7 +90,11 @@ TEST_CASE("Single Subinterpreter") { unsafe_reset_internals_for_single_interpreter(); } -# if PY_VERSION_HEX >= 0x030D0000 +// "Move Subinterpreter" test is disabled on free-threaded Python 3.14+ due to a hang +// in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread +// than it was created on. 
See: https://github.com/pybind/pybind11/pull/5933 +# if PY_VERSION_HEX >= 0x030D0000 \ + && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) TEST_CASE("Move Subinterpreter") { std::unique_ptr sub(new py::subinterpreter(py::subinterpreter::create())); From 89cae6d6b9c5398d1895572c6be455a3afb604cf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Dec 2025 03:01:50 +0000 Subject: [PATCH 27/60] style: pre-commit fixes --- tests/test_with_catch/test_subinterpreter.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 8f30c8f441..a4cda7bc6e 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -93,8 +93,7 @@ TEST_CASE("Single Subinterpreter") { // "Move Subinterpreter" test is disabled on free-threaded Python 3.14+ due to a hang // in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread // than it was created on. See: https://github.com/pybind/pybind11/pull/5933 -# if PY_VERSION_HEX >= 0x030D0000 \ - && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) +# if PY_VERSION_HEX >= 0x030D0000 && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) TEST_CASE("Move Subinterpreter") { std::unique_ptr sub(new py::subinterpreter(py::subinterpreter::create())); From a090637483f6bc5782a78f529df4d5315c95dc76 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 19:10:32 -0800 Subject: [PATCH 28/60] Add test for gil_safe_call_once_and_store per-interpreter isolation This test verifies that gil_safe_call_once_and_store provides separate storage for each interpreter when subinterpreter support is enabled. 
The test caches the interpreter ID in the main interpreter, then creates a subinterpreter and verifies it gets its own cached value (not the main interpreter's). Without per-interpreter storage, the subinterpreter would incorrectly see the main interpreter's cached object. --- tests/test_with_catch/test_subinterpreter.cpp | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index a4cda7bc6e..172bfde284 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -1,5 +1,6 @@ #include #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include # include // Silence MSVC C++17 deprecation warning from Catch regarding std::uncaught_exceptions (up to @@ -302,6 +303,63 @@ TEST_CASE("Multiple Subinterpreters") { unsafe_reset_internals_for_single_interpreter(); } +// Test that gil_safe_call_once_and_store provides per-interpreter storage. +// Without the per-interpreter storage fix, the subinterpreter would see the value +// cached by the main interpreter, which is invalid (different interpreter's object). +TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { + unsafe_reset_internals_for_single_interpreter(); + + // This static simulates a typical usage pattern where a module caches + // an imported object using gil_safe_call_once_and_store. 
+ PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; + + // Get the interpreter ID in the main interpreter + auto main_interp_id = PyInterpreterState_GetID(PyInterpreterState_Get()); + + // Store a value in the main interpreter - we'll store the interpreter ID as a Python int + auto &main_value = storage + .call_once_and_store_result([]() { + return py::int_(PyInterpreterState_GetID(PyInterpreterState_Get())); + }) + .get_stored(); + REQUIRE(main_value.cast() == main_interp_id); + + int64_t sub_interp_id = -1; + int64_t sub_cached_value = -1; + + // Create a subinterpreter and check that it gets its own storage + { + py::scoped_subinterpreter ssi; + + sub_interp_id = PyInterpreterState_GetID(PyInterpreterState_Get()); + REQUIRE(sub_interp_id != main_interp_id); + + // Access the same static storage from the subinterpreter. + // With per-interpreter storage, this should call the lambda again + // and cache a NEW value for this interpreter. + // Without per-interpreter storage, this would return main's cached value. + auto &sub_value + = storage + .call_once_and_store_result([]() { + return py::int_(PyInterpreterState_GetID(PyInterpreterState_Get())); + }) + .get_stored(); + + sub_cached_value = sub_value.cast(); + + // The cached value should be the SUBINTERPRETER's ID, not the main interpreter's. + // This would fail without per-interpreter storage. + REQUIRE(sub_cached_value == sub_interp_id); + REQUIRE(sub_cached_value != main_interp_id); + } + + // Back in main interpreter, verify main's value is unchanged + auto &main_value_after = storage.get_stored(); + REQUIRE(main_value_after.cast() == main_interp_id); + + unsafe_reset_internals_for_single_interpreter(); +} + # ifdef Py_MOD_PER_INTERPRETER_GIL_SUPPORTED TEST_CASE("Per-Subinterpreter GIL") { auto main_int From cb5e7d751abece7eaed1ea826e9642656fa35afa Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 19:27:08 -0800 Subject: [PATCH 29/60] Add STARTING/DONE timestamps to test_with_catch output Print UTC timestamps at the beginning and end of the test run to make it immediately clear when tests started and whether they ran to completion. The DONE message includes the Catch session result value. Example output: [ STARTING ] 2025-12-21 03:23:20.497Z [ PYTHON ] 3.14.2 ... [ RUN ] Threads [ OK ] Threads [ DONE ] 2025-12-21 03:23:20.512Z (result 0) --- tests/test_with_catch/catch.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 5dbc01f677..46edbd2941 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -3,6 +3,11 @@ #include +#include +#include +#include +#include + // Silence MSVC C++17 deprecation warning from Catch regarding std::uncaught_exceptions (up to // catch 2.0.1; this should be fixed in the next catch release after 2.0.1). PYBIND11_WARNING_DISABLE_MSVC(4996) @@ -75,6 +80,28 @@ class ProgressReporter : public Catch::StreamingReporterBase { CATCH_REGISTER_REPORTER("progress", ProgressReporter) +namespace { + +std::string get_utc_timestamp() { + auto now = std::chrono::system_clock::now(); + auto time_t_now = std::chrono::system_clock::to_time_t(now); + auto ms = std::chrono::duration_cast(now.time_since_epoch()) % 1000; + + std::tm utc_tm{}; +#if defined(_WIN32) + gmtime_s(&utc_tm, &time_t_now); +#else + gmtime_r(&time_t_now, &utc_tm); +#endif + + std::ostringstream oss; + oss << std::put_time(&utc_tm, "%Y-%m-%d %H:%M:%S") << '.' 
<< std::setfill('0') << std::setw(3) + << ms.count() << 'Z'; + return oss.str(); +} + +} // namespace + int main(int argc, char *argv[]) { // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); @@ -93,9 +120,14 @@ int main(int argc, char *argv[]) { setenv("PYTHONPATH", updated_pythonpath.c_str(), /*replace=*/1); #endif + std::cout << "[ STARTING ] " << get_utc_timestamp() << std::endl; + py::scoped_interpreter guard{}; auto result = Catch::Session().run(argc, argv); + std::cout << "[ DONE ] " << get_utc_timestamp() << " (result " << result << ")" + << std::endl; + return result < 0xff ? result : 0xff; } From 0f8f32a92ac4f6c6e233e17104302aa61281aee5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 19:37:53 -0800 Subject: [PATCH 30/60] Disable stdout buffering in test_with_catch Ensure test output appears immediately in CI logs by disabling stdout buffering. Without this, output may be lost if the process is killed by a timeout, making it difficult to diagnose which test was hanging. --- tests/test_with_catch/catch.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 46edbd2941..80cbf1bb98 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -103,6 +104,11 @@ std::string get_utc_timestamp() { } // namespace int main(int argc, char *argv[]) { + // Disable stdout buffering to ensure output appears immediately in CI logs. + // Without this, output may be lost if the process is killed by a timeout. 
+ std::cout.setf(std::ios::unitbuf); + setvbuf(stdout, nullptr, _IONBF, 0); + // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); const char *preexisting_pythonpath = getenv("PYTHONPATH"); From a3abdeea8976f0a11beb6dc3ed9c1f52a9f5f45a Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 19:40:43 -0800 Subject: [PATCH 31/60] EXPERIMENT: Re-enable hanging test to verify CI log buffering fix This is a temporary commit to verify that the unbuffered stdout fix makes the hanging test visible in CI logs. REVERT THIS COMMIT after confirming the output appears. --- tests/test_with_catch/test_subinterpreter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 172bfde284..79182bff5e 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -94,7 +94,9 @@ TEST_CASE("Single Subinterpreter") { // "Move Subinterpreter" test is disabled on free-threaded Python 3.14+ due to a hang // in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread // than it was created on. See: https://github.com/pybind/pybind11/pull/5933 -# if PY_VERSION_HEX >= 0x030D0000 && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) +// EXPERIMENT: Temporarily re-enable to test CI log buffering fix - REVERT THIS +# if PY_VERSION_HEX >= 0x030D0000 \ + && !(0 && PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) TEST_CASE("Move Subinterpreter") { std::unique_ptr sub(new py::subinterpreter(py::subinterpreter::create())); From d6f2a7f11504caea2734cac7f4dd55d2bf8f809a Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 20:54:10 -0800 Subject: [PATCH 32/60] Revert "Disable stdout buffering in test_with_catch" This reverts commit 0f8f32a92ac4f6c6e233e17104302aa61281aee5. --- tests/test_with_catch/catch.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 80cbf1bb98..46edbd2941 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -104,11 +103,6 @@ std::string get_utc_timestamp() { } // namespace int main(int argc, char *argv[]) { - // Disable stdout buffering to ensure output appears immediately in CI logs. - // Without this, output may be lost if the process is killed by a timeout. - std::cout.setf(std::ios::unitbuf); - setvbuf(stdout, nullptr, _IONBF, 0); - // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); const char *preexisting_pythonpath = getenv("PYTHONPATH"); From 9b70460b4751046dbfbc98bf61e63f620284acbd Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 20:54:16 -0800 Subject: [PATCH 33/60] Use USES_TERMINAL for cpptest to show output immediately Ninja buffers subprocess output until completion. When a test hangs, the output is never shown, making it impossible to diagnose which test is hanging. USES_TERMINAL gives the command direct terminal access, bypassing ninja's buffering. This explains why Windows CI showed test progress but Linux/macOS did not - Windows uses MSBuild which doesn't buffer the same way. 
--- tests/test_with_catch/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_with_catch/CMakeLists.txt b/tests/test_with_catch/CMakeLists.txt index 136537e67f..951a9d21f8 100644 --- a/tests/test_with_catch/CMakeLists.txt +++ b/tests/test_with_catch/CMakeLists.txt @@ -47,7 +47,8 @@ add_custom_target( cpptest COMMAND "$" DEPENDS test_with_catch - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + USES_TERMINAL) pybind11_add_module(external_module THIN_LTO external_module.cpp) set_target_properties(external_module PROPERTIES LIBRARY_OUTPUT_DIRECTORY From 8951004057e5aefbd7aee7752737b2b7fab01418 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 20:56:56 -0800 Subject: [PATCH 34/60] Fix clang-tidy performance-avoid-endl warning Use '\n' instead of std::endl since USES_TERMINAL now handles output buffering at the CMake level. --- tests/test_with_catch/catch.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 46edbd2941..3965873bed 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -120,14 +120,15 @@ int main(int argc, char *argv[]) { setenv("PYTHONPATH", updated_pythonpath.c_str(), /*replace=*/1); #endif - std::cout << "[ STARTING ] " << get_utc_timestamp() << std::endl; + std::cout << "[ STARTING ] " << get_utc_timestamp() << '\n'; + std::cout.flush(); py::scoped_interpreter guard{}; auto result = Catch::Session().run(argc, argv); - std::cout << "[ DONE ] " << get_utc_timestamp() << " (result " << result << ")" - << std::endl; + std::cout << "[ DONE ] " << get_utc_timestamp() << " (result " << result << ")\n"; + std::cout.flush(); return result < 0xff ? result : 0xff; } From c4cbe7332e96e6bb6a2bce9377084aafc7fcc457 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 21:27:40 -0800 Subject: [PATCH 35/60] Add SIGTERM handler to show when test is killed by timeout When a test hangs and is killed by `timeout`, Catch2 marks it as failed but the process exits before printing [ DONE ]. This made it unclear whether the test failed normally or was terminated. The signal handler prints a clear message when SIGTERM is received, making timeout-related failures obvious in CI logs. --- tests/test_with_catch/catch.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 3965873bed..ecff800c1d 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -4,10 +4,16 @@ #include #include +#include +#include #include #include #include +#ifndef _WIN32 +# include +#endif + // Silence MSVC C++17 deprecation warning from Catch regarding std::uncaught_exceptions (up to // catch 2.0.1; this should be fixed in the next catch release after 2.0.1). PYBIND11_WARNING_DISABLE_MSVC(4996) @@ -100,9 +106,26 @@ std::string get_utc_timestamp() { return oss.str(); } +#ifndef _WIN32 +// Signal handler to print a message when the process is terminated. +// Uses only async-signal-safe functions. 
+void termination_signal_handler(int sig) { + const char *msg = "[ SIGNAL ] Process received SIGTERM\n"; + // write() is async-signal-safe, unlike std::cout + (void) write(STDOUT_FILENO, msg, strlen(msg)); + // Re-raise with default handler to get proper exit status + std::signal(sig, SIG_DFL); + std::raise(sig); +} +#endif + } // namespace int main(int argc, char *argv[]) { +#ifndef _WIN32 + std::signal(SIGTERM, termination_signal_handler); +#endif + // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); const char *preexisting_pythonpath = getenv("PYTHONPATH"); From f330a794fb52d4cc6c9ec5b6203670e52f4ca31b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 21:50:35 -0800 Subject: [PATCH 36/60] Fix typo: atleast -> at_least --- include/pybind11/gil_safe_call_once.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a6450cf7d6..8af42fe154 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -202,9 +202,9 @@ class gil_safe_call_once_and_store { value->finalize = finalize_fn; value->is_initialized = true; last_storage_ptr_ = reinterpret_cast(value->storage); - is_initialized_by_atleast_one_interpreter_ = true; + is_initialized_by_at_least_one_interpreter_ = true; }); - // All threads will observe `is_initialized_by_atleast_one_interpreter_` as true here. + // All threads will observe `is_initialized_by_at_least_one_interpreter_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. 
return *this; @@ -232,7 +232,7 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { - return is_initialized_by_atleast_one_interpreter_ + return is_initialized_by_at_least_one_interpreter_ && detail::get_num_interpreters_seen() == 1; } @@ -284,7 +284,7 @@ class gil_safe_call_once_and_store { T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). - detail::atomic_bool is_initialized_by_atleast_one_interpreter_{false}; + detail::atomic_bool is_initialized_by_at_least_one_interpreter_{false}; }; #endif From 6c1ccb98f1f51603e818fef9d9aefa5b60d9da1a Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 21:50:40 -0800 Subject: [PATCH 37/60] Fix GCC warn_unused_result error for write() in signal handler Assign the return value to a variable to satisfy GCC's warn_unused_result attribute, then cast to void to suppress unused variable warning. --- tests/test_with_catch/catch.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index ecff800c1d..89fec39c37 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -112,7 +112,8 @@ std::string get_utc_timestamp() { void termination_signal_handler(int sig) { const char *msg = "[ SIGNAL ] Process received SIGTERM\n"; // write() is async-signal-safe, unlike std::cout - (void) write(STDOUT_FILENO, msg, strlen(msg)); + ssize_t written = write(STDOUT_FILENO, msg, strlen(msg)); + (void) written; // suppress "unused variable" warnings // Re-raise with default handler to get proper exit status std::signal(sig, SIG_DFL); std::raise(sig); From 3c01ff38b0666e3039a1e8dd8fb55239ecf024f6 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 22:02:45 -0800 Subject: [PATCH 38/60] Add USES_TERMINAL to other C++ test targets Apply the same ninja output buffering fix to test_cross_module_rtti and test_pure_cpp targets. Also add explanatory comments to all USES_TERMINAL usages. --- tests/pure_cpp/CMakeLists.txt | 4 +++- tests/test_cross_module_rtti/CMakeLists.txt | 4 +++- tests/test_with_catch/CMakeLists.txt | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/pure_cpp/CMakeLists.txt b/tests/pure_cpp/CMakeLists.txt index 1150cb405e..d2757db766 100644 --- a/tests/pure_cpp/CMakeLists.txt +++ b/tests/pure_cpp/CMakeLists.txt @@ -15,6 +15,8 @@ target_link_libraries(smart_holder_poc_test PRIVATE pybind11::headers Catch2::Ca add_custom_target( test_pure_cpp COMMAND "$" - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + USES_TERMINAL # Ensures output is shown immediately (not buffered and possibly lost on crash) +) add_dependencies(check test_pure_cpp) diff --git a/tests/test_cross_module_rtti/CMakeLists.txt b/tests/test_cross_module_rtti/CMakeLists.txt index 97d2c780cb..c9b95bfba1 100644 --- a/tests/test_cross_module_rtti/CMakeLists.txt +++ b/tests/test_cross_module_rtti/CMakeLists.txt @@ -60,7 +60,9 @@ add_custom_target( test_cross_module_rtti COMMAND "$" DEPENDS test_cross_module_rtti_main - WORKING_DIRECTORY "$") + WORKING_DIRECTORY "$" + USES_TERMINAL # Ensures output is shown immediately (not buffered and possibly lost on crash) +) set_target_properties(test_cross_module_rtti_bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/tests/test_with_catch/CMakeLists.txt b/tests/test_with_catch/CMakeLists.txt index 951a9d21f8..e6a9f67aa7 100644 --- a/tests/test_with_catch/CMakeLists.txt +++ b/tests/test_with_catch/CMakeLists.txt @@ -48,7 +48,8 @@ add_custom_target( COMMAND "$" DEPENDS test_with_catch WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - USES_TERMINAL) + 
USES_TERMINAL # Ensures output is shown immediately (not buffered and possibly lost on crash) +) pybind11_add_module(external_module THIN_LTO external_module.cpp) set_target_properties(external_module PROPERTIES LIBRARY_OUTPUT_DIRECTORY From 9e9843db2e664864d1ca0de2b08d7249a087f861 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 22:12:03 -0800 Subject: [PATCH 39/60] Revert "EXPERIMENT: Re-enable hanging test to verify CI log buffering fix" This reverts commit a3abdeea8976f0a11beb6dc3ed9c1f52a9f5f45a. --- tests/test_with_catch/test_subinterpreter.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 79182bff5e..172bfde284 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -94,9 +94,7 @@ TEST_CASE("Single Subinterpreter") { // "Move Subinterpreter" test is disabled on free-threaded Python 3.14+ due to a hang // in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread // than it was created on. See: https://github.com/pybind/pybind11/pull/5933 -// EXPERIMENT: Temporarily re-enable to test CI log buffering fix - REVERT THIS -# if PY_VERSION_HEX >= 0x030D0000 \ - && !(0 && PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) +# if PY_VERSION_HEX >= 0x030D0000 && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) TEST_CASE("Move Subinterpreter") { std::unique_ptr sub(new py::subinterpreter(py::subinterpreter::create())); From e7c26485f8d9fd4fce07d917aaf0f08b989a38eb Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 22:25:24 -0800 Subject: [PATCH 40/60] Update comment to reference PR #5940 for Move Subinterpreter fix --- tests/test_with_catch/test_subinterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 172bfde284..0986eeb87e 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -93,7 +93,7 @@ TEST_CASE("Single Subinterpreter") { // "Move Subinterpreter" test is disabled on free-threaded Python 3.14+ due to a hang // in Py_EndInterpreter() when the subinterpreter is destroyed from a different thread -// than it was created on. See: https://github.com/pybind/pybind11/pull/5933 +// than it was created on. See: https://github.com/pybind/pybind11/pull/5940 # if PY_VERSION_HEX >= 0x030D0000 && !(PY_VERSION_HEX >= 0x030E0000 && defined(Py_GIL_DISABLED)) TEST_CASE("Move Subinterpreter") { std::unique_ptr sub(new py::subinterpreter(py::subinterpreter::create())); From 58c08ac4329f9430d5094cfdcc1d6960f76ffb18 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 21 Dec 2025 16:31:59 +0800 Subject: [PATCH 41/60] Add alias `interpid_t = std::int64_t` --- include/pybind11/subinterpreter.h | 5 ++++- tests/test_with_catch/test_subinterpreter.cpp | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index aaf5204570..f7eafe0eaf 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -13,6 +13,7 @@ #include "detail/internals.h" #include "gil.h" +#include #include #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT @@ -30,6 +31,8 @@ inline PyInterpreterState *get_interpreter_state_unchecked() { } PYBIND11_NAMESPACE_END(detail) +using interpid_t = std::int64_t; + class subinterpreter; /// Activate the subinterpreter and acquire its GIL, while also 
releasing any GIL and interpreter @@ -214,7 +217,7 @@ class subinterpreter { } /// Get the numerical identifier for the sub-interpreter - int64_t id() const { + interpid_t id() const { if (istate_ != nullptr) { return PyInterpreterState_GetID(istate_); } diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 0986eeb87e..2821c52055 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -322,10 +322,10 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { return py::int_(PyInterpreterState_GetID(PyInterpreterState_Get())); }) .get_stored(); - REQUIRE(main_value.cast() == main_interp_id); + REQUIRE(main_value.cast() == main_interp_id); - int64_t sub_interp_id = -1; - int64_t sub_cached_value = -1; + py::interpid_t sub_interp_id = -1; + py::interpid_t sub_cached_value = -1; // Create a subinterpreter and check that it gets its own storage { @@ -345,7 +345,7 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { }) .get_stored(); - sub_cached_value = sub_value.cast(); + sub_cached_value = sub_value.cast(); // The cached value should be the SUBINTERPRETER's ID, not the main interpreter's. // This would fail without per-interpreter storage. 
@@ -355,7 +355,7 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { // Back in main interpreter, verify main's value is unchanged auto &main_value_after = storage.get_stored(); - REQUIRE(main_value_after.cast() == main_interp_id); + REQUIRE(main_value_after.cast() == main_interp_id); unsafe_reset_internals_for_single_interpreter(); } From 305a293b1c41d9726492d7ae14d167005dbb353b Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 21 Dec 2025 16:46:39 +0800 Subject: [PATCH 42/60] Add isolation and gc test for `gil_safe_call_once_and_store` --- tests/test_with_catch/test_subinterpreter.cpp | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 2821c52055..73afe04644 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -40,6 +40,30 @@ void unsafe_reset_internals_for_single_interpreter() { py::detail::get_local_internals(); } +py::object &get_dict_type_object() { + PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; + return storage + .call_once_and_store_result( + []() -> py::object { return py::module_::import("builtins").attr("dict"); }) + .get_stored(); +} + +py::object &get_ordered_dict_type_object() { + PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; + return storage + .call_once_and_store_result( + []() -> py::object { return py::module_::import("collections").attr("OrderedDict"); }) + .get_stored(); +} + +py::object &get_default_dict_type_object() { + PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; + return storage + .call_once_and_store_result( + []() -> py::object { return py::module_::import("collections").attr("defaultdict"); }) + .get_stored(); +} + TEST_CASE("Single Subinterpreter") { unsafe_reset_internals_for_single_interpreter(); @@ -324,9 +348,15 @@ TEST_CASE("gil_safe_call_once_and_store 
per-interpreter isolation") { .get_stored(); REQUIRE(main_value.cast() == main_interp_id); + py::object dict_type = get_dict_type_object(); + py::object ordered_dict_type = get_ordered_dict_type_object(); + py::object default_dict_type = get_default_dict_type_object(); + py::interpid_t sub_interp_id = -1; py::interpid_t sub_cached_value = -1; + bool sub_default_dict_type_destroyed = false; + // Create a subinterpreter and check that it gets its own storage { py::scoped_subinterpreter ssi; @@ -351,12 +381,46 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { // This would fail without per-interpreter storage. REQUIRE(sub_cached_value == sub_interp_id); REQUIRE(sub_cached_value != main_interp_id); + + py::object sub_dict_type = get_dict_type_object(); + py::object sub_ordered_dict_type = get_ordered_dict_type_object(); + py::object sub_default_dict_type = get_default_dict_type_object(); + + // Verify that the subinterpreter has its own cached type objects. + // For static types, they should be the same object across interpreters. + // See also: https://docs.python.org/3/c-api/typeobj.html#static-types + REQUIRE(sub_dict_type.is(dict_type)); // dict is a static type + REQUIRE(sub_ordered_dict_type.is(ordered_dict_type)); // OrderedDict is a static type + // For heap types, they are dynamically created per-interpreter. + // See also: https://docs.python.org/3/c-api/typeobj.html#heap-types + REQUIRE_FALSE(sub_default_dict_type.is(default_dict_type)); // defaultdict is a heap type + + // Set up a weakref callback to detect when the subinterpreter's cached default_dict_type + // is destroyed so the gil_safe_call_once_and_store storage is not leaked when the + // subinterpreter is shutdown. 
+ (void) py::weakref(sub_default_dict_type, + py::cpp_function([&](py::handle weakref) -> void { + sub_default_dict_type_destroyed = true; + weakref.dec_ref(); + })) + .release(); } // Back in main interpreter, verify main's value is unchanged auto &main_value_after = storage.get_stored(); REQUIRE(main_value_after.cast() == main_interp_id); + // Verify that the types cached in main are unchanged + py::object dict_type_after = get_dict_type_object(); + py::object ordered_dict_type_after = get_ordered_dict_type_object(); + py::object default_dict_type_after = get_default_dict_type_object(); + REQUIRE(dict_type_after.is(dict_type)); + REQUIRE(ordered_dict_type_after.is(ordered_dict_type)); + REQUIRE(default_dict_type_after.is(default_dict_type)); + + // Verify that the subinterpreter's cached default_dict_type was destroyed + REQUIRE(sub_default_dict_type_destroyed); + unsafe_reset_internals_for_single_interpreter(); } From f6bba0fc6396a4c9852a72de25d5344cd40d67f7 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 21 Dec 2025 18:48:36 +0800 Subject: [PATCH 43/60] Add thread local cache for gil_safe_call_once_and_store --- include/pybind11/gil_safe_call_once.h | 70 ++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 8af42fe154..12087b23da 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -174,7 +174,7 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { - if (!is_last_storage_valid()) { + if (!is_last_storage_tls_valid()) { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. 
@@ -201,10 +201,10 @@ class gil_safe_call_once_and_store { ::new (value->storage) T(fn()); value->finalize = finalize_fn; value->is_initialized = true; - last_storage_ptr_ = reinterpret_cast(value->storage); is_initialized_by_at_least_one_interpreter_ = true; }); // All threads will observe `is_initialized_by_at_least_one_interpreter_` as true here. + update_storage_tls_cache(reinterpret_cast(value->storage)); } // Intentionally not returning `T &` to ensure the calling code is self-documenting. return *this; @@ -212,13 +212,14 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { - T *result = last_storage_ptr_; - if (!is_last_storage_valid()) { + T *result = get_storage_tls_cache(); + if (!is_last_storage_tls_valid()) { gil_scoped_acquire gil_acq; const void *const key = reinterpret_cast(this); auto &storage_map = *get_or_create_call_once_storage_map(); auto *value = static_cast *>(storage_map.at(key)); - result = last_storage_ptr_ = reinterpret_cast(value->storage); + result = reinterpret_cast(value->storage); + update_storage_tls_cache(result); } assert(result != nullptr); return *result; @@ -231,9 +232,55 @@ class gil_safe_call_once_and_store { PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: - bool is_last_storage_valid() const { - return is_initialized_by_at_least_one_interpreter_ - && detail::get_num_interpreters_seen() == 1; + // Fast local cache to avoid repeated lookups when the interpreter has not changed on the + // current thread. Similar to `internals_pp_manager::{internals_p_tls,last_istate_tls}`. 
+ static T *&last_storage_ptr_tls() { + static thread_local T *last_storage_ptr = nullptr; + return last_storage_ptr; + } + + static PyInterpreterState *&last_istate_tls() { + static thread_local PyInterpreterState *last_istate = nullptr; + return last_istate; + } + + // See also: internals_pp_manager::get_pp() + T *get_storage_tls_cache() const { + // The caller should be aware that the cached pointer may be invalid. + // It can only be used after checking `is_last_storage_tls_valid()`. + if (detail::get_num_interpreters_seen() > 1) { + return last_storage_ptr_tls(); + } + return last_storage_ptr_singleton_; + } + + void update_storage_tls_cache(T *ptr) { + gil_scoped_acquire_simple gil; + if (detail::get_num_interpreters_seen() > 1) { + auto *tstate = detail::get_thread_state_unchecked(); + if (tstate) { + last_istate_tls() = tstate->interp; + } + last_storage_ptr_tls() = ptr; + } else { + last_storage_ptr_singleton_ = ptr; + } + } + + bool is_last_storage_tls_valid() const { + if (!is_initialized_by_at_least_one_interpreter_) { + return false; + } + if (detail::get_num_interpreters_seen() > 1) { + // Whenever the interpreter changes on the current thread we need to invalidate the + // cached storage pointer so that it can be pulled from the interpreter's state dict. + auto *tstate = detail::get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + return false; + } + return last_storage_ptr_tls() != nullptr; + } + return last_storage_ptr_singleton_ != nullptr; } static call_once_storage_map_type *get_or_create_call_once_storage_map() { @@ -257,8 +304,7 @@ class gil_safe_call_once_and_store { } else { // Use unique_ptr for exception safety: if capsule creation throws, // the map is automatically deleted. 
- auto storage_map_ptr - = std::unique_ptr(new call_once_storage_map_type()); + auto storage_map_ptr = std::make_unique(); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] @@ -281,7 +327,9 @@ class gil_safe_call_once_and_store { // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. - T *last_storage_ptr_ = nullptr; + // This is separate from the thread-local cache above and maybe not initialized by the main + // interpreter. + T *last_storage_ptr_singleton_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). detail::atomic_bool is_initialized_by_at_least_one_interpreter_{false}; From 66e469735a028aa34e14ed5e8ccd355e7f20687c Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 21 Dec 2025 19:36:04 +0800 Subject: [PATCH 44/60] Revert "Add thread local cache for gil_safe_call_once_and_store" This reverts commit 5d6681956d2d326fe74c7bf80e845c8e8ddb2a7c. --- include/pybind11/gil_safe_call_once.h | 70 +++++---------------------- 1 file changed, 12 insertions(+), 58 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 12087b23da..0f831b51e1 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -174,7 +174,7 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { - if (!is_last_storage_tls_valid()) { + if (!is_last_storage_valid()) { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. 
@@ -201,10 +201,10 @@ class gil_safe_call_once_and_store { ::new (value->storage) T(fn()); value->finalize = finalize_fn; value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); is_initialized_by_at_least_one_interpreter_ = true; }); // All threads will observe `is_initialized_by_at_least_one_interpreter_` as true here. - update_storage_tls_cache(reinterpret_cast(value->storage)); } // Intentionally not returning `T &` to ensure the calling code is self-documenting. return *this; @@ -212,14 +212,13 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { - T *result = get_storage_tls_cache(); - if (!is_last_storage_tls_valid()) { + T *result = last_storage_ptr_; + if (!is_last_storage_valid()) { gil_scoped_acquire gil_acq; const void *const key = reinterpret_cast(this); auto &storage_map = *get_or_create_call_once_storage_map(); auto *value = static_cast *>(storage_map.at(key)); - result = reinterpret_cast(value->storage); - update_storage_tls_cache(result); + result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); return *result; @@ -232,55 +231,9 @@ class gil_safe_call_once_and_store { PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: - // Fast local cache to avoid repeated lookups when the interpreter has not changed on the - // current thread. Similar to `internals_pp_manager::{internals_p_tls,last_istate_tls}`. - static T *&last_storage_ptr_tls() { - static thread_local T *last_storage_ptr = nullptr; - return last_storage_ptr; - } - - static PyInterpreterState *&last_istate_tls() { - static thread_local PyInterpreterState *last_istate = nullptr; - return last_istate; - } - - // See also: internals_pp_manager::get_pp() - T *get_storage_tls_cache() const { - // The caller should be aware that the cached pointer may be invalid. 
- // It can only be used after checking `is_last_storage_tls_valid()`. - if (detail::get_num_interpreters_seen() > 1) { - return last_storage_ptr_tls(); - } - return last_storage_ptr_singleton_; - } - - void update_storage_tls_cache(T *ptr) { - gil_scoped_acquire_simple gil; - if (detail::get_num_interpreters_seen() > 1) { - auto *tstate = detail::get_thread_state_unchecked(); - if (tstate) { - last_istate_tls() = tstate->interp; - } - last_storage_ptr_tls() = ptr; - } else { - last_storage_ptr_singleton_ = ptr; - } - } - - bool is_last_storage_tls_valid() const { - if (!is_initialized_by_at_least_one_interpreter_) { - return false; - } - if (detail::get_num_interpreters_seen() > 1) { - // Whenever the interpreter changes on the current thread we need to invalidate the - // cached storage pointer so that it can be pulled from the interpreter's state dict. - auto *tstate = detail::get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - return false; - } - return last_storage_ptr_tls() != nullptr; - } - return last_storage_ptr_singleton_ != nullptr; + bool is_last_storage_valid() const { + return is_initialized_by_at_least_one_interpreter_ + && detail::get_num_interpreters_seen() == 1; } static call_once_storage_map_type *get_or_create_call_once_storage_map() { @@ -327,9 +280,10 @@ class gil_safe_call_once_and_store { // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. - // This is separate from the thread-local cache above and maybe not initialized by the main - // interpreter. - T *last_storage_ptr_singleton_ = nullptr; + // WARNING: We cannot use thread local cache similar to `internals_pp_manager::internals_p_tls` + // because the thread local storage cannot be explicitly invalidated when interpreters + // are destroyed (unlike `internals_pp_manager` which has explicit hooks for that). 
+ T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). detail::atomic_bool is_initialized_by_at_least_one_interpreter_{false}; From d0819cca249f9a0bcc714b7fcc24ddb67dd9f6d2 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 01:15:09 +0800 Subject: [PATCH 45/60] Revert changes according to code review --- include/pybind11/gil_safe_call_once.h | 3 ++- include/pybind11/subinterpreter.h | 5 +---- tests/test_with_catch/test_subinterpreter.cpp | 10 +++++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 0f831b51e1..7814a65c9e 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -257,7 +257,8 @@ class gil_safe_call_once_and_store { } else { // Use unique_ptr for exception safety: if capsule creation throws, // the map is automatically deleted. - auto storage_map_ptr = std::make_unique(); + auto storage_map_ptr + = std::unique_ptr(new call_once_storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index f7eafe0eaf..aaf5204570 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -13,7 +13,6 @@ #include "detail/internals.h" #include "gil.h" -#include #include #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT @@ -31,8 +30,6 @@ inline PyInterpreterState *get_interpreter_state_unchecked() { } PYBIND11_NAMESPACE_END(detail) -using interpid_t = std::int64_t; - class subinterpreter; /// Activate the subinterpreter and acquire its GIL, while also releasing any GIL and interpreter @@ -217,7 +214,7 @@ class subinterpreter { } /// Get the numerical identifier for the sub-interpreter - interpid_t id() const { + int64_t id() 
const { if (istate_ != nullptr) { return PyInterpreterState_GetID(istate_); } diff --git a/tests/test_with_catch/test_subinterpreter.cpp b/tests/test_with_catch/test_subinterpreter.cpp index 73afe04644..1eefdc68ca 100644 --- a/tests/test_with_catch/test_subinterpreter.cpp +++ b/tests/test_with_catch/test_subinterpreter.cpp @@ -346,14 +346,14 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { return py::int_(PyInterpreterState_GetID(PyInterpreterState_Get())); }) .get_stored(); - REQUIRE(main_value.cast() == main_interp_id); + REQUIRE(main_value.cast() == main_interp_id); py::object dict_type = get_dict_type_object(); py::object ordered_dict_type = get_ordered_dict_type_object(); py::object default_dict_type = get_default_dict_type_object(); - py::interpid_t sub_interp_id = -1; - py::interpid_t sub_cached_value = -1; + int64_t sub_interp_id = -1; + int64_t sub_cached_value = -1; bool sub_default_dict_type_destroyed = false; @@ -375,7 +375,7 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { }) .get_stored(); - sub_cached_value = sub_value.cast(); + sub_cached_value = sub_value.cast(); // The cached value should be the SUBINTERPRETER's ID, not the main interpreter's. // This would fail without per-interpreter storage. 
@@ -408,7 +408,7 @@ TEST_CASE("gil_safe_call_once_and_store per-interpreter isolation") { // Back in main interpreter, verify main's value is unchanged auto &main_value_after = storage.get_stored(); - REQUIRE(main_value_after.cast() == main_interp_id); + REQUIRE(main_value_after.cast() == main_interp_id); // Verify that the types cached in main are unchanged py::object dict_type_after = get_dict_type_object(); From 5ce00e58fab2cd0982e17bc4cc0daa2b226a2db6 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 21 Dec 2025 17:45:06 +0800 Subject: [PATCH 46/60] Relocate multiple-interpreters tests --- tests/CMakeLists.txt | 22 ++--------- .../test_multiple_interpreters/CMakeLists.txt | 39 +++++++++++++++++++ .../mod_per_interpreter_gil.cpp | 0 .../mod_shared_interpreter_gil.cpp | 0 .../test_multiple_interpreters.py | 0 5 files changed, 42 insertions(+), 19 deletions(-) create mode 100644 tests/test_multiple_interpreters/CMakeLists.txt rename tests/{ => test_multiple_interpreters}/mod_per_interpreter_gil.cpp (100%) rename tests/{ => test_multiple_interpreters}/mod_shared_interpreter_gil.cpp (100%) rename tests/{ => test_multiple_interpreters}/test_multiple_interpreters.py (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 47ba4aa863..bc875ef855 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -157,7 +157,7 @@ set(PYBIND11_TEST_FILES test_methods_and_attributes test_modules test_multiple_inheritance - test_multiple_interpreters.py + test_multiple_interpreters/test_multiple_interpreters.py test_native_enum test_numpy_array test_numpy_dtypes @@ -578,24 +578,8 @@ add_custom_target( USES_TERMINAL) if(NOT PYBIND11_CUDA_TESTS) - # This module doesn't get mixed with other test modules because those aren't subinterpreter safe. 
- pybind11_add_module(mod_per_interpreter_gil THIN_LTO mod_per_interpreter_gil.cpp) - pybind11_add_module(mod_shared_interpreter_gil THIN_LTO mod_shared_interpreter_gil.cpp) - set_target_properties(mod_per_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "$<1:${CMAKE_CURRENT_BINARY_DIR}>") - set_target_properties(mod_shared_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "$<1:${CMAKE_CURRENT_BINARY_DIR}>") - if(PYBIND11_TEST_SMART_HOLDER) - target_compile_definitions( - mod_per_interpreter_gil - PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE - ) - target_compile_definitions( - mod_shared_interpreter_gil - PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE - ) - endif() - add_dependencies(pytest mod_per_interpreter_gil mod_shared_interpreter_gil) + # Multiple interpreters tests are in a separate subdirectory + add_subdirectory(test_multiple_interpreters) endif() if(PYBIND11_TEST_OVERRIDE) diff --git a/tests/test_multiple_interpreters/CMakeLists.txt b/tests/test_multiple_interpreters/CMakeLists.txt new file mode 100644 index 0000000000..7d72402857 --- /dev/null +++ b/tests/test_multiple_interpreters/CMakeLists.txt @@ -0,0 +1,39 @@ +# CMakeLists.txt -- Build system for the pybind11 multiple interpreters test suite +# +# Copyright (c) 2015 Wenzel Jakob +# +# All rights reserved. Use of this source code is governed by a +# BSD-style license that can be found in the LICENSE file. + +set(PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES test_multiple_interpreters.py) + +# These modules don't get mixed with other test modules because those aren't subinterpreter safe. 
+pybind11_add_module(mod_per_interpreter_gil THIN_LTO mod_per_interpreter_gil.cpp) +pybind11_add_module(mod_shared_interpreter_gil THIN_LTO mod_shared_interpreter_gil.cpp) +pybind11_enable_warnings(mod_per_interpreter_gil) +pybind11_enable_warnings(mod_shared_interpreter_gil) + +# Put the built modules next to `pybind11_tests.so` so that the test scripts can find them. +get_target_property(pybind11_tests_output_directory pybind11_tests LIBRARY_OUTPUT_DIRECTORY) +set_target_properties(mod_per_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY + "${pybind11_tests_output_directory}") +set_target_properties(mod_shared_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY + "${pybind11_tests_output_directory}") + +if(PYBIND11_TEST_SMART_HOLDER) + target_compile_definitions( + mod_per_interpreter_gil + PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) + target_compile_definitions( + mod_shared_interpreter_gil + PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) +endif() + +add_dependencies(pytest mod_per_interpreter_gil mod_shared_interpreter_gil) + +# Convert relative to full file names and add to pytest test files +list(TRANSFORM PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/") +list(APPEND PYBIND11_ABS_PYTEST_FILES ${PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES}) +set(PYBIND11_ABS_PYTEST_FILES + ${PYBIND11_ABS_PYTEST_FILES} + PARENT_SCOPE) diff --git a/tests/mod_per_interpreter_gil.cpp b/tests/test_multiple_interpreters/mod_per_interpreter_gil.cpp similarity index 100% rename from tests/mod_per_interpreter_gil.cpp rename to tests/test_multiple_interpreters/mod_per_interpreter_gil.cpp diff --git a/tests/mod_shared_interpreter_gil.cpp b/tests/test_multiple_interpreters/mod_shared_interpreter_gil.cpp similarity index 100% rename from tests/mod_shared_interpreter_gil.cpp rename to tests/test_multiple_interpreters/mod_shared_interpreter_gil.cpp diff --git 
a/tests/test_multiple_interpreters.py b/tests/test_multiple_interpreters/test_multiple_interpreters.py similarity index 100% rename from tests/test_multiple_interpreters.py rename to tests/test_multiple_interpreters/test_multiple_interpreters.py From 97b50fe914971aae74b7bc7629d4cace853fdf36 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 02:41:02 +0800 Subject: [PATCH 47/60] Add more tests for multiple interpreters --- .../test_multiple_interpreters/CMakeLists.txt | 33 +-- ...mod_per_interpreter_gil_with_singleton.cpp | 131 ++++++++++ .../test_multiple_interpreters.py | 237 ++++++++++++++++-- 3 files changed, 370 insertions(+), 31 deletions(-) create mode 100644 tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp diff --git a/tests/test_multiple_interpreters/CMakeLists.txt b/tests/test_multiple_interpreters/CMakeLists.txt index 7d72402857..a22949dd9a 100644 --- a/tests/test_multiple_interpreters/CMakeLists.txt +++ b/tests/test_multiple_interpreters/CMakeLists.txt @@ -7,29 +7,32 @@ set(PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES test_multiple_interpreters.py) +set(modules mod_per_interpreter_gil mod_shared_interpreter_gil + mod_per_interpreter_gil_with_singleton) + # These modules don't get mixed with other test modules because those aren't subinterpreter safe. -pybind11_add_module(mod_per_interpreter_gil THIN_LTO mod_per_interpreter_gil.cpp) -pybind11_add_module(mod_shared_interpreter_gil THIN_LTO mod_shared_interpreter_gil.cpp) -pybind11_enable_warnings(mod_per_interpreter_gil) -pybind11_enable_warnings(mod_shared_interpreter_gil) +foreach(mod IN LISTS modules) + pybind11_add_module("${mod}" THIN_LTO "${mod}.cpp") + pybind11_enable_warnings("${mod}") +endforeach() # Put the built modules next to `pybind11_tests.so` so that the test scripts can find them. 
get_target_property(pybind11_tests_output_directory pybind11_tests LIBRARY_OUTPUT_DIRECTORY) -set_target_properties(mod_per_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "${pybind11_tests_output_directory}") -set_target_properties(mod_shared_interpreter_gil PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "${pybind11_tests_output_directory}") +foreach(mod IN LISTS modules) + set_target_properties("${mod}" PROPERTIES LIBRARY_OUTPUT_DIRECTORY + "${pybind11_tests_output_directory}") +endforeach() if(PYBIND11_TEST_SMART_HOLDER) - target_compile_definitions( - mod_per_interpreter_gil - PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) - target_compile_definitions( - mod_shared_interpreter_gil - PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE) + foreach(mod IN LISTS modules) + target_compile_definitions( + "${mod}" + PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE + ) + endforeach() endif() -add_dependencies(pytest mod_per_interpreter_gil mod_shared_interpreter_gil) +add_dependencies(pytest ${modules}) # Convert relative to full file names and add to pytest test files list(TRANSFORM PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp new file mode 100644 index 0000000000..73256c09d8 --- /dev/null +++ b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp @@ -0,0 +1,131 @@ +#include +#include + +#include + +namespace py = pybind11; + +#ifdef PYBIND11_HAS_NATIVE_ENUM +# include +#endif + +// A singleton class that holds references to certain Python objects +// This singleton is per-interpreter using gil_safe_call_once_and_store +class MySingleton { +public: + MySingleton() = default; + ~MySingleton() = default; + + static 
MySingleton &get_instance() { + PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; + return storage + .call_once_and_store_result([]() { + MySingleton instance{}; + + auto emplace = [&instance](const py::handle &obj) -> void { + obj.inc_ref(); // Ensure the object is not GC'd while interpreter is alive + instance.objects.emplace_back(obj); + }; + + // Example objects to store in the singleton + emplace(py::type::handle_of(py::none())); // static type + emplace(py::type::handle_of(py::tuple())); // static type + emplace(py::type::handle_of(py::list())); // static type + emplace(py::type::handle_of(py::dict())); // static type + emplace(py::module_::import("collections").attr("OrderedDict")); // static type + emplace(py::module_::import("collections").attr("defaultdict")); // heap type + emplace(py::module_::import("collections").attr("deque")); // heap type + return instance; + }) + .get_stored(); + } + + std::vector &get_objects() { return objects; } + + static void init() { + // Ensures the singleton is created + get_instance(); + // Register cleanup at interpreter exit + py::module_::import("atexit").attr("register")(py::cpp_function(&MySingleton::clear)); + } + + static void clear() { + auto &instance = get_instance(); + for (const auto &obj : instance.objects) { + obj.dec_ref(); + } + instance.objects.clear(); + } + +private: + std::vector objects; +}; + +class MyClass { +public: + explicit MyClass(py::ssize_t v) : value(v) {} + py::ssize_t get_value() const { return value; } + +private: + py::ssize_t value; +}; + +class MyGlobalError : public std::runtime_error { +public: + using std::runtime_error::runtime_error; +}; + +class MyLocalError : public std::runtime_error { +public: + using std::runtime_error::runtime_error; +}; + +enum class MyEnum : int { + ONE = 1, + TWO = 2, + THREE = 3, +}; + +PYBIND11_MODULE(mod_per_interpreter_gil_with_singleton, + m, + py::mod_gil_not_used(), + py::multiple_interpreters::per_interpreter_gil()) { +#ifdef 
PYBIND11_HAS_SUBINTERPRETER_SUPPORT + m.attr("defined_PYBIND11_HAS_SUBINTERPRETER_SUPPORT") = true; +#else + m.attr("defined_PYBIND11_HAS_SUBINTERPRETER_SUPPORT") = false; +#endif + + MySingleton::init(); + + // Ensure py::multiple_interpreters::per_interpreter_gil() works with singletons using + // py::gil_safe_call_once_and_store + m.def( + "get_objects_in_singleton", + []() -> std::vector { return MySingleton::get_instance().get_objects(); }, + "Get the list of objects stored in the singleton"); + + // Ensure py::multiple_interpreters::per_interpreter_gil() works with class bindings + py::class_(m, "MyClass") + .def(py::init()) + .def("get_value", &MyClass::get_value); + + // Ensure py::multiple_interpreters::per_interpreter_gil() works with global exceptions + py::register_exception(m, "MyGlobalError"); + // Ensure py::multiple_interpreters::per_interpreter_gil() works with local exceptions + py::register_local_exception(m, "MyLocalError"); + +#ifdef PYBIND11_HAS_NATIVE_ENUM + // Ensure py::multiple_interpreters::per_interpreter_gil() works with native_enum + py::native_enum(m, "MyEnum", "enum.IntEnum") + .value("ONE", MyEnum::ONE) + .value("TWO", MyEnum::TWO) + .value("THREE", MyEnum::THREE) + .finalize(); +#else + py::enum_(m, "MyEnum") + .value("ONE", MyEnum::ONE) + .value("TWO", MyEnum::TWO) + .value("THREE", MyEnum::THREE); +#endif +} diff --git a/tests/test_multiple_interpreters/test_multiple_interpreters.py b/tests/test_multiple_interpreters/test_multiple_interpreters.py index 627ccc591d..7e70324b6e 100644 --- a/tests/test_multiple_interpreters/test_multiple_interpreters.py +++ b/tests/test_multiple_interpreters/test_multiple_interpreters.py @@ -3,7 +3,9 @@ import contextlib import os import pickle +import subprocess import sys +import textwrap import pytest @@ -91,12 +93,14 @@ def test_independent_subinterpreters(): if not m.defined_PYBIND11_HAS_SUBINTERPRETER_SUPPORT: pytest.skip("Does not have subinterpreter support compiled in") - code = """ -import 
mod_per_interpreter_gil as m -import pickle -with open(pipeo, 'wb') as f: - pickle.dump(m.internals_at(), f) -""" + code = textwrap.dedent( + """ + import mod_per_interpreter_gil as m + import pickle + with open(pipeo, 'wb') as f: + pickle.dump(m.internals_at(), f) + """ + ).strip() with create() as interp1, create() as interp2: try: @@ -140,11 +144,13 @@ def test_independent_subinterpreters_modern(): from concurrent import interpreters - code = """ -import mod_per_interpreter_gil as m + code = textwrap.dedent( + """ + import mod_per_interpreter_gil as m -values.put_nowait(m.internals_at()) -""" + values.put_nowait(m.internals_at()) + """ + ).strip() with contextlib.closing(interpreters.create()) as interp1, contextlib.closing( interpreters.create() @@ -184,12 +190,14 @@ def test_dependent_subinterpreters(): if not m.defined_PYBIND11_HAS_SUBINTERPRETER_SUPPORT: pytest.skip("Does not have subinterpreter support compiled in") - code = """ -import mod_shared_interpreter_gil as m -import pickle -with open(pipeo, 'wb') as f: - pickle.dump(m.internals_at(), f) -""" + code = textwrap.dedent( + """ + import mod_shared_interpreter_gil as m + import pickle + with open(pipeo, 'wb') as f: + pickle.dump(m.internals_at(), f) + """ + ).strip() with create("legacy") as interp1: pipei, pipeo = os.pipe() @@ -198,3 +206,200 @@ def test_dependent_subinterpreters(): res1 = pickle.load(f) assert res1 != m.internals_at(), "internals should differ from main interpreter" + + +@pytest.mark.skipif( + sys.platform.startswith("emscripten"), reason="Requires loadable modules" +) +@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") +def test_import_module_with_singleton_per_interpreter(): + """Tests that a singleton storing Python objects works correctly per-interpreter""" + + sys.path.append(".") + + from concurrent import interpreters + + code = textwrap.dedent( + """ + import collections + import mod_per_interpreter_gil_with_singleton as m + + objects = 
m.get_objects_in_singleton() + assert objects == [ + type(None), + tuple, + list, + dict, + collections.OrderedDict, + collections.defaultdict, + collections.deque, + ] + + assert hasattr(m, 'MyClass') + assert hasattr(m, 'MyGlobalError') + assert hasattr(m, 'MyLocalError') + assert hasattr(m, 'MyEnum') + """ + ).strip() + + with contextlib.closing(interpreters.create()) as interp: + interp.exec(code) + + +def check_script_success(code: str, *, rerun: int = 5) -> None: + code = textwrap.dedent(code).strip() + try: + for _ in range(rerun): # run flakily failing test multiple times + subprocess.check_output( + [sys.executable, "-c", code], + cwd=os.getcwd(), + stderr=subprocess.STDOUT, + text=True, + ) + except subprocess.CalledProcessError as ex: + pytest.fail( + f"Subprocess failed with exit code {ex.returncode}.\nOutput:\n{ex.output}" + ) + + +@pytest.mark.skipif( + sys.platform.startswith("emscripten"), reason="Requires loadable modules" +) +@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") +def test_import_in_subinterpreter_before_after(): + """Tests that importing a module in a subinterpreter after the main interpreter works correctly""" + check_script_success( + """ + import contextlib + import gc + import sys + from concurrent import interpreters + + sys.path.append('.') + + def test(): + import collections + import mod_per_interpreter_gil_with_singleton as m + + objects = m.get_objects_in_singleton() + assert objects == [ + type(None), + tuple, + list, + dict, + collections.OrderedDict, + collections.defaultdict, + collections.deque, + ] + + assert hasattr(m, 'MyClass') + assert hasattr(m, 'MyGlobalError') + assert hasattr(m, 'MyLocalError') + assert hasattr(m, 'MyEnum') + + test() + + interp = None + with contextlib.closing(interpreters.create()) as interp: + interp.call(test) + del interp + + for _ in range(5): + gc.collect() + """ + ) + + +@pytest.mark.skipif( + sys.platform.startswith("emscripten"), reason="Requires 
loadable modules" +) +@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") +def test_import_in_subinterpreter_before_main(): + """Tests that importing a module in a subinterpreter before the main interpreter works correctly""" + check_script_success( + """ + import contextlib + import gc + import sys + from concurrent import interpreters + + sys.path.append('.') + + def test(): + import collections + import mod_per_interpreter_gil_with_singleton as m + + objects = m.get_objects_in_singleton() + assert objects == [ + type(None), + tuple, + list, + dict, + collections.OrderedDict, + collections.defaultdict, + collections.deque, + ] + + assert hasattr(m, 'MyClass') + assert hasattr(m, 'MyGlobalError') + assert hasattr(m, 'MyLocalError') + assert hasattr(m, 'MyEnum') + + interp = None + with contextlib.closing(interpreters.create()) as interp: + interp.call(test) + del interp + + test() + + for _ in range(5): + gc.collect() + """ + ) + + +@pytest.mark.skipif( + sys.platform.startswith("emscripten"), reason="Requires loadable modules" +) +@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") +def test_import_in_subinterpreter_concurrently(): + """Tests that importing a module in multiple subinterpreters concurrently works correctly""" + check_script_success( + """ + import gc + import sys + from concurrent.futures import InterpreterPoolExecutor, as_completed + + sys.path.append('.') + + def test(): + import collections + import mod_per_interpreter_gil_with_singleton as m + + objects = m.get_objects_in_singleton() + assert objects == [ + type(None), + tuple, + list, + dict, + collections.OrderedDict, + collections.defaultdict, + collections.deque, + ] + + assert hasattr(m, 'MyClass') + assert hasattr(m, 'MyGlobalError') + assert hasattr(m, 'MyLocalError') + assert hasattr(m, 'MyEnum') + + futures = future = None + with InterpreterPoolExecutor(max_workers=16) as executor: + futures = [executor.submit(test) 
for _ in range(32)] + for future in as_completed(futures): + future.result() + del futures, future, executor + + for _ in range(5): + gc.collect() + """ + ) From 8819ec43a1ec94f9cf7ee4cdb907bb48158fe446 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 02:56:28 +0800 Subject: [PATCH 48/60] Remove copy constructor --- .../mod_per_interpreter_gil_with_singleton.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp index 73256c09d8..c94563cee1 100644 --- a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp +++ b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp @@ -15,6 +15,10 @@ class MySingleton { public: MySingleton() = default; ~MySingleton() = default; + MySingleton(const MySingleton &) = delete; + MySingleton &operator=(const MySingleton &) = delete; + MySingleton(MySingleton &&) = default; + MySingleton &operator=(MySingleton &&) = default; static MySingleton &get_instance() { PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; From d9daef5b84f69697d436680c66b4bcf9b03c2aaf Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 12:55:43 +0800 Subject: [PATCH 49/60] Apply suggestions from code review --- include/pybind11/gil_safe_call_once.h | 49 +++++++++++++++------------ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 7814a65c9e..2adad83bfc 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -19,13 +19,13 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) -namespace detail { +PYBIND11_NAMESPACE_BEGIN(detail) #if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) using atomic_bool = std::atomic_bool; #else using atomic_bool = bool; #endif -} // namespace 
detail +PYBIND11_NAMESPACE_END(detail) // Use the `gil_safe_call_once_and_store` class below instead of the naive // @@ -127,6 +127,7 @@ class gil_safe_call_once_and_store { // subinterpreter has its own separate state. The cached result may not shareable across // interpreters (e.g., imported modules and their members). +PYBIND11_NAMESPACE_BEGIN(detail) struct call_once_storage_base { call_once_storage_base() = default; virtual ~call_once_storage_base() = default; @@ -159,9 +160,7 @@ struct call_once_storage : call_once_storage_base { call_once_storage &operator=(call_once_storage &&) = delete; }; -/// Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's state -/// dict with proper destructor to ensure cleanup when the interpreter is destroyed. -using call_once_storage_map_type = std::unordered_map; +PYBIND11_NAMESPACE_END(detail) # define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" @@ -178,18 +177,18 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - const void *const key = reinterpret_cast(this); + const void *const key = this; // There can be multiple threads going through here. - call_once_storage *value = nullptr; + storage_type *value = nullptr; { gil_scoped_acquire gil_acq; // Only one thread will enter here at a time. 
auto &storage_map = *get_or_create_call_once_storage_map(); const auto it = storage_map.find(key); if (it != storage_map.end()) { - value = static_cast *>(it->second); + value = static_cast(it->second); } else { - value = new call_once_storage{}; + value = new storage_type{}; storage_map.emplace(key, value); } } @@ -215,9 +214,9 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { gil_scoped_acquire gil_acq; - const void *const key = reinterpret_cast(this); + const void *const key = this; auto &storage_map = *get_or_create_call_once_storage_map(); - auto *value = static_cast *>(storage_map.at(key)); + auto *value = static_cast(storage_map.at(key)); result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); @@ -231,20 +230,27 @@ class gil_safe_call_once_and_store { PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: + using storage_base_type = detail::call_once_storage_base; + using storage_type = detail::call_once_storage; + // Use base type pointer for polymorphism + using storage_map_type = std::unordered_map; + bool is_last_storage_valid() const { return is_initialized_by_at_least_one_interpreter_ && detail::get_num_interpreters_seen() == 1; } - static call_once_storage_map_type *get_or_create_call_once_storage_map() { + // Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's + // state dict with proper destructor to ensure cleanup when the interpreter is destroyed. + static storage_map_type *get_or_create_call_once_storage_map() { // Preserve any existing Python error state. dict_getitemstringref may clear // errors or set new ones when the key is not found; we restore the original // error state when this scope exits. 
error_scope err_scope; - dict state_dict = detail::get_python_state_dict(); + auto state_dict = reinterpret_borrow(detail::get_python_state_dict()); auto storage_map_obj = reinterpret_steal( detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); - call_once_storage_map_type *storage_map = nullptr; + storage_map_type *storage_map = nullptr; if (storage_map_obj) { void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); if (!raw_ptr) { @@ -253,21 +259,22 @@ class gil_safe_call_once_and_store { "get_or_create_call_once_storage_map() FAILED"); throw error_already_set(); } - storage_map = reinterpret_cast(raw_ptr); + storage_map = static_cast(raw_ptr); } else { // Use unique_ptr for exception safety: if capsule creation throws, // the map is automatically deleted. - auto storage_map_ptr - = std::unique_ptr(new call_once_storage_map_type()); + auto storage_map_ptr = std::unique_ptr(new storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] = capsule(storage_map_ptr.get(), [](void *ptr) noexcept { - auto *map = reinterpret_cast(ptr); - for (const auto &entry : *map) { - delete entry.second; + auto *map = static_cast(ptr); + while (!map->empty()) { + auto it = map->begin(); + const auto *storage_ptr = it->second; + map->erase(it); + delete storage_ptr; } - delete map; }); // Capsule now owns the storage map, release from unique_ptr storage_map = storage_map_ptr.release(); From 9a3328bcd0d88bbac8b2df3244f3e9c5f94ad76e Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 14:50:10 +0800 Subject: [PATCH 50/60] Refactor to use per-storage capsule instead --- include/pybind11/gil_safe_call_once.h | 165 ++++++++++++++++---------- include/pybind11/pytypes.h | 43 ++++++- 2 files changed, 146 insertions(+), 62 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h 
index 2adad83bfc..56c14d13f6 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -13,8 +13,9 @@ # include #endif #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include # include -# include +# include #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -111,6 +112,12 @@ class gil_safe_call_once_and_store { // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + // Disable copy and move operations. + gil_safe_call_once_and_store(const gil_safe_call_once_and_store &) = delete; + gil_safe_call_once_and_store(gil_safe_call_once_and_store &&) = delete; + gil_safe_call_once_and_store &operator=(const gil_safe_call_once_and_store &) = delete; + gil_safe_call_once_and_store &operator=(gil_safe_call_once_and_store &&) = delete; + private: // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; @@ -128,24 +135,16 @@ class gil_safe_call_once_and_store { // interpreters (e.g., imported modules and their members). 
PYBIND11_NAMESPACE_BEGIN(detail) -struct call_once_storage_base { - call_once_storage_base() = default; - virtual ~call_once_storage_base() = default; - call_once_storage_base(const call_once_storage_base &) = delete; - call_once_storage_base(call_once_storage_base &&) = delete; - call_once_storage_base &operator=(const call_once_storage_base &) = delete; - call_once_storage_base &operator=(call_once_storage_base &&) = delete; -}; template -struct call_once_storage : call_once_storage_base { +struct call_once_storage { alignas(T) char storage[sizeof(T)] = {}; std::once_flag once_flag; void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; call_once_storage() = default; - ~call_once_storage() override { + ~call_once_storage() { if (is_initialized) { if (finalize != nullptr) { finalize(*reinterpret_cast(storage)); @@ -162,7 +161,8 @@ struct call_once_storage : call_once_storage_base { PYBIND11_NAMESPACE_END(detail) -# define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" +// Prefix for storage keys in the interpreter state dict. +# define PYBIND11_CALL_ONCE_STORAGE_KEY_PREFIX PYBIND11_INTERNALS_ID "_call_once_storage__" // The life span of the stored result is the entire interpreter lifetime. An additional // `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. @@ -177,20 +177,12 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - const void *const key = this; // There can be multiple threads going through here. storage_type *value = nullptr; { gil_scoped_acquire gil_acq; // Only one thread will enter here at a time. 
- auto &storage_map = *get_or_create_call_once_storage_map(); - const auto it = storage_map.find(key); - if (it != storage_map.end()) { - value = static_cast(it->second); - } else { - value = new storage_type{}; - storage_map.emplace(key, value); - } + value = get_or_create_storage_in_state_dict(); } assert(value != nullptr); std::call_once(value->once_flag, [&] { @@ -214,9 +206,7 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { gil_scoped_acquire gil_acq; - const void *const key = this; - auto &storage_map = *get_or_create_call_once_storage_map(); - auto *value = static_cast(storage_map.at(key)); + auto *value = get_or_create_storage_in_state_dict(); result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); @@ -229,62 +219,115 @@ class gil_safe_call_once_and_store { // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + // Disable copy and move operations because the memory address is used as key. + gil_safe_call_once_and_store(const gil_safe_call_once_and_store &) = delete; + gil_safe_call_once_and_store(gil_safe_call_once_and_store &&) = delete; + gil_safe_call_once_and_store &operator=(const gil_safe_call_once_and_store &) = delete; + gil_safe_call_once_and_store &operator=(gil_safe_call_once_and_store &&) = delete; + private: - using storage_base_type = detail::call_once_storage_base; using storage_type = detail::call_once_storage; - // Use base type pointer for polymorphism - using storage_map_type = std::unordered_map; + // Indicator of fast path for single-interpreter case. bool is_last_storage_valid() const { return is_initialized_by_at_least_one_interpreter_ && detail::get_num_interpreters_seen() == 1; } - // Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's - // state dict with proper destructor to ensure cleanup when the interpreter is destroyed. 
- static storage_map_type *get_or_create_call_once_storage_map() { - // Preserve any existing Python error state. dict_getitemstringref may clear - // errors or set new ones when the key is not found; we restore the original - // error state when this scope exits. - error_scope err_scope; + // Get the unique key for this storage instance in the interpreter's state dict. + // Do not change the type of this to py::str because PyObject is interpreter-dependent. + std::string get_storage_key() const { + // The instance are expected to be global static, so using its address as unique + // identifier. The typical usage is like: + // + // PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; + // + return PYBIND11_CALL_ONCE_STORAGE_KEY_PREFIX + + std::to_string(reinterpret_cast(this)); + } + + // Get or create per-storage capsule. Uses test-and-set pattern with `PyDict_SetDefault` for + // thread-safe concurrent access. + storage_type *get_or_create_storage_in_state_dict() { + error_scope err_scope; // preserve any existing Python error states + auto state_dict = reinterpret_borrow(detail::get_python_state_dict()); - auto storage_map_obj = reinterpret_steal( - detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); - storage_map_type *storage_map = nullptr; - if (storage_map_obj) { - void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); + const std::string key = get_storage_key(); + PyObject *result = nullptr; + + // First, try to get existing storage (fast path). + { + result = detail::dict_getitemstring(state_dict.ptr(), key.c_str()); + if (result != nullptr) { + // Storage already exists, get the storage pointer from the existing capsule. 
+ void *raw_ptr = PyCapsule_GetPointer(result, /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_storage_in_state_dict() FAILED " + "(get existing)"); + throw error_already_set(); + } + return static_cast(raw_ptr); + } + if (PyErr_Occurred()) { + throw error_already_set(); + } + } + + // Storage doesn't exist yet, create a new one。 + // Use unique_ptr for exception safety: if capsule creation throws, + // the storage is automatically deleted. + auto storage_ptr = std::unique_ptr(new storage_type{}); + // Create capsule with destructor to clean up when the interpreter shuts down. + auto new_capsule = capsule( + storage_ptr.get(), [](void *ptr) -> void { delete static_cast(ptr); }); + + // Use `PyDict_SetDefault` for atomic test-and-set: + // - If key doesn't exist, inserts our capsule and returns it. + // - If key exists (another thread inserted first), returns the existing value. + // This is thread-safe because `PyDict_SetDefault` will hold a lock on the dict. + // + // NOTE: Here we use `dict_setdefaultstring` instead of `dict_setdefaultstringref` because + // the capsule is kept alive until interpreter shutdown, so we do not need to handle incref + // and decref here. + result = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); + if (result == nullptr) { + throw error_already_set(); + } + + // Check whether we inserted our new capsule or another thread did. + if (result == new_capsule.ptr()) { + // We successfully inserted our new capsule, release ownership from unique_ptr. + return storage_ptr.release(); + } + // Another thread already inserted a capsule, use theirs and discard ours. + { + // Disable the destructor of our unused capsule to prevent double-free: + // unique_ptr will clean up the storage on function exit, and the capsule should not. 
+ if (PyCapsule_SetDestructor(new_capsule.ptr(), nullptr) != 0) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_storage_in_state_dict() FAILED " + "(clear destructor of unused capsule)"); + throw error_already_set(); + } + // Get the storage pointer from the existing capsule. + void *raw_ptr = PyCapsule_GetPointer(result, /*name=*/nullptr); if (!raw_ptr) { raise_from(PyExc_SystemError, "pybind11::gil_safe_call_once_and_store::" - "get_or_create_call_once_storage_map() FAILED"); + "get_or_create_storage_in_state_dict() FAILED " + "(get after setdefault)"); throw error_already_set(); } - storage_map = static_cast(raw_ptr); - } else { - // Use unique_ptr for exception safety: if capsule creation throws, - // the map is automatically deleted. - auto storage_map_ptr = std::unique_ptr(new storage_map_type()); - // Create capsule with destructor to clean up the storage map when the interpreter - // shuts down - state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] - = capsule(storage_map_ptr.get(), [](void *ptr) noexcept { - auto *map = static_cast(ptr); - while (!map->empty()) { - auto it = map->begin(); - const auto *storage_ptr = it->second; - map->erase(it); - delete storage_ptr; - } - }); - // Capsule now owns the storage map, release from unique_ptr - storage_map = storage_map_ptr.release(); + return static_cast(raw_ptr); } - return storage_map; } // No storage needed when subinterpreter support is enabled. // The actual storage is stored in the per-interpreter state dict via - // `get_or_create_call_once_storage_map()`. + // `get_or_create_storage_in_state_dict()`. // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. 
diff --git a/include/pybind11/pytypes.h b/include/pybind11/pytypes.h index 0ab0b73e1f..64ebb1244c 100644 --- a/include/pybind11/pytypes.h +++ b/include/pybind11/pytypes.h @@ -997,8 +997,10 @@ inline PyObject *dict_getitem(PyObject *v, PyObject *key) { return rv; } +// PyDict_GetItemStringRef was added in Python 3.13.0a1. +// See also: https://github.com/python/pythoncapi-compat/blob/main/pythoncapi_compat.h inline PyObject *dict_getitemstringref(PyObject *v, const char *key) { -#if PY_VERSION_HEX >= 0x030D0000 +#if PY_VERSION_HEX >= 0x030D00A1 PyObject *rv = nullptr; if (PyDict_GetItemStringRef(v, key, &rv) < 0) { throw error_already_set(); @@ -1014,6 +1016,45 @@ inline PyObject *dict_getitemstringref(PyObject *v, const char *key) { #endif } +inline PyObject *dict_setdefaultstring(PyObject *v, const char *key, PyObject *defaultobj) { + PyObject *kv = PyUnicode_FromString(key); + if (kv == nullptr) { + throw error_already_set(); + } + + PyObject *rv = PyDict_SetDefault(v, kv, defaultobj); + Py_DECREF(kv); + if (rv == nullptr) { + throw error_already_set(); + } + return rv; +} + +// PyDict_SetDefaultRef was added in Python 3.13.0a4. +// See also: https://github.com/python/pythoncapi-compat/blob/main/pythoncapi_compat.h +inline PyObject *dict_setdefaultstringref(PyObject *v, const char *key, PyObject *defaultobj) { +#if PY_VERSION_HEX >= 0x030D00A4 + PyObject *kv = PyUnicode_FromString(key); + if (kv == nullptr) { + throw error_already_set(); + } + PyObject *rv = nullptr; + if (PyDict_SetDefaultRef(v, kv, defaultobj, &rv) < 0) { + Py_DECREF(kv); + throw error_already_set(); + } + Py_DECREF(kv); + return rv; +#else + PyObject *rv = dict_setdefaultstring(v, key, defaultobj); + if (rv == nullptr || PyErr_Occurred()) { + throw error_already_set(); + } + Py_XINCREF(rv); + return rv; +#endif +} + // Helper aliases/functions to support implicit casting of values given to python // accessors/methods. 
When given a pyobject, this simply returns the pyobject as-is; for other C++ // type, the value goes through pybind11::cast(obj) to convert it to an `object`. From bc20601a1769b7e766f842f6dda7ad33399c7c3f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 15:05:57 +0800 Subject: [PATCH 51/60] Update comments --- include/pybind11/gil_safe_call_once.h | 33 ++++++++++--------- .../test_multiple_interpreters/CMakeLists.txt | 12 +++---- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 56c14d13f6..6ecb02800b 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,7 +3,6 @@ #pragma once #include "detail/common.h" -#include "detail/internals.h" #include "gil.h" #include @@ -72,7 +71,8 @@ template class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. - // Note: The second parameter (finalize callback) is intentionally unused when subinterpreter + // + // NOTE: The second parameter (finalize callback) is intentionally unused when subinterpreter // support is disabled. In that case, storage is process-global and intentionally leaked to // avoid calling destructors after the Python interpreter has been finalized. template @@ -119,7 +119,7 @@ class gil_safe_call_once_and_store { gil_safe_call_once_and_store &operator=(gil_safe_call_once_and_store &&) = delete; private: - // Global static storage (per process) when subinterpreter support is disabled. + // The global static storage (per-process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; @@ -235,10 +235,10 @@ class gil_safe_call_once_and_store { } // Get the unique key for this storage instance in the interpreter's state dict. - // Do not change the type of this to py::str because PyObject is interpreter-dependent. 
+ // The return type should not be `py::str` because PyObject is interpreter-dependent. std::string get_storage_key() const { - // The instance are expected to be global static, so using its address as unique - // identifier. The typical usage is like: + // The instance is expected to be global static, so using its address as unique identifier. + // The typical usage is like: // // PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; // @@ -246,21 +246,21 @@ class gil_safe_call_once_and_store { + std::to_string(reinterpret_cast(this)); } - // Get or create per-storage capsule. Uses test-and-set pattern with `PyDict_SetDefault` for - // thread-safe concurrent access. + // Get or create per-storage capsule in the current interpreter's state dict. + // Use test-and-set pattern with `PyDict_SetDefault` for thread-safe concurrent access. storage_type *get_or_create_storage_in_state_dict() { error_scope err_scope; // preserve any existing Python error states auto state_dict = reinterpret_borrow(detail::get_python_state_dict()); const std::string key = get_storage_key(); - PyObject *result = nullptr; + PyObject *capsule_obj = nullptr; // First, try to get existing storage (fast path). { - result = detail::dict_getitemstring(state_dict.ptr(), key.c_str()); - if (result != nullptr) { + capsule_obj = detail::dict_getitemstring(state_dict.ptr(), key.c_str()); + if (capsule_obj != nullptr) { // Storage already exists, get the storage pointer from the existing capsule. - void *raw_ptr = PyCapsule_GetPointer(result, /*name=*/nullptr); + void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); if (!raw_ptr) { raise_from(PyExc_SystemError, "pybind11::gil_safe_call_once_and_store::" @@ -291,13 +291,14 @@ class gil_safe_call_once_and_store { // NOTE: Here we use `dict_setdefaultstring` instead of `dict_setdefaultstringref` because // the capsule is kept alive until interpreter shutdown, so we do not need to handle incref // and decref here. 
- result = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); - if (result == nullptr) { + capsule_obj + = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); + if (capsule_obj == nullptr) { throw error_already_set(); } // Check whether we inserted our new capsule or another thread did. - if (result == new_capsule.ptr()) { + if (capsule_obj == new_capsule.ptr()) { // We successfully inserted our new capsule, release ownership from unique_ptr. return storage_ptr.release(); } @@ -313,7 +314,7 @@ class gil_safe_call_once_and_store { throw error_already_set(); } // Get the storage pointer from the existing capsule. - void *raw_ptr = PyCapsule_GetPointer(result, /*name=*/nullptr); + void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); if (!raw_ptr) { raise_from(PyExc_SystemError, "pybind11::gil_safe_call_once_and_store::" diff --git a/tests/test_multiple_interpreters/CMakeLists.txt b/tests/test_multiple_interpreters/CMakeLists.txt index a22949dd9a..9a1445e0e9 100644 --- a/tests/test_multiple_interpreters/CMakeLists.txt +++ b/tests/test_multiple_interpreters/CMakeLists.txt @@ -7,24 +7,24 @@ set(PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES test_multiple_interpreters.py) -set(modules mod_per_interpreter_gil mod_shared_interpreter_gil - mod_per_interpreter_gil_with_singleton) +set(PYBIND11_MULTIPLE_INTERPRETERS_TEST_MODULES mod_per_interpreter_gil mod_shared_interpreter_gil + mod_per_interpreter_gil_with_singleton) # These modules don't get mixed with other test modules because those aren't subinterpreter safe. -foreach(mod IN LISTS modules) +foreach(mod IN LISTS PYBIND11_MULTIPLE_INTERPRETERS_TEST_MODULES) pybind11_add_module("${mod}" THIN_LTO "${mod}.cpp") pybind11_enable_warnings("${mod}") endforeach() # Put the built modules next to `pybind11_tests.so` so that the test scripts can find them. 
get_target_property(pybind11_tests_output_directory pybind11_tests LIBRARY_OUTPUT_DIRECTORY) -foreach(mod IN LISTS modules) +foreach(mod IN LISTS PYBIND11_MULTIPLE_INTERPRETERS_TEST_MODULES) set_target_properties("${mod}" PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${pybind11_tests_output_directory}") endforeach() if(PYBIND11_TEST_SMART_HOLDER) - foreach(mod IN LISTS modules) + foreach(mod IN LISTS PYBIND11_MULTIPLE_INTERPRETERS_TEST_MODULES) target_compile_definitions( "${mod}" PUBLIC -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE @@ -32,7 +32,7 @@ if(PYBIND11_TEST_SMART_HOLDER) endforeach() endif() -add_dependencies(pytest ${modules}) +add_dependencies(pytest ${PYBIND11_MULTIPLE_INTERPRETERS_TEST_MODULES}) # Convert relative to full file names and add to pytest test files list(TRANSFORM PYBIND11_MULTIPLE_INTERPRETERS_TEST_FILES PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/") From b39c04948644016e49cb12a44b18ea9f0dd75e10 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 15:54:17 +0800 Subject: [PATCH 52/60] Update singleton tests --- include/pybind11/gil_safe_call_once.h | 1 + .../mod_per_interpreter_gil_with_singleton.cpp | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 6ecb02800b..bfce9fa55a 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,6 +3,7 @@ #pragma once #include "detail/common.h" +#include "detail/internals.h" #include "gil.h" #include diff --git a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp index c94563cee1..5ee5b050e0 100644 --- a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp +++ b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp @@ -23,7 +23,7 @@ class MySingleton { static MySingleton 
&get_instance() { PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; return storage - .call_once_and_store_result([]() { + .call_once_and_store_result([]() -> MySingleton { MySingleton instance{}; auto emplace = [&instance](const py::handle &obj) -> void { @@ -39,6 +39,8 @@ class MySingleton { emplace(py::module_::import("collections").attr("OrderedDict")); // static type emplace(py::module_::import("collections").attr("defaultdict")); // heap type emplace(py::module_::import("collections").attr("deque")); // heap type + + assert(instance.objects.size() == 7); return instance; }) .get_stored(); @@ -47,14 +49,16 @@ class MySingleton { std::vector &get_objects() { return objects; } static void init() { - // Ensures the singleton is created - get_instance(); + // Ensure the singleton is created + auto &instance = get_instance(); + assert(instance.objects.size() == 7); // Register cleanup at interpreter exit py::module_::import("atexit").attr("register")(py::cpp_function(&MySingleton::clear)); } static void clear() { auto &instance = get_instance(); + assert(instance.objects.size() == 7); for (const auto &obj : instance.objects) { obj.dec_ref(); } From 9ef71ecf6bc49b0c31f17751e33080df094d5736 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 15:43:44 +0800 Subject: [PATCH 53/60] Use interpreter id type for `get_num_interpreters_seen()` --- include/pybind11/detail/internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 5ccd4d18e5..44d9f9e27a 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -420,8 +420,8 @@ inline PyThreadState *get_thread_state_unchecked() { /// We use this counter to figure out if there are or have been multiple subinterpreters active at /// any point. This must never decrease while any interpreter may be running in any thread! 
-inline std::atomic &get_num_interpreters_seen() { - static std::atomic counter(0); +inline std::atomic &get_num_interpreters_seen() { + static std::atomic counter(0); return counter; } From 98370f2d1a5097f86b8731d8334b9e1a1108a15d Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 16:37:59 +0800 Subject: [PATCH 54/60] Suppress unused variable warning --- .../mod_per_interpreter_gil_with_singleton.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp index 5ee5b050e0..874b93c77f 100644 --- a/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp +++ b/tests/test_multiple_interpreters/mod_per_interpreter_gil_with_singleton.cpp @@ -51,6 +51,7 @@ class MySingleton { static void init() { // Ensure the singleton is created auto &instance = get_instance(); + (void) instance; // suppress unused variable warning assert(instance.objects.size() == 7); // Register cleanup at interpreter exit py::module_::import("atexit").attr("register")(py::cpp_function(&MySingleton::clear)); @@ -58,6 +59,7 @@ class MySingleton { static void clear() { auto &instance = get_instance(); + (void) instance; // suppress unused variable warning assert(instance.objects.size() == 7); for (const auto &obj : instance.objects) { obj.dec_ref(); From 534235ea559247f7a4984174f57f809b90a93af4 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 15:44:51 +0800 Subject: [PATCH 55/60] HACKING --- include/pybind11/detail/internals.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 44d9f9e27a..0a2b03c807 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -421,7 +421,7 @@ inline PyThreadState *get_thread_state_unchecked() { /// We use this counter to figure out if there 
are or have been multiple subinterpreters active at /// any point. This must never decrease while any interpreter may be running in any thread! inline std::atomic &get_num_interpreters_seen() { - static std::atomic counter(0); + static std::atomic counter(2); // !FIXME!: hack for `get_num_interpreters_seen() > 1` return counter; } @@ -564,6 +564,10 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + // !FIXME!: If the module is imported from subinterpreter before the main interpreter, + // (get_num_interpreters_seen() == 1) will be true here. + // This is causing bugs like: + // https://github.com/pybind/pybind11/pull/5933#discussion_r2638712777 if (get_num_interpreters_seen() > 1) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is From d0387141e968c0f701240cae19ad538e11ff5767 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 20:23:49 +0800 Subject: [PATCH 56/60] Revert "HACKING" This reverts commit 534235ea559247f7a4984174f57f809b90a93af4. --- include/pybind11/detail/internals.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 0a2b03c807..44d9f9e27a 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -421,7 +421,7 @@ inline PyThreadState *get_thread_state_unchecked() { /// We use this counter to figure out if there are or have been multiple subinterpreters active at /// any point. This must never decrease while any interpreter may be running in any thread! 
inline std::atomic &get_num_interpreters_seen() { - static std::atomic counter(2); // !FIXME!: hack for `get_num_interpreters_seen() > 1` + static std::atomic counter(0); return counter; } @@ -564,10 +564,6 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // !FIXME!: If the module is imported from subinterpreter before the main interpreter, - // (get_num_interpreters_seen() == 1) will be true here. - // This is causing bugs like: - // https://github.com/pybind/pybind11/pull/5933#discussion_r2638712777 if (get_num_interpreters_seen() > 1) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is From 3a2c34a99ab7129fba3c5e54b000569cf0e474ef Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 20:25:07 +0800 Subject: [PATCH 57/60] Try fix concurrency --- include/pybind11/detail/internals.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 44d9f9e27a..7afe6878dc 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -564,7 +564,7 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - if (get_num_interpreters_seen() > 1) { + if (get_num_interpreters_seen() > 1 || last_istate_tls() == nullptr) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is // slow, so we use the current PyThreadState to check if it is necessary. @@ -590,11 +590,8 @@ class internals_pp_manager { /// Drop all the references we're currently holding. 
void unref() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - if (get_num_interpreters_seen() > 1) { - last_istate_tls() = nullptr; - internals_p_tls() = nullptr; - return; - } + last_istate_tls() = nullptr; + internals_p_tls() = nullptr; #endif internals_singleton_pp_ = nullptr; } @@ -606,7 +603,6 @@ class internals_pp_manager { // this could be called without an active interpreter, just use what was cached if (!tstate || tstate->interp == last_istate_tls()) { auto tpp = internals_p_tls(); - delete tpp; } unref(); From 99a095d89af372401722964b2a0d547677766feb Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 22 Dec 2025 21:06:46 +0800 Subject: [PATCH 58/60] Test even harder --- .../test_multiple_interpreters.py | 228 +++++++++++------- 1 file changed, 137 insertions(+), 91 deletions(-) diff --git a/tests/test_multiple_interpreters/test_multiple_interpreters.py b/tests/test_multiple_interpreters/test_multiple_interpreters.py index 7e70324b6e..66363b142a 100644 --- a/tests/test_multiple_interpreters/test_multiple_interpreters.py +++ b/tests/test_multiple_interpreters/test_multiple_interpreters.py @@ -208,19 +208,13 @@ def test_dependent_subinterpreters(): assert res1 != m.internals_at(), "internals should differ from main interpreter" -@pytest.mark.skipif( - sys.platform.startswith("emscripten"), reason="Requires loadable modules" -) -@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") -def test_import_module_with_singleton_per_interpreter(): - """Tests that a singleton storing Python objects works correctly per-interpreter""" +PREAMBLE_CODE = textwrap.dedent( + """ + import sys - sys.path.append(".") - - from concurrent import interpreters + sys.path.append('.') - code = textwrap.dedent( - """ + def test(): import collections import mod_per_interpreter_gil_with_singleton as m @@ -239,14 +233,26 @@ def test_import_module_with_singleton_per_interpreter(): assert hasattr(m, 'MyGlobalError') assert hasattr(m, 'MyLocalError') assert 
hasattr(m, 'MyEnum') - """ - ).strip() + """ +).lstrip() + + +@pytest.mark.skipif( + sys.platform.startswith("emscripten"), reason="Requires loadable modules" +) +@pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") +def test_import_module_with_singleton_per_interpreter(): + """Tests that a singleton storing Python objects works correctly per-interpreter""" + from concurrent import interpreters + + code = f"{PREAMBLE_CODE.strip()}\n\ntest()\n" with contextlib.closing(interpreters.create()) as interp: interp.exec(code) -def check_script_success(code: str, *, rerun: int = 5) -> None: +def check_script_success_in_subprocess(code: str, *, rerun: int = 8) -> None: + """Runs the given code in a subprocess.""" code = textwrap.dedent(code).strip() try: for _ in range(rerun): # run flakily failing test multiple times @@ -266,47 +272,55 @@ def check_script_success(code: str, *, rerun: int = 5) -> None: sys.platform.startswith("emscripten"), reason="Requires loadable modules" ) @pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") -def test_import_in_subinterpreter_before_after(): +def test_import_in_subinterpreter_after_main(): """Tests that importing a module in a subinterpreter after the main interpreter works correctly""" - check_script_success( - """ - import contextlib - import gc - import sys - from concurrent import interpreters - - sys.path.append('.') - - def test(): - import collections - import mod_per_interpreter_gil_with_singleton as m - - objects = m.get_objects_in_singleton() - assert objects == [ - type(None), - tuple, - list, - dict, - collections.OrderedDict, - collections.defaultdict, - collections.deque, - ] - - assert hasattr(m, 'MyClass') - assert hasattr(m, 'MyGlobalError') - assert hasattr(m, 'MyLocalError') - assert hasattr(m, 'MyEnum') - - test() - - interp = None - with contextlib.closing(interpreters.create()) as interp: - interp.call(test) - del interp + 
check_script_success_in_subprocess( + PREAMBLE_CODE + + textwrap.dedent( + """ + import contextlib + import gc + from concurrent import interpreters + + test() + + interp = None + with contextlib.closing(interpreters.create()) as interp: + interp.call(test) + + del interp + for _ in range(5): + gc.collect() + """ + ) + ) - for _ in range(5): - gc.collect() - """ + check_script_success_in_subprocess( + PREAMBLE_CODE + + textwrap.dedent( + """ + import contextlib + import gc + import random + from concurrent import interpreters + + test() + + interps = interp = None + with contextlib.ExitStack() as stack: + interps = [ + stack.enter_context(contextlib.closing(interpreters.create())) + for _ in range(4) + ] + random.shuffle(interps) + for interp in interps: + interp.call(test) + + del interps, interp, stack + for _ in range(5): + gc.collect() + """ + ) ) @@ -316,45 +330,77 @@ def test(): @pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") def test_import_in_subinterpreter_before_main(): """Tests that importing a module in a subinterpreter before the main interpreter works correctly""" - check_script_success( - """ - import contextlib - import gc - import sys - from concurrent import interpreters - - sys.path.append('.') - - def test(): - import collections - import mod_per_interpreter_gil_with_singleton as m - - objects = m.get_objects_in_singleton() - assert objects == [ - type(None), - tuple, - list, - dict, - collections.OrderedDict, - collections.defaultdict, - collections.deque, - ] - - assert hasattr(m, 'MyClass') - assert hasattr(m, 'MyGlobalError') - assert hasattr(m, 'MyLocalError') - assert hasattr(m, 'MyEnum') - - interp = None - with contextlib.closing(interpreters.create()) as interp: - interp.call(test) - del interp + check_script_success_in_subprocess( + PREAMBLE_CODE + + textwrap.dedent( + """ + import contextlib + import gc + from concurrent import interpreters + + interp = None + with 
contextlib.closing(interpreters.create()) as interp: + interp.call(test) + + test() + + del interp + for _ in range(5): + gc.collect() + """ + ) + ) - test() + check_script_success_in_subprocess( + PREAMBLE_CODE + + textwrap.dedent( + """ + import contextlib + import gc + from concurrent import interpreters + + interps = interp = None + with contextlib.ExitStack() as stack: + interps = [ + stack.enter_context(contextlib.closing(interpreters.create())) + for _ in range(4) + ] + for interp in interps: + interp.call(test) + + test() + + del interps, interp, stack + for _ in range(5): + gc.collect() + """ + ) + ) - for _ in range(5): - gc.collect() - """ + check_script_success_in_subprocess( + PREAMBLE_CODE + + textwrap.dedent( + """ + import contextlib + import gc + from concurrent import interpreters + + interps = interp = None + with contextlib.ExitStack() as stack: + interps = [ + stack.enter_context(contextlib.closing(interpreters.create())) + for _ in range(4) + ] + for interp in interps: + interp.call(test) + + test() + + del interps, interp, stack + for _ in range(5): + gc.collect() + """ + ) ) @@ -364,7 +410,7 @@ def test(): @pytest.mark.skipif(not CONCURRENT_INTERPRETERS_SUPPORT, reason="Requires 3.14.0b3+") def test_import_in_subinterpreter_concurrently(): """Tests that importing a module in multiple subinterpreters concurrently works correctly""" - check_script_success( + check_script_success_in_subprocess( """ import gc import sys From 7daecd75c847205d147d7308fcb184afb01584d7 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 23 Dec 2025 11:24:23 +0800 Subject: [PATCH 59/60] Reorg code to avoid duplicates --- include/pybind11/gil_safe_call_once.h | 104 ++++++++---------- .../test_multiple_interpreters.py | 8 +- 2 files changed, 51 insertions(+), 61 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index bfce9fa55a..f915a8576c 100644 --- a/include/pybind11/gil_safe_call_once.h +++ 
b/include/pybind11/gil_safe_call_once.h @@ -181,8 +181,8 @@ class gil_safe_call_once_and_store { // There can be multiple threads going through here. storage_type *value = nullptr; { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. + gil_scoped_acquire gil_acq; // Restore lock ordering. + // This function is thread-safe under free-threading. value = get_or_create_storage_in_state_dict(); } assert(value != nullptr); @@ -259,72 +259,56 @@ class gil_safe_call_once_and_store { // First, try to get existing storage (fast path). { capsule_obj = detail::dict_getitemstring(state_dict.ptr(), key.c_str()); - if (capsule_obj != nullptr) { - // Storage already exists, get the storage pointer from the existing capsule. - void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); - if (!raw_ptr) { - raise_from(PyExc_SystemError, - "pybind11::gil_safe_call_once_and_store::" - "get_or_create_storage_in_state_dict() FAILED " - "(get existing)"); - throw error_already_set(); - } - return static_cast(raw_ptr); - } - if (PyErr_Occurred()) { + if (capsule_obj == nullptr && PyErr_Occurred()) { throw error_already_set(); } + // Fallthrough if capsule_obj is nullptr (not found). + // Otherwise, we have found the existing storage (most common case) and return it + // below. } - // Storage doesn't exist yet, create a new one。 - // Use unique_ptr for exception safety: if capsule creation throws, - // the storage is automatically deleted. - auto storage_ptr = std::unique_ptr(new storage_type{}); - // Create capsule with destructor to clean up when the interpreter shuts down. - auto new_capsule = capsule( - storage_ptr.get(), [](void *ptr) -> void { delete static_cast(ptr); }); - - // Use `PyDict_SetDefault` for atomic test-and-set: - // - If key doesn't exist, inserts our capsule and returns it. - // - If key exists (another thread inserted first), returns the existing value. 
- // This is thread-safe because `PyDict_SetDefault` will hold a lock on the dict. - // - // NOTE: Here we use `dict_setdefaultstring` instead of `dict_setdefaultstringref` because - // the capsule is kept alive until interpreter shutdown, so we do not need to handle incref - // and decref here. - capsule_obj - = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); if (capsule_obj == nullptr) { - throw error_already_set(); - } - - // Check whether we inserted our new capsule or another thread did. - if (capsule_obj == new_capsule.ptr()) { - // We successfully inserted our new capsule, release ownership from unique_ptr. - return storage_ptr.release(); - } - // Another thread already inserted a capsule, use theirs and discard ours. - { - // Disable the destructor of our unused capsule to prevent double-free: - // unique_ptr will clean up the storage on function exit, and the capsule should not. - if (PyCapsule_SetDestructor(new_capsule.ptr(), nullptr) != 0) { - raise_from(PyExc_SystemError, - "pybind11::gil_safe_call_once_and_store::" - "get_or_create_storage_in_state_dict() FAILED " - "(clear destructor of unused capsule)"); - throw error_already_set(); - } - // Get the storage pointer from the existing capsule. - void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); - if (!raw_ptr) { - raise_from(PyExc_SystemError, - "pybind11::gil_safe_call_once_and_store::" - "get_or_create_storage_in_state_dict() FAILED " - "(get after setdefault)"); + // Storage doesn't exist yet, create a new one. + // Use unique_ptr for exception safety: if capsule creation throws, the storage is + // automatically deleted. + auto storage_ptr = std::unique_ptr(new storage_type{}); + // Create capsule with destructor to clean up when the interpreter shuts down. + auto new_capsule = capsule(storage_ptr.get(), [](void *ptr) -> void { + delete static_cast(ptr); + }); + // At this point, the capsule object is created successfully. 
+ // Release the unique_ptr and let the capsule object own the storage to avoid + // double-free. + storage_ptr.reset(); + + // Use `PyDict_SetDefault` for atomic test-and-set: + // - If key doesn't exist, inserts our capsule and returns it. + // - If key exists (another thread inserted first), returns the existing value. + // This is thread-safe because `PyDict_SetDefault` will hold a lock on the dict. + // + // NOTE: Here we use `dict_setdefaultstring` instead of `dict_setdefaultstringref` + // because the capsule is kept alive until interpreter shutdown, so we do not need to + // handle incref and decref here. + capsule_obj + = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); + if (capsule_obj == nullptr) { throw error_already_set(); } - return static_cast(raw_ptr); + // - If key already existed, our `new_capsule` is not inserted, it will be destructed + // when going out of scope here, which will also free the storage. + // - Otherwise, our `new_capsule` is now in the dict, and it owns the storage and the + // state dict will incref it. + } + + // Get the storage pointer from the capsule. + void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_storage_in_state_dict() FAILED"); + throw error_already_set(); } + return static_cast(raw_ptr); } // No storage needed when subinterpreter support is enabled. 
diff --git a/tests/test_multiple_interpreters/test_multiple_interpreters.py b/tests/test_multiple_interpreters/test_multiple_interpreters.py index 66363b142a..c8b578ac8e 100644 --- a/tests/test_multiple_interpreters/test_multiple_interpreters.py +++ b/tests/test_multiple_interpreters/test_multiple_interpreters.py @@ -264,7 +264,13 @@ def check_script_success_in_subprocess(code: str, *, rerun: int = 8) -> None: ) except subprocess.CalledProcessError as ex: pytest.fail( - f"Subprocess failed with exit code {ex.returncode}.\nOutput:\n{ex.output}" + f"Subprocess failed with exit code {ex.returncode}.\n\n" + f"Code:\n" + f"```python\n" + f"{code}\n" + f"```\n\n" + f"Output:\n" + f"{ex.output}" ) From cd950dc40b43306310cafd11eda03b4b4bad0a70 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 23 Dec 2025 11:36:44 +0800 Subject: [PATCH 60/60] Fix unique_ptr::reset -> unique_ptr::release --- include/pybind11/gil_safe_call_once.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index f915a8576c..7e76690ca6 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -279,16 +279,16 @@ class gil_safe_call_once_and_store { // At this point, the capsule object is created successfully. // Release the unique_ptr and let the capsule object own the storage to avoid // double-free. - storage_ptr.reset(); + (void) storage_ptr.release(); // Use `PyDict_SetDefault` for atomic test-and-set: // - If key doesn't exist, inserts our capsule and returns it. // - If key exists (another thread inserted first), returns the existing value. // This is thread-safe because `PyDict_SetDefault` will hold a lock on the dict. // - // NOTE: Here we use `dict_setdefaultstring` instead of `dict_setdefaultstringref` - // because the capsule is kept alive until interpreter shutdown, so we do not need to - // handle incref and decref here. 
+ // NOTE: Here we use `PyDict_SetDefault` instead of `PyDict_SetDefaultRef` because the + // capsule is kept alive until interpreter shutdown, so we do not need to handle incref + // and decref here. capsule_obj = detail::dict_setdefaultstring(state_dict.ptr(), key.c_str(), new_capsule.ptr()); if (capsule_obj == nullptr) {