From 20d33da87641d6463711f86ef39e98f0f4623f51 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 5 Dec 2025 15:22:00 -0800 Subject: [PATCH 01/38] Centralize context management code - Create _context.pxd with Context class declaration - Move get_primary_context from _device.pyx to _context.pyx - Add Cython-level context functions: get_current_context, set_current_context, get_stream_context - Update _device.pyx to use context module functions - Update _stream.pyx to use context module functions - Remove push_context and pop_context (unused, replaced with direct CUDA API calls) - Reorganize _context.pyx according to style guide (principal class first) --- cuda_core/cuda/core/experimental/_context.pxd | 22 ++++ cuda_core/cuda/core/experimental/_context.pyx | 109 ++++++++++++++++-- cuda_core/cuda/core/experimental/_device.pyx | 64 +++++----- cuda_core/cuda/core/experimental/_stream.pyx | 21 ++-- 4 files changed, 163 insertions(+), 53 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_context.pxd diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd new file mode 100644 index 0000000000..0e0df83831 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + +cdef class Context: + """Cython declaration for Context class. + + This class provides access to CUDA contexts. Context objects cannot be + instantiated directly - use factory methods or Device/Stream APIs. + """ + + cdef: + readonly object _handle + int _device_id + +# Cython-level context operations +cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL +cdef cydriver.CUcontext get_current_context() except?NULL nogil +cdef void set_current_context(cydriver.CUcontext ctx) except * +cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index f9858c1710..244109584d 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,19 +4,23 @@ from dataclasses import dataclass -from cuda.core.experimental._utils.cuda_utils import driver +import threading +from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils import driver, CUDAError +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN -@dataclass -class ContextOptions: - pass # TODO + +__all__ = ['Context', 'ContextOptions'] cdef class Context: + """CUDA context wrapper. - cdef: - readonly object _handle - int _device_id + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. + """ def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") @@ -36,3 +40,94 @@ cdef class Context: def __hash__(self) -> int: return hash(int(self._handle)) + + +@dataclass +class ContextOptions: + """Options for context creation. + + Currently unused, reserved for future use. + """ + pass # TODO + + +cdef cydriver.CUcontext get_current_context() except?NULL nogil: + """Get the current CUDA context. 
+
+    Returns
+    -------
+    CUcontext
+        Current context handle, or NULL if no context is bound
+    """
+    cdef cydriver.CUcontext ctx = NULL
+    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    return ctx
+
+
+cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
+    """Get the primary context for a device.
+
+    Uses thread-local storage to cache primary contexts per device.
+    The primary context is lazily initialized on first access.
+
+    Parameters
+    ----------
+    dev_id : int
+        Device ID
+
+    Returns
+    -------
+    CUcontext
+        Primary context handle for the device, or NULL on error
+    """
+    cdef int total = 0
+    cdef cydriver.CUcontext ctx
+
+    try:
+        primary_ctxs = _tls.primary_ctxs
+    except AttributeError:
+        # Initialize primary context cache
+        with nogil:
+            HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
+        primary_ctxs = _tls.primary_ctxs = [0] * total
+
+    ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
+    if ctx == NULL:
+        with nogil:
+            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
+        primary_ctxs[dev_id] = <uintptr_t>(ctx)
+    return ctx
+
+
+cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil:
+    """Get the context associated with a stream.
+
+    Parameters
+    ----------
+    stream : CUstream
+        Stream handle
+
+    Returns
+    -------
+    CUcontext
+        Context handle associated with the stream, or NULL on error
+    """
+    cdef cydriver.CUcontext ctx = NULL
+    HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx))
+    return ctx
+
+
+cdef void set_current_context(cydriver.CUcontext ctx) except *:
+    """Set the current CUDA context.
+
+    Parameters
+    ----------
+    ctx : CUcontext
+        Context handle to set as current
+    """
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxSetCurrent(ctx))
+
+
+# Thread-local storage for primary context cache
+_tls = threading.local()
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index cd802943a5..c6efa21ac7 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -11,7 +11,13 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 import threading
 from typing import Optional, TYPE_CHECKING, Union
 
-from cuda.core.experimental._context import Context, ContextOptions
+from cuda.core.experimental._context cimport (
+    Context,
+    get_primary_context,
+    get_current_context,
+    set_current_context,
+)
+from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
@@ -908,20 +914,6 @@ cdef class DeviceProperties:
     )
 
 
-cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL:
-    try:
-        primary_ctxs = _tls.primary_ctxs
-    except AttributeError:
-        total = len(_tls.devices)
-        primary_ctxs = _tls.primary_ctxs = [0] * total
-    cdef cydriver.CUcontext ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
-    if ctx == NULL:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
-        primary_ctxs[dev_id] = <uintptr_t>(ctx)
-    return ctx
-
-
 class Device:
     """Represent a GPU and act as an entry point for cuda.core features.
 
@@ -973,8 +965,7 @@ class Device:
             if err == cydriver.CUresult.CUDA_SUCCESS:
                 device_id = int(dev)
             elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT:
-                with nogil:
-                    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+                ctx = get_current_context()
                 assert <void*>(ctx) == NULL
                 device_id = 0  # cudart behavior
             else:
@@ -1010,19 +1001,6 @@ class Device:
                 f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
             )
 
-    def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext:
-        cdef cydriver.CUcontext ctx
-        cdef cydriver.CUdevice dev
-        cdef cydriver.CUdevice this_dev = self._id
-        with nogil:
-            HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
-        if check_consistency:
-            HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-            if dev != this_dev:
-                raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return driver.CUcontext(<uintptr_t>(ctx))
 
     @property
     def device_id(self) -> int:
@@ -1136,8 +1114,16 @@ class Device:
         """
         self._check_context_initialized()
-        ctx = self._get_current_context(check_consistency=True)
-        return Context._from_ctx(ctx, self._id)
+        cdef cydriver.CUcontext ctx
+        cdef cydriver.CUdevice dev
+        with nogil:
+            ctx = get_current_context()
+            HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+        if ctx == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        if dev != self._id:
+            raise CUDAError("Internal error (current device is not equal to Device.device_id)")
+        return Context._from_ctx(<uintptr_t>(ctx), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1241,6 +1227,7 @@ class Device:
             )
             # prev_ctx is the previous context
             curr_ctx = <cydriver.CUcontext><uintptr_t>(ctx._handle)
+            prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
@@ -1249,9 +1236,8 @@ class Device:
             return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
         else:
             # use primary ctx
-            curr_ctx = _get_primary_context(self._id)
-            with nogil:
-                HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx))
+            curr_ctx = get_primary_context(self._id)
+            set_current_context(curr_ctx)
         self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
@@ -1324,8 +1310,12 @@ class Device:
         """
         self._check_context_initialized()
-        ctx = self._get_current_context()
-        return Event._init(self._id, ctx, options, True)
+        cdef cydriver.CUcontext ctx
+        with nogil:
+            ctx = get_current_context()
+        if ctx == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        return Event._init(self._id, <uintptr_t>(ctx), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
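[Note] The user-visible flow backed by these helpers is unchanged by this commit; a minimal
sketch of how the refactored entry points are exercised (assumes one CUDA-capable device;
`Device`, `set_current()`, and the `context` property are exactly as in the hunks above):

    from cuda.core.experimental import Device

    dev = Device(0)
    dev.set_current()           # binds the primary context via get_primary_context()
    ctx = dev.context           # wraps the current CUcontext in a Context object
    assert ctx == dev.context   # Context.__eq__ compares the underlying handle values
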
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index 87ec4a691a..440130f679 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -25,7 +25,11 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union
 if TYPE_CHECKING:
     import cuda.bindings
     from cuda.core.experimental._device import Device
-from cuda.core.experimental._context import Context
+from cuda.core.experimental._context cimport (
+    Context,
+    get_stream_context,
+    get_current_context,
+)
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
@@ -317,19 +321,18 @@ cdef class Stream:
 
     cdef int _get_context(self) except?-1 nogil:
         if self._ctx_handle == CU_CONTEXT_INVALID:
-            HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle)))
+            self._ctx_handle = get_stream_context(self._handle)
         return 0
 
     cdef int _get_device_and_context(self) except?-1:
         cdef cydriver.CUcontext curr_ctx
         if self._device_id == cydriver.CU_DEVICE_INVALID:
-            with nogil:
-                # Get the current context
-                HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx))
-                # Get the stream's context (self.ctx_handle is populated)
-                self._get_context()
-                # Get the stream's device (may require a context-switching dance)
-                self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
+            # Get the current context
+            curr_ctx = get_current_context()
+            # Get the stream's context (self.ctx_handle is populated)
+            self._get_context()
+            # Get the stream's device (may require a context-switching dance)
+            self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
         return 0
 
     @property
From 758f9f9aede579d59d93f74b069083c21f58244a Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Sat, 6 Dec 2025 08:13:17 -0800
Subject: [PATCH 02/38] Integrate resource handles into Context class

- Replace Context._handle (object) with ContextHandle (shared_ptr) resource
  handle
- Add handle property to Context returning driver.CUcontext
- Update Context._from_ctx to create ContextHandle using create_context_handle_ref()
- Update Context.__eq__ to compare actual CUcontext values (not shared_ptr
  addresses)
- Update Context.__hash__ to include type(self) and handle value with NULL
  safety
- Update _device.pyx to use ctx._resource_handle.get()[0] for direct access
- Update _graph.py to use context.handle property
- Add _cpp/resource_handles.{hpp,cpp} with the C++ handle implementation,
  using a default deleter (simplifies code)
- Update .gitignore to allow *_impl.cpp files
- Fix all test files to use context.handle instead of context._handle
---
 .gitattributes                                |  3 +
 .gitignore                                    |  1 +
 cuda_core/build_hooks.py                      | 27 ++++++++-
 cuda_core/cuda/core/experimental/_context.pxd |  3 +-
 cuda_core/cuda/core/experimental/_context.pyx | 28 +++++++--
 .../experimental/_cpp/resource_handles.cpp    | 58 +++++++++++++++++++
 .../experimental/_cpp/resource_handles.hpp    | 30 ++++++++++
 cuda_core/cuda/core/experimental/_device.pyx  |  2 +-
 cuda_core/cuda/core/experimental/_graph.py    |  2 +-
 .../core/experimental/_resource_handles.pxd   | 15 +++++
 .../core/experimental/_resource_handles.pyx   |  6 ++
 cuda_core/tests/test_comparable.py            |  2 +-
 cuda_core/tests/test_hashable.py              |  4 +-
 cuda_core/tests/test_stream.py                |  2 +-
 14 files changed, 170 insertions(+), 13 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
 create mode 100644 
cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp create mode 100644 cuda_core/cuda/core/experimental/_resource_handles.pxd create mode 100644 cuda_core/cuda/core/experimental/_resource_handles.pyx diff --git a/.gitattributes b/.gitattributes index 6a3ee0fe72..68492b15c9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,9 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# Exception: headers we own (cuda_core C++ implementation) +cuda_core/cuda/core/experimental/_cpp/*.h -binary text diff +cuda_core/cuda/core/experimental/_cpp/*.hpp -binary text diff # git should not convert line endings in PNG files *.png binary *.svg binary diff --git a/.gitignore b/.gitignore index 1455b1dfc2..685fa231f8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__/ .pytest_cache/ .benchmarks/ *.cpp +!*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp cache_driver diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e38f5676df..7ebb67cef0 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -84,11 +84,34 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + def get_sources(mod_name): + """Get source files for a module, including any .cpp files.""" + sources = [f"cuda/core/experimental/{mod_name}.pyx"] + + # Add module-specific .cpp file from _cpp/ directory if it exists + cpp_file = f"cuda/core/experimental/_cpp/{mod_name.lstrip('_')}.cpp" + if os.path.exists(cpp_file): + sources.append(cpp_file) + + # Modules that use resource handles need to link against _resource_handles_impl.cpp + # This includes _context, _stream, _event, etc. as they adopt handle-based management + resource_handle_users = {"_context", "_stream", "_event"} + if mod_name in resource_handle_users: + resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" + if os.path.exists(resource_handles_impl): + sources.append(resource_handles_impl) + + return sources + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + sources=get_sources(mod), + include_dirs=[ + "cuda/core/experimental/include", + "cuda/core/experimental/_cpp", + ] + + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", ) for mod in module_names diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 0e0df83831..875d95d283 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Context: """Cython declaration for Context class. 
@@ -12,7 +13,7 @@ cdef class Context:
     """
 
     cdef:
-        readonly object _handle
+        ContextHandle _resource_handle
         int _device_id
 
 # Cython-level context operations
diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 244109584d..81daa2aa09 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -8,7 +8,8 @@ import threading
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._utils.cuda_utils import driver, CUDAError
+from cuda.core.experimental._resource_handles cimport create_context_handle_ref
+from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 
@@ -28,18 +29,37 @@ cdef class Context:
     @classmethod
     def _from_ctx(cls, handle: driver.CUcontext, int device_id):
         cdef Context ctx = Context.__new__(Context)
-        ctx._handle = handle
+        # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle
+        cdef cydriver.CUcontext c_ctx = <cydriver.CUcontext><uintptr_t>int(handle)
+        ctx._resource_handle = create_context_handle_ref(c_ctx)
         ctx._device_id = device_id
         return ctx
 
+    @property
+    def handle(self):
+        """Return the underlying CUcontext handle."""
+        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        if ptr != NULL:
+            return driver.CUcontext(<uintptr_t>(ptr[0]))
+        return None
+
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
-        return int(self._handle) == int(_other._handle)
+        # Compare the actual CUcontext values, not the shared_ptr objects
+        # (aliasing constructor creates different addresses even for same CUcontext)
+        cdef const cydriver.CUcontext* ptr1 = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr2 = _other._resource_handle.get()
+        if ptr1 == NULL or ptr2 == NULL:
+            return ptr1 == ptr2
+        return ptr1[0] == ptr2[0]
 
     def __hash__(self) -> int:
-        return hash(int(self._handle))
+        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        if ptr == NULL:
+            return hash((type(self), 0))
+        return hash((type(self), <uintptr_t>(ptr[0])))
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
new file mode 100644
index 0000000000..93a31551e0
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "resource_handles.hpp"
+#include <memory>
+
+namespace cuda_core {
+
+// ============================================================================
+// Context Handles
+// ============================================================================
+
+ContextHandle create_context_handle_ref(CUcontext ctx) {
+    // Creates a non-owning handle that references an existing context
+    // (e.g., primary context managed by CUDA driver)
+
+    // Allocate the box containing the context resource
+    ContextBox* box = new ContextBox();
+    box->resource = ctx;
+
+    // Use default deleter - it will delete the box, but not touch the CUcontext
+    // CUcontext lifetime is managed externally (e.g., by CUDA driver)
+    std::shared_ptr<ContextBox> box_ptr(box);
+
+    // Use aliasing constructor to create handle that exposes only CUcontext
+    // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
+    return ContextHandle(box_ptr, &box_ptr->resource);
+}
+
+// TODO: Future owning handle for cuCtxCreate/cuCtxDestroy
+// ContextHandle create_context_handle(CUdevice dev, unsigned int flags) { ... }
+
+// ============================================================================
+// Stream Handles
+// ============================================================================
+
+// TODO: Implement StreamHandle create_stream_handle(...) when Stream gets handle support
+
+// ============================================================================
+// Event Handles
+// ============================================================================
+
+// TODO: Implement EventHandle create_event_handle(...) when Event gets handle support
+
+// ============================================================================
+// Device Pointer Handles
+// ============================================================================
+
+// TODO: Implement DevicePtrHandle create_deviceptr_handle(...) when DevicePtr gets handle support
+
+// ============================================================================
+// Memory Pool Handles
+// ============================================================================
+
+// TODO: Implement MemPoolHandle create_mempool_handle(...) when MemPool gets handle support
+
+} // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
new file mode 100644
index 0000000000..949d9f1289
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cuda.h>
+#include <memory>
+
+namespace cuda_core {
+
+// Forward declarations
+struct ContextBox;
+
+// Handle type aliases - expose only the raw CUDA resource
+using ContextHandle = std::shared_ptr<const CUcontext>;
+
+// Internal box structure for Context
+// This holds the resource and any dependencies needed for lifetime management
+struct ContextBox {
+    CUcontext resource;
+    // Context doesn't depend on other CUDA resources, but we keep the structure
+    // extensible for future needs
+};
+
+// Function to create a non-owning context handle (references existing context)
+// This will be implemented in the .cpp file
+ContextHandle create_context_handle_ref(CUcontext ctx);
+
+} // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index c6efa21ac7..ceea0c6a5f 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -1226,7 +1226,7 @@ class Device:
                     f" id={ctx._id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = <cydriver.CUcontext><uintptr_t>(ctx._handle)
+            curr_ctx = ctx._resource_handle.get()[0]
             prev_ctx = NULL
diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/experimental/_graph.py
index a82bd70f55..5dbcc80c04 100644
--- a/cuda_core/cuda/core/experimental/_graph.py
+++ b/cuda_core/cuda/core/experimental/_graph.py
@@ -453,7 +453,7 @@ def __cuda_stream__(self) -> tuple[int, int]:
         return self.stream.__cuda_stream__()
 
     def _get_conditional_context(self) -> driver.CUcontext:
-        return self._mnff.stream.context._handle
+        return self._mnff.stream.context.handle
 
     def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditionalHandle:
         """Creates a conditional handle for the graph builder.
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
new file mode 100644
index 0000000000..8ada7d8cd5
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libcpp.memory cimport shared_ptr
+
+from cuda.bindings cimport cydriver
+
+# Declare the C++ namespace and types
+cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
+    # Handle type - shared_ptr to const CUcontext
+    ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle
+
+    # Function to create a non-owning context handle (references existing context)
+    ContextHandle create_context_handle_ref(cydriver.CUcontext ctx)
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
new file mode 100644
index 0000000000..564f2abac3
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module exists to compile _cpp/resource_handles.cpp into a shared library.
+# The C++ code provides handle management for CUDA contexts and other resources. 
diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..72b3caa2ba 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -136,7 +136,7 @@ class MyContext(Context): context = stream.context # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) + my_context = MyContext._from_ctx(context.handle, device.device_id) assert type(my_context) is Context, "_from_ctx returns Context, not subclass" assert type(my_context) is not MyContext diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..751a88250c 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -174,7 +174,7 @@ class MyContext(Context): context = stream.context # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) + my_context = MyContext._from_ctx(context.handle, device.device_id) assert type(my_context) is Context, "_from_ctx returns Context type" # Same handle -> same hash @@ -221,7 +221,7 @@ class MyContext(Context): # Test Context: always returns base type from _from_ctx ctx = device.context - my_ctx = MyContext._from_ctx(ctx._handle, device.device_id) + my_ctx = MyContext._from_ctx(ctx.handle, device.device_id) assert ctx == my_ctx, "Equal contexts with same handle" assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..ef83c09d05 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -74,7 +74,7 @@ def test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None - assert context._handle is not None + assert context.handle is not None def test_stream_from_foreign_stream(init_cuda): From 31880d4e973b0a7a254b1289eda178f6e7b85f0e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 11:41:18 -0800 Subject: [PATCH 03/38] Refactor context helpers to use ContextHandle and TLS cache - switch context helper APIs to return ContextHandle instead of raw CUcontext - add TLS wrapper for primary context caching using handles - update device/stream code to consume ContextHandle-based helpers - expose create_context_handle_ref as nogil-safe in the pxd --- cuda_core/cuda/core/experimental/_context.pxd | 12 +-- cuda_core/cuda/core/experimental/_context.pyx | 93 ++++++++++++------- cuda_core/cuda/core/experimental/_device.pyx | 41 ++++---- .../core/experimental/_resource_handles.pxd | 3 +- cuda_core/cuda/core/experimental/_stream.pyx | 13 ++- 5 files changed, 100 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 875d95d283..d4dfa08085 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -13,11 +13,11 @@ cdef class Context: """ cdef: - ContextHandle _resource_handle + ContextHandle _h_context int _device_id -# Cython-level context operations -cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL -cdef cydriver.CUcontext get_current_context() except?NULL nogil -cdef void set_current_context(cydriver.CUcontext ctx) except * -cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil +# Cython-level context operations 
(handle-centric API)
-cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL
-cdef cydriver.CUcontext get_current_context() except?NULL nogil
-cdef void set_current_context(cydriver.CUcontext ctx) except *
-cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil
+cdef ContextHandle get_primary_context(int dev_id) except *
+cdef ContextHandle get_current_context() except * nogil
+cdef void set_current_context(ContextHandle h_context) except * nogil
+cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil
diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 81daa2aa09..70870d3b1c 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -16,6 +16,21 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 __all__ = ['Context', 'ContextOptions']
 
+# Lightweight Python wrapper for ContextHandle (for caching in TLS)
+cdef class _ContextHandleWrapper:
+    """Internal wrapper to store ContextHandle in Python containers."""
+    cdef ContextHandle h_context
+
+    def __cinit__(self):
+        pass
+
+    @staticmethod
+    cdef _ContextHandleWrapper create(ContextHandle h_context):
+        cdef _ContextHandleWrapper wrapper = _ContextHandleWrapper.__new__(_ContextHandleWrapper)
+        wrapper.h_context = h_context
+        return wrapper
+
+
 cdef class Context:
     """CUDA context wrapper.
 
@@ -31,14 +46,14 @@ cdef class Context:
         cdef Context ctx = Context.__new__(Context)
         # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle
         cdef cydriver.CUcontext c_ctx = <cydriver.CUcontext><uintptr_t>int(handle)
-        ctx._resource_handle = create_context_handle_ref(c_ctx)
+        ctx._h_context = create_context_handle_ref(c_ctx)
         ctx._device_id = device_id
         return ctx
 
     @property
     def handle(self):
         """Return the underlying CUcontext handle."""
-        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr = self._h_context.get()
         if ptr != NULL:
             return driver.CUcontext(<uintptr_t>(ptr[0]))
         return None
@@ -49,14 +64,14 @@ cdef class Context:
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
         # Compare the actual CUcontext values, not the shared_ptr objects
         # (aliasing constructor creates different addresses even for same CUcontext)
-        cdef const cydriver.CUcontext* ptr1 = self._resource_handle.get()
-        cdef const cydriver.CUcontext* ptr2 = _other._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr1 = self._h_context.get()
+        cdef const cydriver.CUcontext* ptr2 = _other._h_context.get()
         if ptr1 == NULL or ptr2 == NULL:
             return ptr1 == ptr2
         return ptr1[0] == ptr2[0]
 
     def __hash__(self) -> int:
-        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr = self._h_context.get()
         if ptr == NULL:
             return hash((type(self), 0))
         return hash((type(self), <uintptr_t>(ptr[0])))
@@ -71,23 +86,25 @@ class ContextOptions:
     pass  # TODO
 
 
-cdef cydriver.CUcontext get_current_context() except?NULL nogil:
-    """Get the current CUDA context.
+cdef ContextHandle get_current_context() except * nogil:
+    """Get handle to the current CUDA context.
 
     Returns
     -------
-    CUcontext
-        Current context handle, or NULL if no context is bound
+    ContextHandle
+        Handle to current context, or empty handle if no context is bound
     """
     cdef cydriver.CUcontext ctx = NULL
     HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    return ctx
+    if ctx == NULL:
+        return ContextHandle()
+    return create_context_handle_ref(ctx)
 
 
-cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
-    """Get the primary context for a device.
+cdef ContextHandle get_primary_context(int dev_id) except *:
+    """Get handle to the primary context for a device.
 
-    Uses thread-local storage to cache primary contexts per device.
+    Uses thread-local storage to cache primary context handles per device.
     The primary context is lazily initialized on first access.
 
     Parameters
@@ -97,30 +114,40 @@ cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
     dev_id : int
         Device ID
 
     Returns
     -------
-    CUcontext
-        Primary context handle for the device, or NULL on error
+    ContextHandle
+        Handle to primary context for the device
     """
     cdef int total = 0
    cdef cydriver.CUcontext ctx
+    cdef ContextHandle h_context
+    cdef _ContextHandleWrapper wrapper
 
+    # Check TLS cache
     try:
         primary_ctxs = _tls.primary_ctxs
     except AttributeError:
         # Initialize primary context cache
         with nogil:
             HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
-        primary_ctxs = _tls.primary_ctxs = [0] * total
+        primary_ctxs = _tls.primary_ctxs = [None] * total
 
-    ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
-    if ctx == NULL:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
-        primary_ctxs[dev_id] = <uintptr_t>(ctx)
-    return ctx
+    wrapper = primary_ctxs[dev_id]
+    if wrapper is not None:
+        return wrapper.h_context
+
+    # Acquire primary context (release GIL for driver call)
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
+    h_context = create_context_handle_ref(ctx)
 
+    # Cache the handle (wrapped in Python object)
+    _tls.primary_ctxs[dev_id] = _ContextHandleWrapper.create(h_context)
 
-cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil:
-    """Get the context associated with a stream.
+    return h_context
+
+
+cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
+    """Get handle to the context associated with a stream.
 
     Parameters
     ----------
@@ -129,24 +156,26 @@ cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL
 
     Returns
     -------
-    CUcontext
-        Context handle associated with the stream, or NULL on error
+    ContextHandle
+        Handle to context associated with the stream
     """
     cdef cydriver.CUcontext ctx = NULL
     HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx))
-    return ctx
+    return create_context_handle_ref(ctx)
 
 
-cdef void set_current_context(cydriver.CUcontext ctx) except *:
-    """Set the current CUDA context.
+cdef void set_current_context(ContextHandle h_context) except * nogil:
+    """Set the current CUDA context from a handle.
 
     Parameters
     ----------
-    ctx : CUcontext
+    h_context : ContextHandle
         Context handle to set as current
     """
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxSetCurrent(ctx))
+    if h_context.get() == NULL:
+        with gil:
+            raise ValueError("Cannot set NULL context as current")
+    HANDLE_RETURN(cydriver.cuCtxSetCurrent(h_context.get()[0]))
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index ceea0c6a5f..bf38a5515e 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -19,6 +19,7 @@ from cuda.core.experimental._context cimport (
 )
 from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
+from cuda.core.experimental._resource_handles cimport ContextHandle
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
 from cuda.core.experimental._utils.clear_error_support import assert_type
@@ -959,14 +960,15 @@ class Device:
     # important: creating a Device instance does not initialize the GPU!
         cdef cydriver.CUdevice dev
         cdef cydriver.CUcontext ctx
+        cdef ContextHandle h_context
         if device_id is None:
            with nogil:
                 err = cydriver.cuCtxGetDevice(&dev)
             if err == cydriver.CUresult.CUDA_SUCCESS:
                 device_id = int(dev)
             elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT:
-                ctx = get_current_context()
-                assert <void*>(ctx) == NULL
+                h_context = get_current_context()
+                assert h_context.get() == NULL
                 device_id = 0  # cudart behavior
             else:
                 HANDLE_RETURN(err)
@@ -1114,16 +1116,16 @@ class Device:
         """
         self._check_context_initialized()
-        cdef cydriver.CUcontext ctx
+        cdef ContextHandle h_context
         cdef cydriver.CUdevice dev
+        h_context = get_current_context()
+        if h_context.get() == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
         with nogil:
-            ctx = get_current_context()
             HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
         if dev != self._id:
             raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return Context._from_ctx(<uintptr_t>(ctx), self._id)
+        return Context._from_ctx(<uintptr_t>(h_context.get()[0]), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1215,18 +1217,19 @@ class Device:
         >>> # ... do work on device 0 ...
         """
-        cdef cydriver.CUcontext prev_ctx
-        cdef cydriver.CUcontext curr_ctx
+        cdef ContextHandle h_context
+        cdef cydriver.CUcontext prev_ctx, curr_ctx
+
         if ctx is not None:
             # TODO: revisit once Context is cythonized
             assert_type(ctx, Context)
-            if ctx._id != self._id:
+            if ctx._device_id != self._id:
                 raise RuntimeError(
                     "the provided context was created on the device with"
-                    f" id={ctx._id}, which is different from the target id={self._id}"
+                    f" id={ctx._device_id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = ctx._resource_handle.get()[0]
+            curr_ctx = ctx._h_context.get()[0]
             prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
             return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
         else:
             # use primary ctx
-            curr_ctx = get_primary_context(self._id)
-            set_current_context(curr_ctx)
+            h_context = get_primary_context(self._id)
+            with nogil:
+                set_current_context(h_context)
         self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
@@ -1310,12 +1314,11 @@ class Device:
         """
         self._check_context_initialized()
-        cdef cydriver.CUcontext ctx
-        with nogil:
-            ctx = get_current_context()
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, <uintptr_t>(ctx), options, True)
+        cdef ContextHandle h_context
+        h_context = get_current_context()
+        if h_context.get() == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        return Event._init(self._id, <uintptr_t>(h_context.get()[0]), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
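[Note] The consumer-side pattern this commit establishes: an empty handle
(h.get() == NULL) means "no context", and the handle is dereferenced only after that
check. A hypothetical helper distilling the checks repeated in _device.pyx above
(_require_current_context is illustrative only; cimports as in that module):

    cdef cydriver.CUcontext _require_current_context() except?NULL:
        cdef ContextHandle h = get_current_context()
        if h.get() == NULL:
            raise CUDAError("No context is bound to the calling CPU thread.")
        return h.get()[0]  # safe to dereference: checked for emptiness above
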
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 8ada7d8cd5..08a4cc01c6 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -12,4 +12,5 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle # Function to create a non-owning context handle (references existing context) - ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) + # This is nogil-safe (pure C++, no Python dependencies) + ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 440130f679..de87f378df 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -31,6 +31,7 @@ from cuda.core.experimental._context cimport ( get_current_context, ) from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._resource_handles cimport ContextHandle from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._utils.cuda_utils import ( driver, @@ -321,16 +322,20 @@ cdef class Stream: cdef int _get_context(self) except?-1 nogil: if self._ctx_handle == CU_CONTEXT_INVALID: - self._ctx_handle = get_stream_context(self._handle) + h_context = get_stream_context(self._handle) + self._ctx_handle = h_context.get()[0] return 0 cdef int _get_device_and_context(self) except?-1: + cdef ContextHandle h_curr_context cdef cydriver.CUcontext curr_ctx if self._device_id == cydriver.CU_DEVICE_INVALID: # Get the current context - curr_ctx = get_current_context() - # Get the stream's context (self.ctx_handle is populated) - self._get_context() + with nogil: + h_curr_context = get_current_context() + curr_ctx = h_curr_context.get()[0] if h_curr_context.get() != NULL else 0 + # Get the stream's context (self._ctx_handle is populated) + self._get_context() # Get the stream's device (may require a context-switching dance) self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) return 0 From c173e3d1f4ed764a4495042fecbdc1649b0d914a Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 12:18:06 -0800 Subject: [PATCH 04/38] Add helper functions to extract raw resources from ContextHandle Introduce three helper functions for ContextHandle resource extraction: - native(h): Returns cydriver.CUcontext for use with cydriver API calls - py(h): Returns driver.CUcontext for use with Python driver API - intptr(h): Returns uintptr_t for internal APIs expecting integer addresses These helpers replace direct h_context.get()[0] calls, providing: - Cleaner, more semantic code - Consistent extraction pattern across all handle types - Type-safe conversions with clear intent Implementation details: - native() and intptr() are inline nogil functions in .pxd - py() requires Python module access, implemented in new _resource_handles.pyx - Updated all call sites in _context, _device, and _stream modules --- cuda_core/cuda/core/experimental/_context.pyx | 4 +-- cuda_core/cuda/core/experimental/_device.pyx | 8 +++--- .../core/experimental/_resource_handles.pxd | 25 +++++++++++++++++++ .../core/experimental/_resource_handles.pyx | 14 +++++++++-- cuda_core/cuda/core/experimental/_stream.pyx | 6 ++--- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 70870d3b1c..2b6ecad8e0 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -8,7 +8,7 @@ import threading
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._resource_handles cimport create_context_handle_ref
+from cuda.core.experimental._resource_handles cimport create_context_handle_ref, native
 from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
@@ -175,7 +175,7 @@ cdef void set_current_context(ContextHandle h_context) except * nogil:
     if h_context.get() == NULL:
         with gil:
             raise ValueError("Cannot set NULL context as current")
-    HANDLE_RETURN(cydriver.cuCtxSetCurrent(h_context.get()[0]))
+    HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context)))
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index bf38a5515e..2e3e5a1e43 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -19,7 +19,7 @@ from cuda.core.experimental._context cimport (
 )
 from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, intptr, native
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
 from cuda.core.experimental._utils.clear_error_support import assert_type
@@ -1125,7 +1125,7 @@ class Device:
             HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
         if dev != self._id:
             raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return Context._from_ctx(<uintptr_t>(h_context.get()[0]), self._id)
+        return Context._from_ctx(intptr(h_context), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1229,7 +1229,7 @@ class Device:
                     f" id={ctx._device_id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = ctx._h_context.get()[0]
+            curr_ctx = native(ctx._h_context)
             prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
@@ -1318,7 +1318,7 @@ class Device:
         h_context = get_current_context()
         if h_context.get() == NULL:
             raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, <uintptr_t>(h_context.get()[0]), options, True)
+        return Event._init(self._id, intptr(h_context), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
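[Note] Rule of thumb for the three extraction helpers introduced here: native() feeds
cydriver calls, py() feeds the Python-level driver API, and intptr() feeds internal
APIs keyed on integer addresses. A hypothetical sketch (the name _show_extractions is
illustrative only; cimports as in the modules above):

    cdef void _show_extractions(ContextHandle h):
        cdef cydriver.CUcontext raw = native(h)  # raw C handle for cydriver calls
        obj = py(h)                              # driver.CUcontext Python object
        cdef uintptr_t addr = intptr(h)          # integer address for internal APIs
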
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 08a4cc01c6..4ec0e6b62c 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from libc.stdint cimport uintptr_t
 from libcpp.memory cimport shared_ptr
 
 from cuda.bindings cimport cydriver
@@ -14,3 +15,27 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     # Function to create a non-owning context handle (references existing context)
     # This is nogil-safe (pure C++, no Python dependencies)
     ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil
+
+
+# ============================================================================
+# Helper functions to extract raw resources from handles
+# ============================================================================
+
+cdef inline cydriver.CUcontext native(ContextHandle h) nogil:
+    """Extract the native C type (cydriver.CUcontext) from the handle.
+
+    This is for use with cydriver API calls that expect the raw C type.
+    """
+    return h.get()[0]
+
+
+# Python conversion function (implemented in .pyx due to Python module dependency)
+cdef object py(ContextHandle h)
+
+
+cdef inline uintptr_t intptr(ContextHandle h) nogil:
+    """Extract the handle as a uintptr_t integer address.
+
+    This is for use with internal APIs that expect integer addresses.
+    """
+    return <uintptr_t>(h.get()[0])
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
index 564f2abac3..b150228762 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pyx
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -2,5 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# This module exists to compile _cpp/resource_handles.cpp into a shared library.
-# The C++ code provides handle management for CUDA contexts and other resources.
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings import driver
+from cuda.core.experimental._resource_handles cimport ContextHandle
+
+
+cdef object py(ContextHandle h):
+    """Convert the handle to a Python driver.CUcontext object.
+
+    This is for use with driver (Python) API calls or returning to Python code. 
+    """
+    return driver.CUcontext(<uintptr_t>(h.get()[0]))
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index de87f378df..ec6436af53 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -31,7 +31,7 @@ from cuda.core.experimental._context cimport (
     Context,
     get_stream_context,
     get_current_context,
 )
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, native
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
     driver,
@@ -323,7 +323,7 @@ cdef class Stream:
     cdef int _get_context(self) except?-1 nogil:
         if self._ctx_handle == CU_CONTEXT_INVALID:
             h_context = get_stream_context(self._handle)
-            self._ctx_handle = h_context.get()[0]
+            self._ctx_handle = native(h_context)
         return 0
 
     cdef int _get_device_and_context(self) except?-1:
@@ -333,7 +333,7 @@ cdef class Stream:
             # Get the current context
             with nogil:
                 h_curr_context = get_current_context()
-                curr_ctx = h_curr_context.get()[0] if h_curr_context.get() != NULL else 0
+                curr_ctx = native(h_curr_context) if h_curr_context.get() != NULL else 0
             # Get the stream's context (self._ctx_handle is populated)
             self._get_context()
             # Get the stream's device (may require a context-switching dance)
             self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
From 4357f580a553053f0c77c4d40c35bd06b5a07407 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Mon, 8 Dec 2025 16:31:56 -0800
Subject: [PATCH 05/38] Refactor context acquisition to C++ handle helpers

Move get_primary_context/get_current_context into C++ with thread-local
caching and conditional GIL release; move the ContextBox definition into
the C++ implementation file; update Cython modules and build hooks to link
handle users (including _device) against resource_handles and libcuda.
---
 cuda_core/build_hooks.py                      |  29 ++++-
 cuda_core/cuda/core/experimental/_context.pxd |   3 +-
 cuda_core/cuda/core/experimental/_context.pyx |  87 ++-------------
 .../experimental/_cpp/resource_handles.cpp    | 107 ++++++++++++++++--
 .../experimental/_cpp/resource_handles.hpp    |  26 ++---
 cuda_core/cuda/core/experimental/_device.pyx  |  10 +-
 .../core/experimental/_resource_handles.pxd   |   4 +
 cuda_core/cuda/core/experimental/_stream.pyx  |   7 +-
 8 files changed, 163 insertions(+), 110 deletions(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 7ebb67cef0..9b85973e17 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -84,6 +84,18 @@ def get_cuda_paths():
         print("CUDA paths:", CUDA_PATH)
         return CUDA_PATH
 
+    @functools.cache
+    def get_cuda_library_dirs():
+        """Return library search paths for CUDA driver runtime."""
+
+        libdirs = []
+        for root in get_cuda_paths():
+            for subdir in ("lib64", "lib"):
+                candidate = os.path.join(root, subdir)
+                if os.path.isdir(candidate):
+                    libdirs.append(candidate)
+        return libdirs
+
     def get_sources(mod_name):
         """Get source files for a module, including any .cpp files."""
         sources = [f"cuda/core/experimental/{mod_name}.pyx"]
@@ -95,7 +107,11 @@ def get_sources(mod_name):
 
         # Modules that use resource handles need to link against _resource_handles_impl.cpp
-        # This includes _context, _stream, _event, etc. as they adopt handle-based management
-        resource_handle_users = {"_context", "_stream", "_event"}
+        # Modules that call into the handle helpers implemented in
+        # `_resource_handles_impl.cpp` must link against that translation unit. 
+ # Keep this in sync with any module that cimports `get_primary_context` + # or other helpers defined there. + resource_handle_users = {"_context", "_stream", "_event", "_device"} if mod_name in resource_handle_users: resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" if os.path.exists(resource_handles_impl): @@ -103,6 +119,16 @@ def get_sources(mod_name): return sources + def get_extension_kwargs(mod_name): + """Return Extension kwargs (libraries, library_dirs) per module.""" + + resource_handle_users = {"_context", "_stream", "_event", "_device"} + kwargs = {} + if mod_name in resource_handle_users: + kwargs["libraries"] = ["cuda"] + kwargs["library_dirs"] = get_cuda_library_dirs() + return kwargs + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", @@ -113,6 +139,7 @@ def get_sources(mod_name): ] + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", + **get_extension_kwargs(mod), ) for mod in module_names ) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index d4dfa08085..01552c055e 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -17,7 +17,6 @@ cdef class Context: int _device_id # Cython-level context operations (handle-centric API) -cdef ContextHandle get_primary_context(int dev_id) except * -cdef ContextHandle get_current_context() except * nogil +# Note: get_primary_context and get_current_context are now pure C++ (imported from _resource_handles) cdef void set_current_context(ContextHandle h_context) except * nogil cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 2b6ecad8e0..6532eecadf 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,11 +4,15 @@ from dataclasses import dataclass -import threading from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport create_context_handle_ref, native +from cuda.core.experimental._resource_handles cimport ( + create_context_handle_ref, + get_primary_context, + get_current_context, + native, +) from cuda.core.experimental._utils.cuda_utils import driver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -16,21 +20,6 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] -# Lightweight Python wrapper for ContextHandle (for caching in TLS) -cdef class _ContextHandleWrapper: - """Internal wrapper to store ContextHandle in Python containers.""" - cdef ContextHandle h_context - - def __cinit__(self): - pass - - @staticmethod - cdef _ContextHandleWrapper create(ContextHandle h_context): - cdef _ContextHandleWrapper wrapper = _ContextHandleWrapper.__new__(_ContextHandleWrapper) - wrapper.h_context = h_context - return wrapper - - cdef class Context: """CUDA context wrapper. @@ -86,64 +75,8 @@ class ContextOptions: pass # TODO -cdef ContextHandle get_current_context() except * nogil: - """Get handle to the current CUDA context. 
-
-    Returns
-    -------
-    ContextHandle
-        Handle to current context, or empty handle if no context is bound
-    """
-    cdef cydriver.CUcontext ctx = NULL
-    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        return ContextHandle()
-    return create_context_handle_ref(ctx)
+# get_current_context() and get_primary_context() are now pure C++ functions
+# imported from _resource_handles (with thread-local caching in C++)
 
 
 cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 93a31551e0..f584304496 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -2,34 +2,121 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include <Python.h>
+
 #include "resource_handles.hpp"
 #include <memory>
+#include <vector>
 
 namespace cuda_core {
 
+// Helper to release the GIL while calling into the CUDA driver.
+// This guard is *conditional*: if the caller already dropped the GIL,
+// we avoid calling PyEval_SaveThread (which requires holding the GIL).
+// It also handles the case where Python is finalizing and GIL operations
+// are no longer safe.
+class GILReleaseGuard {
+public:
+    GILReleaseGuard() : tstate_(nullptr), released_(false) {
+        // Don't try to manipulate GIL if Python is finalizing
+        if (!Py_IsInitialized() || _Py_IsFinalizing()) {
+            return;
+        }
+        // PyGILState_Check() returns 1 if the GIL is held by this thread.
+        if (PyGILState_Check()) {
+            tstate_ = PyEval_SaveThread();
+            released_ = true;
+        }
+    }
+
+    ~GILReleaseGuard() {
+        if (released_) {
+            PyEval_RestoreThread(tstate_);
+        }
+    }
+
+private:
+    PyThreadState* tstate_;
+    bool released_;
+};
+
+// Internal box structure for Context (kept private to this TU)
+struct ContextBox {
+    CUcontext resource;
+};
 
 ContextHandle create_context_handle_ref(CUcontext ctx) {
     // Creates a non-owning handle that references an existing context
     // (e.g., primary context managed by CUDA driver)
 
-    // Allocate the box containing the context resource
-    ContextBox* box = new ContextBox();
-    box->resource = ctx;
-
     // Use default deleter - it will delete the box, but not touch the CUcontext
     // CUcontext lifetime is managed externally (e.g., by CUDA driver)
-    std::shared_ptr<ContextBox> box_ptr(box);
+    auto box = new ContextBox{ctx};
+    auto box_ptr = std::shared_ptr<ContextBox>(box);
 
     // Use aliasing constructor to create handle that exposes only CUcontext
     // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
     return ContextHandle(box_ptr, &box_ptr->resource);
 }
 
-// TODO: Future owning handle for cuCtxCreate/cuCtxDestroy
-// ContextHandle create_context_handle(CUdevice dev, unsigned int flags) { ... }
+// Thread-local storage for primary context cache
+// Each thread maintains its own cache of primary contexts indexed by device ID
+thread_local std::vector<ContextHandle> primary_context_cache;
+
+ContextHandle get_primary_context(int dev_id) noexcept {
+    // Check thread-local cache
+    if (static_cast<size_t>(dev_id) < primary_context_cache.size()) {
+        auto cached = primary_context_cache[dev_id];
+        if (cached.get() != nullptr) {
+            return cached;  // Cache hit
+        }
+    }
+
+    // Cache miss - acquire primary context from driver
+    CUcontext ctx;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuDevicePrimaryCtxRetain(&ctx, dev_id);
+    }
+    if (err != CUDA_SUCCESS) {
+        // Return empty handle on error (caller must check)
+        return ContextHandle();
+    }
+
+    // Create owning handle with custom deleter that releases the primary context
+    auto box = new ContextBox{ctx};
+    auto box_ptr = std::shared_ptr<ContextBox>(box, [dev_id](const ContextBox* b) {
+        GILReleaseGuard gil;
+        cuDevicePrimaryCtxRelease(dev_id);
+        delete b;
+    });
+
+    // Use aliasing constructor to expose only CUcontext
+    auto h_context = ContextHandle(box_ptr, &box_ptr->resource);
+
+    // Resize cache if needed
+    if (static_cast<size_t>(dev_id) >= primary_context_cache.size()) {
+        primary_context_cache.resize(dev_id + 1);
+    }
+    primary_context_cache[dev_id] = h_context;
+
+    return h_context;
+}
+
+ContextHandle get_current_context() noexcept {
+    CUcontext ctx = nullptr;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuCtxGetCurrent(&ctx);
+    }
+    if (err != CUDA_SUCCESS || ctx == nullptr) {
+        // Return empty handle if no current context or error
+        return ContextHandle();
+    }
+    return create_context_handle_ref(ctx);
+}
 
 // ============================================================================
 // Stream Handles
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 949d9f1289..7d6892ccef 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -9,22 +9,22 @@
 
 namespace cuda_core {
 
-// Forward declarations
-struct ContextBox;
-
 // Handle type aliases - expose only the raw CUDA resource
 using ContextHandle = std::shared_ptr<const CUcontext>;
 
-// Internal box structure for Context
-// This holds the resource and 
any dependencies needed for lifetime management -struct ContextBox { - CUcontext resource; - // Context doesn't depend on other CUDA resources, but we keep the structure - // extensible for future needs -}; - -// Function to create a non-owning context handle (references existing context) -// This will be implemented in the .cpp file +// Function to create a non-owning context handle (references existing context). ContextHandle create_context_handle_ref(CUcontext ctx); +// ============================================================================ +// Context acquisition functions (pure C++, nogil-safe) +// ============================================================================ + +// Get handle to the primary context for a device (with thread-local caching) +// Returns empty handle on error (caller must check) +ContextHandle get_primary_context(int dev_id) noexcept; + +// Get handle to the current CUDA context +// Returns empty handle if no context is current (caller must check) +ContextHandle get_current_context() noexcept; + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 2e3e5a1e43..8d62b1de51 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -13,13 +13,17 @@ from typing import Optional, TYPE_CHECKING, Union from cuda.core.experimental._context cimport ( Context, - get_primary_context, - get_current_context, set_current_context, ) from cuda.core.experimental._context import ContextOptions from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._resource_handles cimport ContextHandle, intptr, native +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + get_primary_context, + get_current_context, + intptr, + native, +) from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions from cuda.core.experimental._utils.clear_error_support import assert_type diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 4ec0e6b62c..99e97f977e 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -16,6 +16,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # This is nogil-safe (pure C++, no Python dependencies) ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil + # Context acquisition functions (pure C++, nogil-safe with thread-local caching) + ContextHandle get_primary_context(int dev_id) nogil + ContextHandle get_current_context() nogil + # ============================================================================ # Helper functions to extract raw resources from handles diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index ec6436af53..8e775c56be 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -28,10 +28,13 @@ if TYPE_CHECKING: from cuda.core.experimental._context cimport ( Context, get_stream_context, - get_current_context, ) from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._resource_handles cimport ContextHandle, native +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + get_current_context, + native, +) from cuda.core.experimental._graph import 
GraphBuilder from cuda.core.experimental._utils.cuda_utils import ( driver, From 625a86f077f0d95a737b920a943654dec4caabef Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 17:47:11 -0800 Subject: [PATCH 06/38] Fix link error by loading _resource_handles with RTLD_GLOBAL The C++ implementation in _resource_handles_impl.cpp is now compiled only into _resource_handles.so. Other modules that depend on these symbols (_context, _device, etc.) resolve them at runtime via the global symbol table. This ensures a single shared instance of thread-local caches and avoids setuptools issues with shared source files across extensions. --- cuda_core/build_hooks.py | 19 ++++--------- cuda_core/cuda/core/experimental/__init__.py | 17 +++++++++++ .../experimental/_cpp/resource_handles.cpp | 28 +++---------------- .../experimental/_cpp/resource_handles.hpp | 2 +- 4 files changed, 27 insertions(+), 39 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 9b85973e17..a20407488e 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -105,26 +105,17 @@ def get_sources(mod_name): if os.path.exists(cpp_file): sources.append(cpp_file) - # Modules that use resource handles need to link against _resource_handles_impl.cpp - # This includes _context, _stream, _event, etc. as they adopt handle-based management - # Modules that call into the handle helpers implemented in - # `_resource_handles_impl.cpp` must link against that translation unit. - # Keep this in sync with any module that cimports `get_primary_context` - # or other helpers defined there. - resource_handle_users = {"_context", "_stream", "_event", "_device"} - if mod_name in resource_handle_users: - resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" - if os.path.exists(resource_handles_impl): - sources.append(resource_handles_impl) - return sources def get_extension_kwargs(mod_name): """Return Extension kwargs (libraries, library_dirs) per module.""" - resource_handle_users = {"_context", "_stream", "_event", "_device"} + # Modules that use CUDA driver APIs need to link against libcuda + # _resource_handles: contains the C++ implementation that calls CUDA driver + # _context, _stream, _event, _device: use resource handles and may call CUDA driver directly + cuda_users = {"_resource_handles", "_context", "_stream", "_event", "_device"} kwargs = {} - if mod_name in resource_handle_users: + if mod_name in cuda_users: kwargs["libraries"] = ["cuda"] kwargs["library_dirs"] = get_cuda_library_dirs() return kwargs diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 826ea70b97..ac0627222b 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -12,6 +12,23 @@ raise ImportError("cuda.bindings 12.x or 13.x must be installed") import importlib +import sys + +# Load _resource_handles with RTLD_GLOBAL so its C++ symbols are available +# to other extension modules that depend on them (_context, _device, etc.) +# This must happen before importing any dependent modules. 
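+# On Windows there is no dlopen, so sys.setdlopenflags/os.RTLD_GLOBAL are
+# unavailable; a plain import is used there instead.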
+if sys.platform != "win32":
+    import os
+
+    _old_dlopen_flags = sys.getdlopenflags()
+    sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_NOW)
+    try:
+        from cuda.core.experimental import _resource_handles  # noqa: F401
+    finally:
+        sys.setdlopenflags(_old_dlopen_flags)
+    del _old_dlopen_flags, os
+else:
+    from cuda.core.experimental import _resource_handles  # noqa: F401
 
 subdir = f"cu{cuda_major}"
 try:
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index f584304496..6ee1088937 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -35,6 +35,10 @@ class GILReleaseGuard {
         }
     }
 
+    // Non-copyable, non-movable
+    GILReleaseGuard(const GILReleaseGuard&) = delete;
+    GILReleaseGuard& operator=(const GILReleaseGuard&) = delete;
+
 private:
     PyThreadState* tstate_;
     bool released_;
@@ -118,28 +122,4 @@ ContextHandle get_current_context() noexcept {
     return create_context_handle_ref(ctx);
 }
 
-// ============================================================================
-// Stream Handles
-// ============================================================================
-
-// TODO: Implement StreamH create_stream_handle(...) when Stream gets handle support
-
-// ============================================================================
-// Event Handles
-// ============================================================================
-
-// TODO: Implement EventH create_event_handle(...) when Event gets handle support
-
-// ============================================================================
-// Device Pointer Handles
-// ============================================================================
-
-// TODO: Implement DevicePtrH create_deviceptr_handle(...) when DevicePtr gets handle support
-
-// ============================================================================
-// Memory Pool Handles
-// ============================================================================
-
-// TODO: Implement MemPoolH create_mempool_handle(...) when MemPool gets handle support
-
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 7d6892ccef..54e7c3ba39 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -4,8 +4,8 @@
 
 #pragma once
 
-#include <memory>
 #include <cuda.h>
+#include <memory>
 
 namespace cuda_core {
 
From c0cbacd21c89b3ea08b68500f6802f0c5c6b5eb4 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Dec 2025 10:55:57 -0800
Subject: [PATCH 07/38] Move helper functions to C++ for overloading support

Move native(), intptr(), and py() from Cython inline functions to inline
C++ functions in resource_handles.hpp. This enables function overloading
when additional handle types (e.g., StreamHandle) are added.
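For illustration, a minimal sketch of the overload set this enables once a
second handle type exists (StreamHandle arrives in a later patch and is shown
here only to motivate the design):

    inline CUcontext native(const ContextHandle& h) noexcept { return h ? *h : nullptr; }
    inline CUstream native(const StreamHandle& h) noexcept { return h ? *h : nullptr; }
    // Cython callers just write native(h); C++ overload resolution selects
    // the right implementation at compile time.

The helpers being moved: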
- native(): extract raw CUDA handle from ContextHandle
- intptr(): extract handle as uintptr_t for Python interop
- py(): convert handle to Python driver wrapper object
---
 .../experimental/_cpp/resource_handles.hpp    | 33 +++++++++++++++++++
 .../core/experimental/_resource_handles.pxd   | 30 ++++++-----------
 .../core/experimental/_resource_handles.pyx   | 15 ++-------
 3 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 54e7c3ba39..7c0bf2ec63 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -4,7 +4,9 @@
 
 #pragma once
 
+#include <Python.h>
 #include <cuda.h>
+#include <cstdint>
 #include <memory>
 
 namespace cuda_core {
@@ -27,4 +29,35 @@ ContextHandle get_primary_context(int dev_id) noexcept;
 // Returns empty handle if no context is current (caller must check)
 ContextHandle get_current_context() noexcept;
 
+// ============================================================================
+// Helper functions to extract raw resources from handles
+// These are defined as inline C++ functions to support overloading when
+// additional handle types (e.g., StreamHandle) are added.
+// ============================================================================
+
+// native() - extract the raw CUDA handle
+inline CUcontext native(const ContextHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
+// intptr() - extract handle as uintptr_t for Python interop
+inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
+    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+}
+
+// py() - convert handle to Python driver wrapper object
+// Returns new reference. Caller must hold GIL.
+inline PyObject* py(const ContextHandle& h) {
+    static PyObject* cls = nullptr;
+    if (!cls) {
+        PyObject* mod = PyImport_ImportModule("cuda.bindings.driver");
+        if (!mod) return nullptr;
+        cls = PyObject_GetAttrString(mod, "CUcontext");
+        Py_DECREF(mod);
+        if (!cls) return nullptr;
+    }
+    std::uintptr_t val = h ? reinterpret_cast<std::uintptr_t>(*h) : 0;
+    return PyObject_CallFunction(cls, "K", val);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 99e97f977e..b65ee676a8 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -20,26 +20,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ContextHandle get_primary_context(int dev_id) nogil
     ContextHandle get_current_context() nogil
 
+    # ========================================================================
+    # Helper functions to extract raw resources from handles
+    # Defined in C++ to support overloading when additional handle types are added
+    # ========================================================================
 
-# ============================================================================
-# Helper functions to extract raw resources from handles
-# ============================================================================
+    # native() - extract the raw CUDA handle (nogil-safe)
+    cydriver.CUcontext native(ContextHandle h) nogil
 
-cdef inline cydriver.CUcontext native(ContextHandle h) nogil:
-    """Extract the native C type (cydriver.CUcontext) from the handle.
-
-    This is for use with cydriver API calls that expect the raw C type.
-    """
-    return h.get()[0]
-
-
-# Python conversion function (implemented in .pyx due to Python module dependency)
-cdef object py(ContextHandle h)
-
-
-cdef inline uintptr_t intptr(ContextHandle h) nogil:
-    """Extract the handle as a uintptr_t integer address.
-
-    This is for use with internal APIs that expect integer addresses.
-    """
-    return <uintptr_t>(h.get()[0])
+    # intptr() - extract handle as uintptr_t (nogil-safe)
+    uintptr_t intptr(ContextHandle h) nogil
+
+    # py() - convert handle to Python driver wrapper object (requires GIL)
+    object py(ContextHandle h)
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
index b150228762..6395f21e2a 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pyx
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -2,15 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport uintptr_t
-
-from cuda.bindings import driver
-from cuda.core.experimental._resource_handles cimport ContextHandle
-
-
-cdef object py(ContextHandle h):
-    """Convert the handle to a Python driver.CUcontext object.
-
-    This is for use with driver (Python) API calls or returning to Python code.
-    """
-    return driver.CUcontext(<uintptr_t>(h.get()[0]))
+# This module exists to compile _cpp/resource_handles.cpp into a shared library.
+# The helper functions (native, intptr, py) are implemented as inline C++ functions
+# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd.

From 4046023188b493813d49df24f1a56ffca4699d2c Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Dec 2025 11:19:29 -0800
Subject: [PATCH 08/38] Extend resource handle paradigm to Stream

Add StreamHandle for automatic stream lifetime management using the same
shared_ptr-based pattern established for ContextHandle.
Changes:
- Add StreamHandle type and create_stream_handle/create_stream_handle_ref
  functions in C++ with implementations in _resource_handles_impl.cpp
- Add overloaded native(), intptr(), py() helpers for StreamHandle
- Update Stream class to use _h_stream (StreamHandle) instead of raw _handle
- Owned streams are automatically destroyed when the last reference is released
- Borrowed streams (from __cuda_stream__ protocol) hold _owner reference
- Update memory resource files to use native(stream._h_stream)
- Simplify Context using intptr() and py() helpers
---
 cuda_core/cuda/core/experimental/_context.pyx | 28 ++------
 .../experimental/_cpp/resource_handles.cpp    | 41 +++++++++--
 .../experimental/_cpp/resource_handles.hpp    | 50 +++++++++++--
 .../_memory/_device_memory_resource.pyx       |  5 +-
 .../_memory/_graph_memory_resource.pyx        |  5 +-
 .../core/experimental/_resource_handles.pxd   | 26 +++++--
 cuda_core/cuda/core/experimental/_stream.pxd  |  3 +-
 cuda_core/cuda/core/experimental/_stream.pyx  | 70 ++++++++---------
 8 files changed, 148 insertions(+), 80 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 6532eecadf..94c8379875 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -9,9 +9,9 @@ from libc.stdint cimport uintptr_t
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._resource_handles cimport (
     create_context_handle_ref,
-    get_primary_context,
-    get_current_context,
+    intptr,
     native,
+    py,
 )
 from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
@@ -42,28 +42,18 @@ cdef class Context:
     @property
     def handle(self):
         """Return the underlying CUcontext handle."""
-        cdef const cydriver.CUcontext* ptr = self._h_context.get()
-        if ptr != NULL:
-            return driver.CUcontext(<uintptr_t>(ptr[0]))
-        return None
+        if self._h_context.get() == NULL:
+            return None
+        return py(self._h_context)
 
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
-        # Compare the actual CUcontext values, not the shared_ptr objects
-        # (aliasing constructor creates different addresses even for same CUcontext)
-        cdef const cydriver.CUcontext* ptr1 = self._h_context.get()
-        cdef const cydriver.CUcontext* ptr2 = _other._h_context.get()
-        if ptr1 == NULL or ptr2 == NULL:
-            return ptr1 == ptr2
-        return ptr1[0] == ptr2[0]
+        return intptr(self._h_context) == intptr(_other._h_context)
 
     def __hash__(self) -> int:
-        cdef const cydriver.CUcontext* ptr = self._h_context.get()
-        if ptr == NULL:
-            return hash((type(self), 0))
-        return hash((type(self), <uintptr_t>(ptr[0])))
+        return hash((type(self), intptr(self._h_context)))
 
 
 @dataclass
@@ -75,10 +65,6 @@ class ContextOptions:
     pass  # TODO
 
 
-# get_current_context() and get_primary_context() are now pure C++ functions
-# imported from _resource_handles (with thread-local caching in C++)
-
-
 cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
-    """Get handle to the context associated with a stream.
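A minimal C++ sketch of why the Context equality change above compares
intptr() values rather than shared_ptr identity: two non-owning handles to
the same context live in different boxes, so pointer identity cannot be used.

    ContextHandle a = create_context_handle_ref(ctx);
    ContextHandle b = create_context_handle_ref(ctx);
    assert(a.get() != b.get());      // distinct boxes, distinct addresses
    assert(intptr(a) == intptr(b));  // same underlying CUcontext value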
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 6ee1088937..a99a0c09e3 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -55,12 +55,11 @@ ContextHandle create_context_handle_ref(CUcontext ctx) {
 
     // Use default deleter - it will delete the box, but not touch the CUcontext
     // CUcontext lifetime is managed externally (e.g., by CUDA driver)
-    auto box = new ContextBox{ctx};
-    auto box_ptr = std::shared_ptr<ContextBox>(box);
+    auto box = std::shared_ptr<ContextBox>(new ContextBox{ctx});
 
     // Use aliasing constructor to create handle that exposes only CUcontext
-    // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
-    return ContextHandle(box_ptr, &box_ptr->resource);
+    // The handle's reference count is tied to box, but it points to &box->resource
+    return ContextHandle(box, &box->resource);
 }
 
 // Thread-local storage for primary context cache
@@ -89,15 +88,14 @@ ContextHandle get_primary_context(int dev_id) noexcept {
     }
 
     // Create owning handle with custom deleter that releases the primary context
-    auto box = new ContextBox{ctx};
-    auto box_ptr = std::shared_ptr<ContextBox>(box, [dev_id](const ContextBox* b) {
+    auto box = std::shared_ptr<ContextBox>(new ContextBox{ctx}, [dev_id](const ContextBox* b) {
         GILReleaseGuard gil;
         cuDevicePrimaryCtxRelease(dev_id);
         delete b;
     });
 
     // Use aliasing constructor to expose only CUcontext
-    auto h_context = ContextHandle(box_ptr, &box_ptr->resource);
+    auto h_context = ContextHandle(box, &box->resource);
 
     // Resize cache if needed
     if (static_cast<size_t>(dev_id) >= primary_context_cache.size()) {
@@ -122,4 +120,33 @@ ContextHandle get_current_context() noexcept {
     return create_context_handle_ref(ctx);
 }
 
+// ============================================================================
+// Stream Handles
+// ============================================================================
+
+// Internal box structure for Stream
+struct StreamBox {
+    CUstream resource;
+};
+
+StreamHandle create_stream_handle(CUstream stream) {
+    // Creates an owning handle - stream will be destroyed when handle is released
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
+        GILReleaseGuard gil;
+        cuStreamDestroy(b->resource);
+        delete b;
+    });
+
+    // Use aliasing constructor to expose only CUstream
+    return StreamHandle(box, &box->resource);
+}
+
+StreamHandle create_stream_handle_ref(CUstream stream) {
+    // Creates a non-owning handle - stream will NOT be destroyed
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream});
+
+    // Use aliasing constructor to expose only CUstream
+    return StreamHandle(box, &box->resource);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 7c0bf2ec63..f6f7d6fa79 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -11,16 +11,20 @@
 
 namespace cuda_core {
 
+// ============================================================================
 // Handle type aliases - expose only the raw CUDA resource
-using ContextHandle = std::shared_ptr<const CUcontext>;
+// ============================================================================
 
-// Function to create a non-owning context handle (references existing context).
-ContextHandle create_context_handle_ref(CUcontext ctx);
+using ContextHandle = std::shared_ptr<const CUcontext>;
+using StreamHandle = std::shared_ptr<const CUstream>;
 
 // ============================================================================
-// Context acquisition functions (pure C++, nogil-safe)
+// Context handle functions
 // ============================================================================
 
+// Function to create a non-owning context handle (references existing context).
+ContextHandle create_context_handle_ref(CUcontext ctx);
+
 // Get handle to the primary context for a device (with thread-local caching)
 // Returns empty handle on error (caller must check)
 ContextHandle get_primary_context(int dev_id) noexcept;
@@ -30,9 +34,20 @@ ContextHandle get_primary_context(int dev_id) noexcept;
 ContextHandle get_current_context() noexcept;
 
 // ============================================================================
-// Helper functions to extract raw resources from handles
-// These are defined as inline C++ functions to support overloading when
-// additional handle types (e.g., StreamHandle) are added.
+// Stream handle functions
+// ============================================================================
+
+// Create an owning stream handle. When the last reference is released,
+// cuStreamDestroy is called automatically.
+StreamHandle create_stream_handle(CUstream stream);
+
+// Create a non-owning stream handle (references existing stream).
+// Use for borrowed streams (from foreign code) or built-in streams.
+// The stream will NOT be destroyed when the handle is released.
+StreamHandle create_stream_handle_ref(CUstream stream);
+
+// ============================================================================
+// Overloaded helper functions to extract raw resources from handles
 // ============================================================================
 
 // native() - extract the raw CUDA handle
@@ -40,11 +55,19 @@ inline CUcontext native(const ContextHandle& h) noexcept {
     return h ? *h : nullptr;
 }
 
+inline CUstream native(const StreamHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
 // intptr() - extract handle as uintptr_t for Python interop
 inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
     return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
 }
 
+inline std::uintptr_t intptr(const StreamHandle& h) noexcept {
+    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+}
+
 // py() - convert handle to Python driver wrapper object
 // Returns new reference. Caller must hold GIL.
 inline PyObject* py(const ContextHandle& h) {
@@ -60,4 +83,17 @@ inline PyObject* py(const ContextHandle& h) {
     return PyObject_CallFunction(cls, "K", val);
 }
 
+inline PyObject* py(const StreamHandle& h) {
+    static PyObject* cls = nullptr;
+    if (!cls) {
+        PyObject* mod = PyImport_ImportModule("cuda.bindings.driver");
+        if (!mod) return nullptr;
+        cls = PyObject_GetAttrString(mod, "CUstream");
+        Py_DECREF(mod);
+        if (!cls) return nullptr;
+    }
+    std::uintptr_t val = h ? reinterpret_cast<std::uintptr_t>(*h) : 0;
+    return PyObject_CallFunction(cls, "K", val);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index ac18079a62..3bfdb59c07 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -13,6 +13,7 @@ from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR
+from cuda.core.experimental._resource_handles cimport native
 from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
@@ -552,7 +553,7 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
 
 
 cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
    cdef cydriver.CUdeviceptr devptr
    with nogil:
        check_not_capturing(s)
@@ -569,7 +570,7 @@ cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream s
 cdef inline void DMR_deallocate(
     DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
 ) noexcept:
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr = ptr
     cdef cydriver.CUresult r
     with nogil:
diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
index c65354b612..9a83c9007c 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
@@ -8,6 +8,7 @@ from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
+from cuda.core.experimental._resource_handles cimport native
 from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
@@ -186,7 +187,7 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil:
 
 
 cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr
     with nogil:
         check_capturing(s)
@@ -201,7 +202,7 @@ cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream
 
 
 cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr = ptr
     with nogil:
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index b65ee676a8..1cf7a31a8d 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -9,11 +9,12 @@
 
 # Declare the C++ namespace and types
 cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
-    # Handle type - shared_ptr to const CUcontext
+    # ========================================================================
+    # Context Handle
+    # ========================================================================
     ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle
 
     # Function to create a non-owning context handle (references existing context)
-    # This is nogil-safe (pure C++, no Python dependencies)
     ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil
 
     # Context acquisition functions (pure C++, nogil-safe with thread-local caching)
@@ -21,15 +22,28 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ContextHandle get_current_context() nogil
 
     # ========================================================================
-    # Helper functions to extract raw resources from handles
-    # Defined in C++ to support overloading when additional handle types are added
+    # Stream Handle
+    # ========================================================================
+    ctypedef shared_ptr[const cydriver.CUstream] StreamHandle
+
+    # Create an owning stream handle (stream destroyed when handle released)
+    StreamHandle create_stream_handle(cydriver.CUstream stream) nogil
+
+    # Create a non-owning stream handle (stream NOT destroyed when handle released)
+    StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil
+
+    # ========================================================================
+    # Overloaded helper functions (C++ handles dispatch by type)
     # ========================================================================
 
-    # native() - extract the raw CUDA handle (nogil-safe)
+    # native() - extract the raw CUDA handle
     cydriver.CUcontext native(ContextHandle h) nogil
+    cydriver.CUstream native(StreamHandle h) nogil
 
-    # intptr() - extract handle as uintptr_t (nogil-safe)
+    # intptr() - extract handle as uintptr_t for Python interop
     uintptr_t intptr(ContextHandle h) nogil
+    uintptr_t intptr(StreamHandle h) nogil
 
     # py() - convert handle to Python driver wrapper object (requires GIL)
     object py(ContextHandle h)
+    object py(StreamHandle h)
diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd
index edc25e2ba7..f386386e98 100644
--- a/cuda_core/cuda/core/experimental/_stream.pxd
+++ b/cuda_core/cuda/core/experimental/_stream.pxd
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.bindings cimport cydriver
+from cuda.core.experimental._resource_handles cimport StreamHandle
 
 
 cdef class Stream:
 
     cdef:
-        cydriver.CUstream _handle
+        StreamHandle _h_stream
         object _owner
         bint _builtin
         int _nonblocking
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index 8e775c56be..e8bd46f9b4 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -32,8 +32,13 @@ from cuda.core.experimental._context cimport (
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._resource_handles cimport (
     ContextHandle,
+    StreamHandle,
+    create_stream_handle,
+    create_stream_handle_ref,
     get_current_context,
+    intptr,
     native,
+    py,
 )
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
     driver,
@@ -87,7 +92,7 @@ cdef class Stream:
     using Stream.from_handle().
""" def __cinit__(self): - self._handle = (NULL) + # _h_stream is default-initialized to empty StreamHandle by C++ self._owner = None self._builtin = False self._nonblocking = -1 # lazy init'd @@ -104,26 +109,31 @@ cdef class Stream: @classmethod def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_LEGACY) + # Built-in streams are non-owning references + self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_LEGACY)) self._builtin = True return self @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_PER_THREAD) + # Built-in streams are non-owning references + self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_PER_THREAD)) self._builtin = True return self @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) + cdef cydriver.CUstream borrowed if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _handle_from_stream_protocol(obj) - # TODO: check if obj is created under the current context/device + # Borrowed stream from foreign object - non-owning reference + # Hold a reference to the owner to keep the underlying stream alive + borrowed = _handle_from_stream_protocol(obj) + self._h_stream = create_stream_handle_ref(borrowed) self._owner = obj return self @@ -147,46 +157,40 @@ cdef class Stream: cdef cydriver.CUstream s with nogil: HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - self._handle = s + # Owned stream - will be destroyed when handle is released + self._h_stream = create_stream_handle(s) self._nonblocking = int(nonblocking) self._priority = prio self._device_id = device_id if device_id is not None else self._device_id return self - def __dealloc__(self): - self.close() - cpdef close(self): """Destroy the stream. - Destroy the stream if we own it. Borrowed foreign stream - object will instead have their references released. - + Releases the stream handle. For owned streams, this destroys the + underlying CUDA stream. For borrowed streams, this just releases + the reference. """ - if self._owner is None: - if self._handle and not self._builtin: - with nogil: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = (NULL) + # Reset handle to empty - this decrements refcount and may trigger destruction + self._h_stream.reset() + self._owner = None def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, (self._handle)) + return (0, intptr(self._h_stream)) def __hash__(self) -> int: # Ensure context is initialized for hash consistency if self._ctx_handle == CU_CONTEXT_INVALID: self._get_context() - return hash(((self._ctx_handle), (self._handle))) + return hash(((self._ctx_handle), intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): return NotImplemented cdef Stream _other = other # Fast path: compare handles first - if (self._handle) != ((_other)._handle): + if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams if self._ctx_handle == CU_CONTEXT_INVALID: @@ -205,7 +209,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return driver.CUstream((self._handle)) + return py(self._h_stream) @property def is_nonblocking(self) -> bool: @@ -213,11 +217,8 @@ cdef class Stream: cdef unsigned int flags if self._nonblocking == -1: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) - if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: - self._nonblocking = True - else: - self._nonblocking = False + HANDLE_RETURN(cydriver.cuStreamGetFlags(native(self._h_stream), &flags)) + self._nonblocking = flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING return bool(self._nonblocking) @property @@ -226,14 +227,14 @@ cdef class Stream: cdef int prio if self._priority == INT32_MIN: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + HANDLE_RETURN(cydriver.cuStreamGetPriority(native(self._h_stream), &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" with nogil: - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuStreamSynchronize(native(self._h_stream))) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -268,7 +269,7 @@ cdef class Stream: cdef cydriver.CUevent e = ((event))._handle with nogil: - HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) + HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream))) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -288,7 +289,7 @@ cdef class Stream: event = (event_or_stream.handle) with nogil: # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0)) else: if isinstance(event_or_stream, Stream): stream = (event_or_stream.handle) @@ -305,7 +306,7 @@ cdef class Stream: HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) # TODO: support flags other than 0? 
- HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0)) HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property @@ -324,8 +325,9 @@ cdef class Stream: return Device((self._device_id)) cdef int _get_context(self) except?-1 nogil: + cdef ContextHandle h_context if self._ctx_handle == CU_CONTEXT_INVALID: - h_context = get_stream_context(self._handle) + h_context = get_stream_context(native(self._h_stream)) self._ctx_handle = native(h_context) return 0 From 39fbefcfed6c296c9d7637e3b5167331f202d424 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 12:50:31 -0800 Subject: [PATCH 09/38] Simplify Stream by moving more logic to C++ - Move stream creation to C++ (create_stream_handle now calls cuStreamCreateWithPriority internally) - Add get_legacy_stream/get_per_thread_stream for built-in streams - Add create_stream_handle_with_owner for borrowed streams that prevents Python owner from being GC'd via captured PyObject* - Add GILAcquireGuard (symmetric to GILReleaseGuard) for safely acquiring GIL in C++ destructors - Simplify Stream class: remove __cinit__, _owner, _builtin, _legacy_default, _per_thread_default - Use _from_handle as single initialization point for Stream - Remove obsolete subclassing tests for removed methods --- .../experimental/_cpp/resource_handles.cpp | 80 ++++++++++++++++++- .../experimental/_cpp/resource_handles.hpp | 17 +++- .../core/experimental/_resource_handles.pxd | 14 +++- cuda_core/cuda/core/experimental/_stream.pxd | 5 +- cuda_core/cuda/core/experimental/_stream.pyx | 72 ++++++++--------- 5 files changed, 142 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a99a0c09e3..a8b0fa60ef 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -44,6 +44,38 @@ class GILReleaseGuard { bool released_; }; +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +// Symmetric counterpart to GILReleaseGuard. +class GILAcquireGuard { +public: + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing + if (!Py_IsInitialized() || _Py_IsFinalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + + ~GILAcquireGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + + // Check if GIL was successfully acquired (for conditional operations) + bool acquired() const { return acquired_; } + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + // Internal box structure for Context (kept private to this TU) struct ContextBox { CUcontext resource; @@ -129,8 +161,19 @@ struct StreamBox { CUstream resource; }; -StreamHandle create_stream_handle(CUstream stream) { - // Creates an owning handle - stream will be destroyed when handle is released +StreamHandle create_stream_handle(unsigned int flags, int priority) { + // Creates an owning stream handle - calls cuStreamCreateWithPriority internally. + // Returns empty handle on error (caller must check). 
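+    // Errors are reported by returning an empty handle rather than by
+    // throwing, so this stays safe to call from nogil Cython code; the
+    // caller checks for an empty handle and raises on the Python side.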
+    CUstream stream;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuStreamCreateWithPriority(&stream, flags, priority);
+    }
+    if (err != CUDA_SUCCESS) {
+        return StreamHandle();
+    }
+
     auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
         GILReleaseGuard gil;
         cuStreamDestroy(b->resource);
         delete b;
     });
@@ -149,4 +192,37 @@ StreamHandle create_stream_handle_ref(CUstream stream) {
     return StreamHandle(box, &box->resource);
 }
 
+StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
+    // Creates a non-owning handle that prevents a Python owner from being GC'd.
+    // The owner's refcount is incremented here and decremented when handle is released.
+    Py_XINCREF(owner);
+
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [owner](const StreamBox* b) {
+        // Safely decrement owner refcount (GILAcquireGuard handles finalization check)
+        {
+            GILAcquireGuard gil;
+            if (gil.acquired()) {
+                Py_XDECREF(owner);
+            }
+        }
+        delete b;
+    });
+
+    return StreamHandle(box, &box->resource);
+}
+
+StreamHandle get_legacy_stream() noexcept {
+    // Return non-owning handle to the legacy default stream.
+    // Use function-local static for efficient repeated access.
+    static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY);
+    return handle;
+}
+
+StreamHandle get_per_thread_stream() noexcept {
+    // Return non-owning handle to the per-thread default stream.
+    // Use function-local static for efficient repeated access.
+    static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD);
+    return handle;
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index f6f7d6fa79..e32bc4d77c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -37,15 +37,26 @@ ContextHandle get_current_context() noexcept;
 // Stream handle functions
 // ============================================================================
 
-// Create an owning stream handle. When the last reference is released,
-// cuStreamDestroy is called automatically.
-StreamHandle create_stream_handle(CUstream stream);
+// Create an owning stream handle by calling cuStreamCreateWithPriority.
+// When the last reference is released, cuStreamDestroy is called automatically.
+// Returns empty handle on error (caller must check).
+StreamHandle create_stream_handle(unsigned int flags, int priority);
 
 // Create a non-owning stream handle (references existing stream).
 // Use for borrowed streams (from foreign code) or built-in streams.
 // The stream will NOT be destroyed when the handle is released.
 StreamHandle create_stream_handle_ref(CUstream stream);
 
+// Create a non-owning stream handle that prevents a Python owner from being GC'd.
+// The owner's refcount is incremented; decremented when handle is released.
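+// Typical use: adopting a borrowed stream from an object implementing the
+// __cuda_stream__ protocol, where the Python owner must be kept alive for
+// as long as the handle (and the CUstream it references) is in use.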
+StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); + +// Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +StreamHandle get_legacy_stream() noexcept; + +// Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +StreamHandle get_per_thread_stream() noexcept; + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 1cf7a31a8d..da152f4473 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -26,12 +26,22 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # ======================================================================== ctypedef shared_ptr[const cydriver.CUstream] StreamHandle - # Create an owning stream handle (stream destroyed when handle released) - StreamHandle create_stream_handle(cydriver.CUstream stream) nogil + # Create an owning stream handle via cuStreamCreateWithPriority + # Returns empty handle on error (caller must check) + StreamHandle create_stream_handle(unsigned int flags, int priority) nogil # Create a non-owning stream handle (stream NOT destroyed when handle released) StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil + # Create non-owning handle that prevents Python owner from being GC'd + StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) + + # Get non-owning handle to the legacy default stream + StreamHandle get_legacy_stream() nogil + + # Get non-owning handle to the per-thread default stream + StreamHandle get_per_thread_stream() nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index f386386e98..e727a29226 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -10,13 +10,14 @@ cdef class Stream: cdef: StreamHandle _h_stream - object _owner - bint _builtin int _nonblocking int _priority cydriver.CUdevice _device_id cydriver.CUcontext _ctx_handle + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream) + cpdef close(self) cdef int _get_context(self) except?-1 nogil cdef int _get_device_and_context(self) except?-1 diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index e8bd46f9b4..10742e2730 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -34,8 +34,10 @@ from cuda.core.experimental._resource_handles cimport ( ContextHandle, StreamHandle, create_stream_handle, - create_stream_handle_ref, + create_stream_handle_with_owner, get_current_context, + get_legacy_stream, + get_per_thread_stream, intptr, native, py, @@ -91,57 +93,54 @@ cdef class Stream: object, or created directly through using an existing handle using Stream.from_handle(). 
""" - def __cinit__(self): - # _h_stream is default-initialized to empty StreamHandle by C++ - self._owner = None - self._builtin = False - self._nonblocking = -1 # lazy init'd - self._priority = INT32_MIN # lazy init'd - self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd - def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. " "Please use Device APIs (create_stream) or other Stream APIs (from_handle)." ) + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream): + """Create a Stream from an existing StreamHandle (cdef-only factory).""" + cdef Stream s = cls.__new__(cls) + s._h_stream = h_stream + s._nonblocking = -1 # lazy init'd + s._priority = INT32_MIN # lazy init'd + s._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd + s._ctx_handle = CU_CONTEXT_INVALID # lazy init'd + return s + @classmethod def _legacy_default(cls): - cdef Stream self = Stream.__new__(cls) - # Built-in streams are non-owning references - self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_LEGACY)) - self._builtin = True - return self + """Return the legacy default stream (supports subclassing).""" + return Stream._from_handle(cls, get_legacy_stream()) @classmethod def _per_thread_default(cls): - cdef Stream self = Stream.__new__(cls) - # Built-in streams are non-owning references - self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_PER_THREAD)) - self._builtin = True - return self + """Return the per-thread default stream (supports subclassing).""" + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): - cdef Stream self = Stream.__new__(cls) + cdef StreamHandle h_stream cdef cydriver.CUstream borrowed + cdef Stream self if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - # Borrowed stream from foreign object - non-owning reference - # Hold a reference to the owner to keep the underlying stream alive + # Borrowed stream from foreign object + # C++ handle prevents owner from being GC'd until handle is released borrowed = _handle_from_stream_protocol(obj) - self._h_stream = create_stream_handle_ref(borrowed) - self._owner = obj - return self + h_stream = create_stream_handle_with_owner(borrowed, obj) + return Stream._from_handle(cls, h_stream) cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = opts.nonblocking priority = opts.priority - flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + cdef unsigned int flags = (cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking + else cydriver.CUstream_flags.CU_STREAM_DEFAULT) # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low with nogil: @@ -154,26 +153,25 @@ cdef class Stream: else: prio = high - cdef cydriver.CUstream s - with nogil: - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - # Owned stream - will be destroyed when handle is released - self._h_stream = create_stream_handle(s) + # C++ creates the stream and returns owning handle + h_stream = create_stream_handle(flags, prio) + if not h_stream: + raise RuntimeError("Failed to create CUDA stream") + self = Stream._from_handle(cls, h_stream) self._nonblocking = int(nonblocking) 
self._priority = prio - self._device_id = device_id if device_id is not None else self._device_id + if device_id is not None: + self._device_id = device_id return self cpdef close(self): """Destroy the stream. Releases the stream handle. For owned streams, this destroys the - underlying CUDA stream. For borrowed streams, this just releases - the reference. + underlying CUDA stream. For borrowed streams, this releases the + reference and allows the Python owner to be GC'd. """ - # Reset handle to empty - this decrements refcount and may trigger destruction self._h_stream.reset() - self._owner = None def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" From f90e625f3b69c45131fdbd33b87f5c7763b76a1d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 14:02:06 -0800 Subject: [PATCH 10/38] Refactor Stream to use ContextHandle and simplify initialization - Replace raw CUcontext _ctx_handle with ContextHandle _h_context for consistent handle paradigm and cleaner code - Replace CUdevice _device_id with int using -1 sentinel - Use intptr() helper instead of () casts throughout - Add _from_handle(type cls, ...) factory with subclass support - Add _legacy_default and _per_thread_default classmethods - Eliminate duplicated initialization code in _init --- cuda_core/cuda/core/experimental/_stream.pxd | 7 ++-- cuda_core/cuda/core/experimental/_stream.pyx | 35 +++++++++----------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index e727a29226..5b7603d23b 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -2,18 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport StreamHandle +from cuda.core.experimental._resource_handles cimport ContextHandle, StreamHandle cdef class Stream: cdef: StreamHandle _h_stream + ContextHandle _h_context + int _device_id int _nonblocking int _priority - cydriver.CUdevice _device_id - cydriver.CUcontext _ctx_handle @staticmethod cdef Stream _from_handle(type cls, StreamHandle h_stream) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 10742e2730..9114bcb65f 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -12,7 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, - CU_CONTEXT_INVALID, get_device_from_ctx, HANDLE_RETURN, ) @@ -104,10 +103,10 @@ cdef class Stream: """Create a Stream from an existing StreamHandle (cdef-only factory).""" cdef Stream s = cls.__new__(cls) s._h_stream = h_stream + # _h_context is default-initialized to empty ContextHandle by C++ + s._device_id = -1 # lazy init'd (invalid sentinel) s._nonblocking = -1 # lazy init'd s._priority = INT32_MIN # lazy init'd - s._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - s._ctx_handle = CU_CONTEXT_INVALID # lazy init'd return s @classmethod @@ -179,9 +178,9 @@ cdef class Stream: def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if self._ctx_handle == CU_CONTEXT_INVALID: + if not self._h_context: self._get_context() - return hash(((self._ctx_handle), intptr(self._h_stream))) + return hash((intptr(self._h_context), 
intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): @@ -191,12 +190,12 @@ cdef class Stream: if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if self._ctx_handle == CU_CONTEXT_INVALID: + if not self._h_context: self._get_context() - if _other._ctx_handle == CU_CONTEXT_INVALID: + if not _other._h_context: _other._get_context() # Compare contexts as well - return (self._ctx_handle) == ((_other)._ctx_handle) + return intptr(self._h_context) == intptr(_other._h_context) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -258,7 +257,7 @@ cdef class Stream: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: self._get_device_and_context() - event = Event._init((self._device_id), (self._ctx_handle), options, False) + event = Event._init(self._device_id, intptr(self._h_context), options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " @@ -320,27 +319,25 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import self._get_device_and_context() - return Device((self._device_id)) + return Device(self._device_id) cdef int _get_context(self) except?-1 nogil: - cdef ContextHandle h_context - if self._ctx_handle == CU_CONTEXT_INVALID: - h_context = get_stream_context(native(self._h_stream)) - self._ctx_handle = native(h_context) + if not self._h_context: + self._h_context = get_stream_context(native(self._h_stream)) return 0 cdef int _get_device_and_context(self) except?-1: cdef ContextHandle h_curr_context cdef cydriver.CUcontext curr_ctx - if self._device_id == cydriver.CU_DEVICE_INVALID: + if self._device_id < 0: # Get the current context with nogil: h_curr_context = get_current_context() - curr_ctx = native(h_curr_context) if h_curr_context.get() != NULL else 0 - # Get the stream's context (self._ctx_handle is populated) + curr_ctx = native(h_curr_context) if h_curr_context else 0 + # Get the stream's context (self._h_context is populated) self._get_context() # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) + self._device_id = get_device_from_ctx(native(self._h_context), curr_ctx) return 0 @property @@ -348,7 +345,7 @@ cdef class Stream: """Return the :obj:`~_context.Context` associated with this stream.""" self._get_context() self._get_device_and_context() - return Context._from_ctx((self._ctx_handle), (self._device_id)) + return Context._from_ctx(intptr(self._h_context), self._device_id) @staticmethod def from_handle(handle: int) -> Stream: From d7a999dcc739235866d002ccce0326e6b06acfcf Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 16:03:55 -0800 Subject: [PATCH 11/38] Extend ContextHandle to Event and standardize naming - Event now uses ContextHandle for _h_context instead of raw object - Event._init is now a cdef staticmethod accepting ContextHandle - Context._from_ctx renamed to Context._from_handle (cdef staticmethod) - Moved get_device_from_ctx to Stream module as Stream_ensure_ctx_device - Inlined get_stream_context into Stream_ensure_ctx - Simplified context push/pop logic in Stream_ensure_ctx_device Naming standardization: - Device._id -> Device._device_id - _dev_id -> _device_id throughout codebase - dev_id -> device_id for local variables - Updated tests to use public APIs instead of internal _init methods --- 
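Note: Stream_ensure_ctx_device boils down to the usual driver-API dance for
querying the device of a context that may not be current. A rough C++ sketch
of the logic (illustrative only; the real code is Cython, uses the handle
types, and checks every driver call):

    CUdevice device_of(CUcontext target, CUcontext current) {
        CUdevice dev;
        if (target != current) {
            cuCtxPushCurrent(target);    // temporarily make it current
            cuCtxGetDevice(&dev);
            cuCtxPopCurrent(NULL);       // restore the previous context
        } else {
            cuCtxGetDevice(&dev);
        }
        return dev;
    }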
cuda_core/cuda/core/experimental/_context.pxd | 7 +- cuda_core/cuda/core/experimental/_context.pyx | 47 ++------ .../experimental/_cpp/resource_handles.cpp | 18 ++-- .../experimental/_cpp/resource_handles.hpp | 2 +- cuda_core/cuda/core/experimental/_device.pyx | 58 +++++----- cuda_core/cuda/core/experimental/_event.pxd | 6 +- cuda_core/cuda/core/experimental/_event.pyx | 22 ++-- .../_memory/_device_memory_resource.pxd | 2 +- .../_memory/_device_memory_resource.pyx | 42 ++++---- .../_memory/_graph_memory_resource.pxd | 2 +- .../_memory/_graph_memory_resource.pyx | 16 +-- .../cuda/core/experimental/_memory/_ipc.pyx | 2 +- .../cuda/core/experimental/_memory/_legacy.py | 6 +- .../core/experimental/_resource_handles.pxd | 2 +- cuda_core/cuda/core/experimental/_stream.pxd | 2 - cuda_core/cuda/core/experimental/_stream.pyx | 77 +++++++------- .../core/experimental/_utils/cuda_utils.pxd | 4 - .../core/experimental/_utils/cuda_utils.pyx | 19 ---- cuda_core/tests/test_comparable.py | 47 +++----- cuda_core/tests/test_event.py | 10 +- cuda_core/tests/test_hashable.py | 100 +++++++----------- 21 files changed, 197 insertions(+), 294 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 01552c055e..062e865172 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Context: @@ -16,7 +15,5 @@ cdef class Context: ContextHandle _h_context int _device_id -# Cython-level context operations (handle-centric API) -# Note: get_primary_context and get_current_context are now pure C++ (imported from _resource_handles) -cdef void set_current_context(ContextHandle h_context) except * nogil -cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id) diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 94c8379875..0504778207 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,16 +4,14 @@ from dataclasses import dataclass -from libc.stdint cimport uintptr_t - from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( + ContextHandle, create_context_handle_ref, intptr, native, py, ) -from cuda.core.experimental._utils.cuda_utils import driver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -30,12 +28,11 @@ cdef class Context: def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") - @classmethod - def _from_ctx(cls, handle: driver.CUcontext, int device_id): - cdef Context ctx = Context.__new__(Context) - # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle - cdef cydriver.CUcontext c_ctx = int(handle) - ctx._h_context = create_context_handle_ref(c_ctx) + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id): + """Create Context from existing ContextHandle (cdef-only factory).""" + cdef Context ctx = cls.__new__(cls) + ctx._h_context = h_context ctx._device_id = device_id return ctx @@ -63,35 +60,3 @@ class ContextOptions: Currently unused, reserved for future use. 
""" pass # TODO - - -cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil: - """Get handle to the context associated with a stream. - - Parameters - ---------- - stream : CUstream - Stream handle - - Returns - ------- - ContextHandle - Handle to context associated with the stream - """ - cdef cydriver.CUcontext ctx = NULL - HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx)) - return create_context_handle_ref(ctx) - - -cdef void set_current_context(ContextHandle h_context) except * nogil: - """Set the current CUDA context from a handle. - - Parameters - ---------- - h_context : ContextHandle - Context handle to set as current - """ - if h_context.get() == NULL: - with gil: - raise ValueError("Cannot set NULL context as current") - HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a8b0fa60ef..076ff10810 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -98,10 +98,10 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { // Each thread maintains its own cache of primary contexts indexed by device ID thread_local std::vector primary_context_cache; -ContextHandle get_primary_context(int dev_id) noexcept { +ContextHandle get_primary_context(int device_id) noexcept { // Check thread-local cache - if (static_cast(dev_id) < primary_context_cache.size()) { - auto cached = primary_context_cache[dev_id]; + if (static_cast(device_id) < primary_context_cache.size()) { + auto cached = primary_context_cache[device_id]; if (cached.get() != nullptr) { return cached; // Cache hit } @@ -112,7 +112,7 @@ ContextHandle get_primary_context(int dev_id) noexcept { CUresult err; { GILReleaseGuard gil; - err = cuDevicePrimaryCtxRetain(&ctx, dev_id); + err = cuDevicePrimaryCtxRetain(&ctx, device_id); } if (err != CUDA_SUCCESS) { // Return empty handle on error (caller must check) @@ -120,9 +120,9 @@ ContextHandle get_primary_context(int dev_id) noexcept { } // Create owning handle with custom deleter that releases the primary context - auto box = std::shared_ptr(new ContextBox{ctx}, [dev_id](const ContextBox* b) { + auto box = std::shared_ptr(new ContextBox{ctx}, [device_id](const ContextBox* b) { GILReleaseGuard gil; - cuDevicePrimaryCtxRelease(dev_id); + cuDevicePrimaryCtxRelease(device_id); delete b; }); @@ -130,10 +130,10 @@ ContextHandle get_primary_context(int dev_id) noexcept { auto h_context = ContextHandle(box, &box->resource); // Resize cache if needed - if (static_cast(dev_id) >= primary_context_cache.size()) { - primary_context_cache.resize(dev_id + 1); + if (static_cast(device_id) >= primary_context_cache.size()) { + primary_context_cache.resize(device_id + 1); } - primary_context_cache[dev_id] = h_context; + primary_context_cache[device_id] = h_context; return h_context; } diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index e32bc4d77c..945ac0b2a8 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -27,7 +27,7 @@ ContextHandle create_context_handle_ref(CUcontext ctx); // Get handle to the primary context for a device (with thread-local caching) // Returns empty handle on error (caller must check) -ContextHandle get_primary_context(int dev_id) noexcept; +ContextHandle 
get_primary_context(int device_id) noexcept; // Get handle to the current CUDA context // Returns empty handle if no context is current (caller must check) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 8d62b1de51..f2f2f72a72 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -11,17 +11,15 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core.experimental._context cimport ( - Context, - set_current_context, -) +from cuda.core.experimental._context cimport Context from cuda.core.experimental._context import ContextOptions +from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, + create_context_handle_ref, get_primary_context, get_current_context, - intptr, native, ) from cuda.core.experimental._graph import GraphBuilder @@ -945,7 +943,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -987,9 +985,9 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] - for dev_id in range(total): + for i in range(total): device = super().__new__(cls) - device._id = dev_id + device._device_id = i device._memory_resource = None device._has_inited = False device._properties = None @@ -1004,19 +1002,19 @@ class Device: def _check_context_initialized(self): if not self._has_inited: raise CUDAError( - f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" + f"Device {self._device_id} is not yet initialized, perhaps you forgot to call .set_current() first?" 
) @property def device_id(self) -> int: """Return device ordinal.""" - return self._id + return self._device_id @property def pci_bus_id(self) -> str: """Return a PCI Bus Id string for this device.""" - bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._id)) + bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._device_id)) return bus_id[:12].decode() def can_access_peer(self, peer: Device | int) -> bool: @@ -1062,7 +1060,7 @@ class Device: cdef str uuid_hex if self._uuid is None: - dev = self._id + dev = self._device_id with nogil: IF CUDA_CORE_BUILD_MAJOR == "12": HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, dev)) @@ -1081,7 +1079,7 @@ class Device: cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) cdef char* name_ptr = name - cdef cydriver.CUdevice this_dev = self._id + cdef cydriver.CUdevice this_dev = self._device_id with nogil: HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] @@ -1091,7 +1089,7 @@ class Device: def properties(self) -> DeviceProperties: """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" if self._properties is None: - self._properties = DeviceProperties._init(self._id) + self._properties = DeviceProperties._init(self._device_id) return self._properties @@ -1127,9 +1125,9 @@ class Device: raise CUDAError("No context is bound to the calling CPU thread.") with nogil: HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != self._id: + if dev != self._device_id: raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return Context._from_ctx(intptr(h_context), self._id) + return Context._from_handle(Context, h_context, self._device_id) @property def memory_resource(self) -> MemoryResource: @@ -1138,7 +1136,7 @@ class Device: if self._memory_resource is None: # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
-            device_id = self._id
+            device_id = self._device_id
             with nogil:
                 HANDLE_RETURN(
                     cydriver.cuDeviceGetAttribute(
                         &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device_id
                     )
                 )
             if attr == 1:
                 from cuda.core.experimental._memory import DeviceMemoryResource
-                self._memory_resource = DeviceMemoryResource(self._id)
+                self._memory_resource = DeviceMemoryResource(self._device_id)
             else:
                 from cuda.core.experimental._memory import _SynchronousMemoryResource
-                self._memory_resource = _SynchronousMemoryResource(self._id)
+                self._memory_resource = _SynchronousMemoryResource(self._device_id)
         return self._memory_resource
 
@@ -1175,10 +1173,10 @@ class Device:
 
     def __int__(self):
         """Return device_id."""
-        return self._id
+        return self._device_id
 
     def __repr__(self):
-        return f"<Device {self._id} ({self.name})>"
+        return f"<Device {self._device_id} ({self.name})>"
 
     def __hash__(self) -> int:
         return hash(self.uuid)
@@ -1186,7 +1184,7 @@ class Device:
     def __eq__(self, other) -> bool:
         if not isinstance(other, Device):
             return NotImplemented
-        return self._id == other._id
+        return self._device_id == other._device_id
 
     def __reduce__(self):
         return Device, (self.device_id,)
@@ -1227,10 +1225,10 @@ class Device:
         if ctx is not None:
             # TODO: revisit once Context is cythonized
             assert_type(ctx, Context)
-            if ctx._device_id != self._id:
+            if ctx._device_id != self._device_id:
                 raise RuntimeError(
                     "the provided context was created on the device with"
-                    f" id={ctx._device_id}, which is different from the target id={self._id}"
+                    f" id={ctx._device_id}, which is different from the target id={self._device_id}"
                 )
             # prev_ctx is the previous context
             curr_ctx = native(ctx._h_context)
@@ -1240,12 +1238,14 @@ class Device:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
             self._has_inited = True
             if prev_ctx != NULL:
-                return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
+                return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id)
         else:
             # use primary ctx
-            h_context = get_primary_context(self._id)
+            h_context = get_primary_context(self._device_id)
+            if h_context.get() == NULL:
+                raise ValueError("Cannot set NULL context as current")
             with nogil:
-                set_current_context(h_context)
+                HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context)))
             self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
         """Create a new :obj:`~_context.Context` object.
@@ -1297,7 +1297,7 @@ class Device:
         """
         self._check_context_initialized()
-        return Stream._init(obj=obj, options=options, device_id=self._id)
+        return Stream._init(obj=obj, options=options, device_id=self._device_id)
 
     def create_event(self, options: EventOptions | None = None) -> Event:
         """Create an Event object without recording it to a Stream.
@@ -1322,7 +1322,7 @@ class Device:
         """
         self._check_context_initialized()
         cdef ContextHandle h_context
         h_context = get_current_context()
         if h_context.get() == NULL:
             raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, intptr(h_context), options, True)
+        return cyEvent._init(cyEvent, self._device_id, h_context, options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream.
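
Note on the primary-context path used by set_current() above: the stand-alone
sketch below illustrates the thread-local caching pattern that
get_primary_context() implements in resource_handles.cpp. All names here
(Ctx, retain_primary, release_primary) are illustrative stand-ins for the
driver types and calls, not the real API:

    // Minimal sketch, with stand-in types, of the thread-local
    // primary-context cache pattern.
    #include <cstdio>
    #include <memory>
    #include <vector>

    struct Ctx { int device_id; };                             // stands in for CUcontext
    static Ctx* retain_primary(int id) { return new Ctx{id}; } // ~cuDevicePrimaryCtxRetain
    static void release_primary(Ctx* c) { std::printf("release %d\n", c->device_id); delete c; }

    using CtxHandle = std::shared_ptr<const Ctx>;

    CtxHandle primary_context(int device_id) {
        thread_local std::vector<CtxHandle> cache;             // one slot per device, per thread
        if (static_cast<size_t>(device_id) < cache.size() && cache[device_id])
            return cache[device_id];                           // cache hit: no driver call
        CtxHandle h(retain_primary(device_id), release_primary);
        if (static_cast<size_t>(device_id) >= cache.size())
            cache.resize(device_id + 1);
        cache[device_id] = h;
        return h;
    }

    int main() {
        auto a = primary_context(0);
        auto b = primary_context(0);                           // same handle, no second retain
        std::printf("shared: %d\n", a.get() == b.get());       // prints "shared: 1"
    }

As in the real code, the cached handle keeps the primary context retained
until the owning thread goes away.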
diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd index 1f586f18df..d92c9627c3 100644 --- a/cuda_core/cuda/core/experimental/_event.pxd +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Event: @@ -14,6 +15,9 @@ cdef class Event: bint _ipc_enabled object _ipc_descriptor int _device_id - object _ctx_handle + ContextHandle _h_context + + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) cpdef close(self) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 149c92b8e1..2ac284d8c9 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -8,6 +8,8 @@ cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._context cimport Context +from cuda.core.experimental._resource_handles cimport ContextHandle, intptr from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN @@ -17,8 +19,6 @@ import cython from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional - -from cuda.core.experimental._context import Context from cuda.core.experimental._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, @@ -87,9 +87,9 @@ cdef class Event: def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).") - @classmethod - def _init(cls, device_id: int, ctx_handle: Context, options=None, is_free=False): - cdef Event self = Event.__new__(cls) + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free): + cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 self._timing_disabled = False @@ -114,7 +114,7 @@ cdef class Event: with nogil: HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id - self._ctx_handle = ctx_handle + self._h_context = h_context if opts.ipc_enabled: self.get_ipc_descriptor() return self @@ -165,7 +165,7 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((self._ctx_handle, (self._handle))) + return hash((type(self), intptr(self._h_context), (self._handle))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. @@ -199,8 +199,8 @@ cdef class Event: self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 # ?? - self._ctx_handle = None # ?? 
+ self._device_id = -1 + self._h_context = ContextHandle() return self @property @@ -271,8 +271,8 @@ cdef class Event: @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._ctx_handle is not None and self._device_id >= 0: - return Context._from_ctx(self._ctx_handle, self._device_id) + if self._h_context and self._device_id >= 0: + return Context._from_handle(Context, self._h_context, self._device_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd index 823a270b27..d31ff7b2e1 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd @@ -9,7 +9,7 @@ from cuda.core.experimental._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id cydriver.CUmemoryPool _handle bint _mempool_owned IPCDataForMR _ipc_data diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 3bfdb59c07..d06f0b8297 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -219,7 +219,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ def __cinit__(self): - self._dev_id = cydriver.CU_DEVICE_INVALID + self._device_id = cydriver.CU_DEVICE_INVALID self._handle = NULL self._mempool_owned = False self._ipc_data = None @@ -228,16 +228,16 @@ cdef class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: Device | int, options=None): from .._device import Device - cdef int dev_id = Device(device_id).device_id + cdef int c_device_id = Device(device_id).device_id opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) if opts is None: - DMR_init_current(self, dev_id) + DMR_init_current(self, c_device_id) else: - DMR_init_create(self, dev_id, opts) + DMR_init_create(self, c_device_id, opts) def __dealloc__(self): DMR_close(self) @@ -366,7 +366,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def handle(self) -> driver.CUmemoryPool: @@ -438,11 +438,11 @@ cdef class DeviceMemoryResource(MemoryResource): # Convert all devices to device IDs cdef set[int] target_ids = {Device(dev).device_id for dev in devices} - target_ids.discard(self._dev_id) # exclude this device from peer access list - this_dev = Device(self._dev_id) + target_ids.discard(self._device_id) # exclude this device from peer access list + this_dev = Device(self._device_id) cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)] if bad: - raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}") + raise ValueError(f"Device {self._device_id} cannot access peer(s): {', '.join(map(str, bad))}") cdef set[int] cur_ids = set(self._peer_accessible_by) cdef set[int] to_add = target_ids - cur_ids cdef set[int] to_rm = cur_ids - target_ids @@ -456,16 +456,16 @@ cdef class DeviceMemoryResource(MemoryResource): raise MemoryError("Failed to allocate memory for access descriptors") try: - for dev_id in to_add: + for device_id in to_add: 
access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id + access_desc[i].location.id = device_id i += 1 - for dev_id in to_rm: + for device_id in to_rm: access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id + access_desc[i].location.id = device_id i += 1 with nogil: @@ -480,16 +480,16 @@ cdef class DeviceMemoryResource(MemoryResource): # DeviceMemoryResource Implementation # ----------------------------------- -cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): +cdef void DMR_init_current(DeviceMemoryResource self, int device_id): # Get the current memory pool. cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX - self._dev_id = dev_id + self._device_id = device_id self._mempool_owned = False with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), device_id)) # Set a higher release threshold to improve performance when there are # no active allocations. By default, the release threshold is 0, which @@ -513,7 +513,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): cdef void DMR_init_create( - DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts + DeviceMemoryResource self, int device_id, DeviceMemoryResourceOptions opts ): # Create a new memory pool. cdef cydriver.CUmemPoolProps properties @@ -524,13 +524,13 @@ cdef void DMR_init_create( memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED properties.handleTypes = _ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - properties.location.id = dev_id + properties.location.id = device_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size properties.win32SecurityAttributes = NULL properties.usage = 0 - self._dev_id = dev_id + self._device_id = device_id self._mempool_owned = True with nogil: @@ -593,7 +593,7 @@ cdef inline DMR_close(DeviceMemoryResource self): with nogil: HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) finally: - self._dev_id = cydriver.CU_DEVICE_INVALID + self._device_id = cydriver.CU_DEVICE_INVALID self._handle = NULL self._attributes = None self._mempool_owned = False @@ -618,12 +618,12 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): """ from .._device import Device - cdef int dev_id = Device(device_id).device_id + cdef int c_device_id = Device(device_id).device_id cdef cydriver.CUmemAccess_flags flags cdef cydriver.CUmemLocation location location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - location.id = dev_id + location.id = c_device_id with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd index f9c7798e76..00af6e407b 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -7,4 +7,4 @@ from 
cuda.core.experimental._memory._buffer cimport MemoryResource cdef class cyGraphMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 9a83c9007c..5ad9d86c53 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -23,7 +23,7 @@ __all__ = ['GraphMemoryResource'] cdef class GraphMemoryResourceAttributes: cdef: - int _dev_id + int _device_id def __init__(self, *args, **kwargs): raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @@ -31,7 +31,7 @@ cdef class GraphMemoryResourceAttributes: @classmethod def _init(cls, device_id: int): cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) - self._dev_id = device_id + self._device_id = device_id return self def __repr__(self): @@ -42,12 +42,12 @@ cdef class GraphMemoryResourceAttributes: cdef int _getattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 cdef int _setattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 @property @@ -101,7 +101,7 @@ cdef class GraphMemoryResourceAttributes: cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): - self._dev_id = device_id + self._device_id = device_id def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """ @@ -124,17 +124,17 @@ cdef class cyGraphMemoryResource(MemoryResource): def trim(self): """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" with nogil: - HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._device_id)) @property def attributes(self) -> GraphMemoryResourceAttributes: """Asynchronous allocation attributes related to graphs.""" - return GraphMemoryResourceAttributes._init(self._dev_id) + return GraphMemoryResourceAttributes._init(self._device_id) @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def is_device_accessible(self) -> bool: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index c9931855cf..7c5a9b0409 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -212,7 +212,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl # Construct a new DMR. 
cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id self._mempool_owned = True self._ipc_data = IPCDataForMR(alloc_handle, True) diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py index 09ea0e15d2..bff7638734 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -84,12 +84,12 @@ def device_id(self) -> int: class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) + __slots__ = ("_device_id",) def __init__(self, device_id): from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -116,4 +116,4 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - return self._dev_id + return self._device_id diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index da152f4473..0423ef0ec8 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -18,7 +18,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil # Context acquisition functions (pure C++, nogil-safe with thread-local caching) - ContextHandle get_primary_context(int dev_id) nogil + ContextHandle get_primary_context(int device_id) nogil ContextHandle get_current_context() nogil # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 5b7603d23b..0877d37be1 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -18,8 +18,6 @@ cdef class Stream: cdef Stream _from_handle(type cls, StreamHandle h_stream) cpdef close(self) - cdef int _get_context(self) except?-1 nogil - cdef int _get_device_and_context(self) except?-1 cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 9114bcb65f..3ba38095e4 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -12,7 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, - get_device_from_ctx, HANDLE_RETURN, ) @@ -24,14 +23,12 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings from cuda.core.experimental._device import Device -from cuda.core.experimental._context cimport ( - Context, - get_stream_context, -) +from cuda.core.experimental._context cimport Context from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, StreamHandle, + create_context_handle_ref, create_stream_handle, create_stream_handle_with_owner, get_current_context, @@ -178,8 +175,7 @@ cdef class Stream: def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if not self._h_context: - self._get_context() + Stream_ensure_ctx(self) return hash((intptr(self._h_context), 
intptr(self._h_stream))) def __eq__(self, other) -> bool: @@ -190,10 +186,8 @@ cdef class Stream: if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if not self._h_context: - self._get_context() - if not _other._h_context: - _other._get_context() + Stream_ensure_ctx(self) + Stream_ensure_ctx(_other) # Compare contexts as well return intptr(self._h_context) == intptr(_other._h_context) @@ -256,8 +250,8 @@ cdef class Stream: # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: - self._get_device_and_context() - event = Event._init(self._device_id, intptr(self._h_context), options, False) + Stream_ensure_ctx_device(self) + event = cyEvent._init(cyEvent, self._device_id, self._h_context, options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " @@ -318,34 +312,15 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import - self._get_device_and_context() + Stream_ensure_ctx_device(self) return Device(self._device_id) - cdef int _get_context(self) except?-1 nogil: - if not self._h_context: - self._h_context = get_stream_context(native(self._h_stream)) - return 0 - - cdef int _get_device_and_context(self) except?-1: - cdef ContextHandle h_curr_context - cdef cydriver.CUcontext curr_ctx - if self._device_id < 0: - # Get the current context - with nogil: - h_curr_context = get_current_context() - curr_ctx = native(h_curr_context) if h_curr_context else 0 - # Get the stream's context (self._h_context is populated) - self._get_context() - # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(native(self._h_context), curr_ctx) - return 0 - @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this stream.""" - self._get_context() - self._get_device_and_context() - return Context._from_ctx(intptr(self._h_context), self._device_id) + Stream_ensure_ctx(self) + Stream_ensure_ctx_device(self) + return Context._from_handle(Context, self._h_context, self._device_id) @staticmethod def from_handle(handle: int) -> Stream: @@ -425,6 +400,36 @@ cpdef Stream default_stream(): return C_LEGACY_DEFAULT_STREAM +cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: + """Ensure the stream's context handle is populated.""" + cdef cydriver.CUcontext ctx + if not self._h_context: + HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) + self._h_context = create_context_handle_ref(ctx) + return 0 + + +cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: + """Ensure the stream's context and device_id are populated.""" + cdef ContextHandle h_curr_context + cdef cydriver.CUcontext target_ctx, curr_ctx, ctx + cdef cydriver.CUdevice target_dev + cdef bint switch_context + + if self._device_id < 0: + with nogil: + # Get device ID from context, switching context temporarily if needed + Stream_ensure_ctx(self) + switch_context = (get_current_context() != self._h_context) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPushCurrent(native(self._h_context))) + HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + self._device_id = target_dev + return 0 + + cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: if isinstance(obj, Stream): return (obj.handle) 
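
The Stream_ensure_ctx_device helper above performs the classic
"context-switching dance". For reference, the same pattern written directly
against the driver API (a sketch with simplified error handling, not the
project's HANDLE_RETURN-based code):

    // Sketch: find the device behind an arbitrary CUcontext by making it
    // current just long enough to call cuCtxGetDevice.
    #include <cuda.h>

    static CUresult device_of_context(CUcontext target, CUdevice* out) {
        CUcontext current = nullptr;
        CUresult err = cuCtxGetCurrent(&current);
        if (err != CUDA_SUCCESS) return err;
        bool need_switch = (current != target);
        if (need_switch) {
            err = cuCtxPushCurrent(target);            // temporarily bind target
            if (err != CUDA_SUCCESS) return err;
        }
        err = cuCtxGetDevice(out);                     // reads the *current* context
        if (need_switch) {
            CUcontext popped = nullptr;
            CUresult perr = cuCtxPopCurrent(&popped);  // restore previous binding
            if (err == CUDA_SUCCESS) err = perr;
        }
        return err;
    }

Unlike the removed get_device_from_ctx, the push/pop happens only when the
target context is not already current.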
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index ce30285aa5..9b5044beda 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -22,10 +22,6 @@ ctypedef fused integer_t: cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil - - cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 4489871747..22b6fb6c4b 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -197,25 +197,6 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: - """Get device ID from the given ctx.""" - cdef bint switch_context = (curr_ctx != target_ctx) - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice target_dev - with nogil: - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert curr_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) - HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert target_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) - return target_dev - - def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index 72b3caa2ba..2c05932dcc 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,8 +9,7 @@ """ from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._event import Event from cuda.core.experimental._stream import StreamOptions # ============================================================================ @@ -105,50 +104,34 @@ def test_event_subclass_equality(init_cuda): Event uses isinstance() for equality checking, similar to Stream. 
""" - - class MyEvent(Event): - pass - device = Device(0) device.set_current() - # Create two different events - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() + event3 = device.create_event() # Different events should not be equal (different handles) - assert event != my_event, "Different Event instances are not equal" + assert event1 != event2, "Different Event instances are not equal" + assert event2 != event3, "Different Event instances are not equal" - # Same subclass type with different handles - my_event2 = MyEvent._init(device.device_id, device.context, options=EventOptions()) - assert my_event != my_event2, "Different MyEvent instances are not equal" - - -def test_context_subclass_equality(init_cuda): - """Test Context subclass equality behavior.""" - - class MyContext(Context): - pass +def test_context_equality(init_cuda): + """Test Context equality behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - - # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context.handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context, not subclass" - assert type(my_context) is not MyContext - - # Since both are Context instances with same handle, they're equal - assert context == my_context, "Context instances with same handle are equal" - # Create another context from different stream + # Get context from different sources + stream1 = device.create_stream() stream2 = device.create_stream() + context1 = stream1.context context2 = stream2.context + device_context = device.context # Same device, same primary context, should be equal - assert context == context2, "Contexts from same device are equal" + assert context1 == context2, "Contexts from same device are equal" + assert context1 == device_context, "Stream context equals device context" def test_subclass_type_safety(init_cuda): diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..f5bf19f8e3 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -148,14 +148,12 @@ def test_event_context(init_cuda): assert context is not None -def test_event_subclassing(): - class MyEvent(Event): - pass - +def test_event_creation(): + """Test Event creation via public API.""" dev = Device() dev.set_current() - event = MyEvent._init(dev.device_id, dev.context) - assert isinstance(event, MyEvent) + event = dev.create_event() + assert isinstance(event, Event) # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 751a88250c..1ecf8cdedd 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -13,8 +13,7 @@ """ from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._event import Event from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ @@ -128,65 +127,51 @@ class MyStream(Stream): assert hash(my_stream) != hash(my_stream2), "Different streams have different hashes" -def 
test_event_subclass_hash(init_cuda): - """Test Event subclass hash behavior.""" - - class MyEvent(Event): - pass - +def test_event_hash(init_cuda): + """Test Event hash behavior.""" device = Device(0) device.set_current() - # Create events with different handles - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() # Different events (different handles) -> different hashes - assert hash(event) != hash(my_event), "Different events have different hashes" - assert event != my_event, "Different handles means not equal" + assert hash(event1) != hash(event2), "Different events have different hashes" + assert event1 != event2, "Different handles means not equal" # Verify hash consistency - hash1 = hash(event) - hash2 = hash(event) + hash1 = hash(event1) + hash2 = hash(event1) assert hash1 == hash2, "Hash is consistent across multiple calls" # Both should be usable as dict keys - cache = {event: "base", my_event: "subclass"} + cache = {event1: "first", event2: "second"} assert len(cache) == 2, "Different events are distinct dict keys" - assert cache[event] == "base" - assert cache[my_event] == "subclass" - - -def test_context_subclass_hash(init_cuda): - """Test Context subclass hash behavior. + assert cache[event1] == "first" + assert cache[event2] == "second" - Context._from_ctx() always returns Context instances, even when called - as MyContext._from_ctx(). This means we can't create actual MyContext - instances in practice. - """ - - class MyContext(Context): - pass +def test_context_hash(init_cuda): + """Test Context hash behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context.handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context type" + # Get context from different sources + stream1 = device.create_stream() + stream2 = device.create_stream() + context1 = stream1.context + context2 = stream2.context - # Same handle -> same hash - assert hash(context) == hash(my_context), "Contexts with same handle have same hash" + # Same underlying context -> same hash + assert hash(context1) == hash(context2), "Contexts with same handle have same hash" # Verify equality matches hash - assert context == my_context, "Contexts with same handle are equal" - assert hash(context) == hash(my_context), "Equal contexts have equal hashes" + assert context1 == context2, "Contexts with same handle are equal" # Verify hash consistency - hash1 = hash(context) - hash2 = hash(context) + hash1 = hash(context1) + hash2 = hash(context1) assert hash1 == hash2, "Hash is consistent across multiple calls" @@ -200,33 +185,24 @@ def test_hash_equality_contract_maintained(init_cuda): allowing cross-type equality with consistent hashing. 
""" - class MyStream(Stream): - pass - - class MyEvent(Event): - pass - - class MyContext(Context): - pass - device = Device(0) device.set_current() - # Test Stream: base and subclass with same handle - my_stream = MyStream._init(options=StreamOptions(), device_id=device.device_id) - stream = Stream.from_handle(int(my_stream.handle)) + # Test Stream: two references to same handle + stream1 = device.create_stream() + stream2 = Stream.from_handle(int(stream1.handle)) - assert my_stream == stream, "Equal due to isinstance() check and same handle" - assert hash(my_stream) == hash(stream), "Equal objects have equal hashes" + assert stream1 == stream2, "Equal due to same handle" + assert hash(stream1) == hash(stream2), "Equal objects have equal hashes" - # Test Context: always returns base type from _from_ctx - ctx = device.context - my_ctx = MyContext._from_ctx(ctx.handle, device.device_id) + # Test Context: contexts from same device share same underlying context + ctx1 = device.context + ctx2 = device.create_stream().context - assert ctx == my_ctx, "Equal contexts with same handle" - assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" + assert ctx1 == ctx2, "Equal contexts with same handle" + assert hash(ctx1) == hash(ctx2), "Equal objects have equal hashes" # Test that different handles still produce different hashes - my_stream2 = MyStream._init(options=StreamOptions(), device_id=device.device_id) - assert my_stream != my_stream2, "Different handles means not equal" - assert hash(my_stream) != hash(my_stream2), "Different objects have different hashes" + stream3 = device.create_stream() + assert stream1 != stream3, "Different handles means not equal" + assert hash(stream1) != hash(stream3), "Different objects have different hashes" From 1e1398471d793a0921b704380d9ba6d8189fdec0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 10 Dec 2025 21:06:44 -0800 Subject: [PATCH 12/38] Store owning context handle in Device Device now stores its Context in _context slot, set during set_current(). This ensures Device holds an owning reference to its context, enabling proper lifetime management when passed to Stream and Event creation. Changes: - Add _context to Device.__slots__ - Store Context in set_current() for both primary and explicit context paths - Simplify context property to return stored _context - Update create_event() to use self._context._h_context - Remove get_current_context import (no longer needed in _device.pyx) Add structural context dependency to owned streams StreamBox now holds ContextHandle to ensure context outlives the stream. This structural dependency is only for owned streams - borrowed streams delegate context lifetime management to their owners. 
C++ changes:
- StreamBox gains h_context member
- create_stream_handle(h_ctx, flags, priority) takes owning context
- create_stream_handle_ref(stream) - caller manages context
- create_stream_handle_with_owner(stream, owner) - Python owner manages context

Cython/Python changes:
- Stream._init() accepts optional ctx parameter
- Device.create_stream() passes self._context to Stream._init()
- Owned streams get context handle embedded in C++ handle
---
 .../experimental/_cpp/resource_handles.cpp   | 11 ++++--
 .../experimental/_cpp/resource_handles.hpp   |  7 +++-
 cuda_core/cuda/core/experimental/_device.pyx | 35 +++++++------------
 .../core/experimental/_resource_handles.pxd  |  9 +++--
 cuda_core/cuda/core/experimental/_stream.pyx | 13 +++++--
 5 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 076ff10810..f39fc10816 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -161,8 +161,9 @@ struct StreamBox {
   CUstream resource;
 };
 
-StreamHandle create_stream_handle(unsigned int flags, int priority) {
+StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
   // Creates an owning stream handle - calls cuStreamCreateWithPriority internally.
+  // The context handle is captured in the deleter to ensure context outlives the stream.
   // Returns empty handle on error (caller must check).
   CUstream stream;
   CUresult err;
@@ -174,10 +175,12 @@
     return StreamHandle();
   }
 
-  auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
+  // Capture h_ctx in lambda - shared_ptr control block keeps it alive
+  auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [h_ctx](const StreamBox* b) {
     GILReleaseGuard gil;
     cuStreamDestroy(b->resource);
     delete b;
+    // h_ctx destructor runs here when last stream reference is released
   });
 
   // Use aliasing constructor to expose only CUstream
@@ -185,7 +188,8 @@
 }
 
 StreamHandle create_stream_handle_ref(CUstream stream) {
-  // Creates a non-owning handle - stream will NOT be destroyed
+  // Creates a non-owning handle - stream will NOT be destroyed.
+  // Caller is responsible for keeping the stream's context alive.
   auto box = std::shared_ptr<StreamBox>(new StreamBox{stream});
 
   // Use aliasing constructor to expose only CUstream
@@ -195,6 +199,7 @@
 
 StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
   // Creates a non-owning handle that prevents a Python owner from being GC'd.
   // The owner's refcount is incremented here and decremented when handle is released.
+  // The owner is responsible for keeping the stream's context alive.
Py_XINCREF(owner); auto box = std::shared_ptr(new StreamBox{stream}, [owner](const StreamBox* b) { diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 945ac0b2a8..06b04ba974 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -38,23 +38,28 @@ ContextHandle get_current_context() noexcept; // ============================================================================ // Create an owning stream handle by calling cuStreamCreateWithPriority. +// The stream structurally depends on the provided context handle. // When the last reference is released, cuStreamDestroy is called automatically. // Returns empty handle on error (caller must check). -StreamHandle create_stream_handle(unsigned int flags, int priority); +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority); // Create a non-owning stream handle (references existing stream). // Use for borrowed streams (from foreign code) or built-in streams. // The stream will NOT be destroyed when the handle is released. +// Caller is responsible for keeping the stream's context alive. StreamHandle create_stream_handle_ref(CUstream stream); // Create a non-owning stream handle that prevents a Python owner from being GC'd. // The owner's refcount is incremented; decremented when handle is released. +// The owner is responsible for keeping the stream's context alive. StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); // Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +// Note: Legacy stream has no specific context dependency. StreamHandle get_legacy_stream() noexcept; // Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +// Note: Per-thread stream has no specific context dependency. StreamHandle get_per_thread_stream() noexcept; // ============================================================================ diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index f2f2f72a72..7cf8e8dbbd 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -19,7 +19,6 @@ from cuda.core.experimental._resource_handles cimport ( ContextHandle, create_context_handle_ref, get_primary_context, - get_current_context, native, ) from cuda.core.experimental._graph import GraphBuilder @@ -943,7 +942,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid", "_context") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -962,16 +961,15 @@ class Device: # important: creating a Device instance does not initialize the GPU! 
cdef cydriver.CUdevice dev cdef cydriver.CUcontext ctx - cdef ContextHandle h_context if device_id is None: with nogil: err = cydriver.cuCtxGetDevice(&dev) if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - h_context = get_current_context() - assert h_context.get() == NULL - device_id = 0 # cudart behavior + # No context is current - verify and default to device 0 (cudart behavior) + assert cydriver.cuCtxGetCurrent(&ctx) == cydriver.CUresult.CUDA_SUCCESS and ctx == NULL + device_id = 0 else: HANDLE_RETURN(err) elif device_id < 0: @@ -992,6 +990,7 @@ class Device: device._has_inited = False device._properties = None device._uuid = None + device._context = None devices.append(device) try: @@ -1110,7 +1109,7 @@ class Device: @property def context(self) -> Context: - """Return the current :obj:`~_context.Context` associated with this device. + """Return the :obj:`~_context.Context` associated with this device. Note ---- @@ -1118,16 +1117,7 @@ class Device: """ self._check_context_initialized() - cdef ContextHandle h_context - cdef cydriver.CUdevice dev - h_context = get_current_context() - if h_context.get() == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != self._device_id: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return Context._from_handle(Context, h_context, self._device_id) + return self._context @property def memory_resource(self) -> MemoryResource: @@ -1237,6 +1227,7 @@ class Device: HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True + self._context = ctx # Store owning context reference if prev_ctx != NULL: return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) else: @@ -1247,6 +1238,7 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) self._has_inited = True + self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context def create_context(self, options: ContextOptions = None) -> Context: """Create a new :obj:`~_context.Context` object. @@ -1297,7 +1289,7 @@ class Device: """ self._check_context_initialized() - return Stream._init(obj=obj, options=options, device_id=self._device_id) + return Stream._init(obj=obj, options=options, device_id=self._device_id, ctx=self._context) def create_event(self, options: EventOptions | None = None) -> Event: """Create an Event object without recording it to a Stream. @@ -1318,11 +1310,8 @@ class Device: """ self._check_context_initialized() - cdef ContextHandle h_context - h_context = get_current_context() - if h_context.get() == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - return cyEvent._init(cyEvent, self._device_id, h_context, options, True) + cdef Context ctx = self._context + return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate device memory from a specified stream. 
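
Aside on the handle layout used above: StreamHandle(box, &box->resource) and
the analogous Event and Context handles rely on shared_ptr's aliasing
constructor, so callers see only the raw resource while the box (and anything
its deleter captured) stays alive. A minimal sketch with a stand-in Box type
and int payload, not the real StreamBox:

    #include <cassert>
    #include <memory>

    struct Box { int resource; };  // stands in for StreamBox/EventBox

    int main() {
        auto box = std::make_shared<Box>(Box{42});
        // Aliasing constructor: shares box's control block, points at the member.
        std::shared_ptr<const int> h(box, &box->resource);
        box.reset();               // the Box stays alive: h still owns it
        assert(*h == 42);          // the exposed "resource" is all callers see
    }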
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 0423ef0ec8..711b28ffcb 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -27,19 +27,22 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUstream] StreamHandle # Create an owning stream handle via cuStreamCreateWithPriority + # Context handle establishes structural dependency (context outlives stream) # Returns empty handle on error (caller must check) - StreamHandle create_stream_handle(unsigned int flags, int priority) nogil + StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) nogil # Create a non-owning stream handle (stream NOT destroyed when handle released) + # Caller is responsible for keeping the stream's context alive StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil # Create non-owning handle that prevents Python owner from being GC'd + # Owner is responsible for keeping the stream's context alive StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) - # Get non-owning handle to the legacy default stream + # Get non-owning handle to the legacy default stream (no context dependency) StreamHandle get_legacy_stream() nogil - # Get non-owning handle to the per-thread default stream + # Get non-owning handle to the per-thread default stream (no context dependency) StreamHandle get_per_thread_stream() nogil # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 3ba38095e4..d75e2bef0e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -117,16 +117,23 @@ cdef class Stream: return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): + def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, + ctx: Context = None): cdef StreamHandle h_stream cdef cydriver.CUstream borrowed + cdef ContextHandle h_context cdef Stream self + # Extract context handle if provided + if ctx is not None: + h_context = (ctx)._h_context + if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: # Borrowed stream from foreign object # C++ handle prevents owner from being GC'd until handle is released + # Owner is responsible for keeping the stream's context alive borrowed = _handle_from_stream_protocol(obj) h_stream = create_stream_handle_with_owner(borrowed, obj) return Stream._from_handle(cls, h_stream) @@ -149,8 +156,8 @@ cdef class Stream: else: prio = high - # C++ creates the stream and returns owning handle - h_stream = create_stream_handle(flags, prio) + # C++ creates the stream and returns owning handle with context dependency + h_stream = create_stream_handle(h_context, flags, prio) if not h_stream: raise RuntimeError("Failed to create CUDA stream") self = Stream._from_handle(cls, h_stream) From 6268b6e7ab7b0341f60e1c65e2dad5911719e8b0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 10 Dec 2025 22:08:44 -0800 Subject: [PATCH 13/38] Convert Event to use resource handles Event now uses EventHandle (shared_ptr) for RAII-based lifetime management, following the same pattern as Stream. 
C++ changes: - Add EventHandle type alias and EventBox struct - Add create_event_handle(h_ctx, flags) with context captured in deleter - Add create_event_handle_ipc(ipc_handle) for IPC events (no context dep) - Add native(), intptr(), py() overloads for EventHandle Cython changes: - Event._h_event replaces raw CUevent _handle - _init() uses create_event_handle() - from_ipc_descriptor() uses create_event_handle_ipc() - close() uses _h_event.reset() - Keep _h_context for cached fast access --- .../experimental/_cpp/resource_handles.cpp | 59 +++++++++++++++++++ .../experimental/_cpp/resource_handles.hpp | 38 ++++++++++++ cuda_core/cuda/core/experimental/_event.pxd | 6 +- cuda_core/cuda/core/experimental/_event.pyx | 56 ++++++++++-------- .../core/experimental/_resource_handles.pxd | 18 ++++++ cuda_core/cuda/core/experimental/_stream.pyx | 3 +- 6 files changed, 153 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index f39fc10816..a236176b95 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -230,4 +230,63 @@ StreamHandle get_per_thread_stream() noexcept { return handle; } +// ============================================================================ +// Event Handles +// ============================================================================ + +// Internal box structure for Event +struct EventBox { + CUevent resource; +}; + +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { + // Creates an owning event handle - calls cuEventCreate internally. + // The context handle is captured in the deleter to ensure context outlives the event. + // Returns empty handle on error (caller must check). + CUevent event; + CUresult err; + { + GILReleaseGuard gil; + err = cuEventCreate(&event, flags); + } + if (err != CUDA_SUCCESS) { + return EventHandle(); + } + + // Capture h_ctx in lambda - shared_ptr control block keeps it alive + auto box = std::shared_ptr(new EventBox{event}, [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + // h_ctx destructor runs here when last event reference is released + }); + + // Use aliasing constructor to expose only CUevent + return EventHandle(box, &box->resource); +} + +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + // Creates an owning event handle from an IPC handle. + // The originating process owns the event and its context. + // Returns empty handle on error (caller must check). 
+ CUevent event; + CUresult err; + { + GILReleaseGuard gil; + err = cuIpcOpenEventHandle(&event, ipc_handle); + } + if (err != CUDA_SUCCESS) { + return EventHandle(); + } + + auto box = std::shared_ptr(new EventBox{event}, [](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + }); + + // Use aliasing constructor to expose only CUevent + return EventHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 06b04ba974..44a8dd823a 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -17,6 +17,7 @@ namespace cuda_core { using ContextHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -62,6 +63,22 @@ StreamHandle get_legacy_stream() noexcept; // Note: Per-thread stream has no specific context dependency. StreamHandle get_per_thread_stream() noexcept; +// ============================================================================ +// Event handle functions +// ============================================================================ + +// Create an owning event handle by calling cuEventCreate. +// The event structurally depends on the provided context handle. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); + +// Create an owning event handle from an IPC handle. +// The originating process owns the event and its context. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -75,6 +92,10 @@ inline CUstream native(const StreamHandle& h) noexcept { return h ? *h : nullptr; } +inline CUevent native(const EventHandle& h) noexcept { + return h ? *h : nullptr; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -84,6 +105,10 @@ inline std::uintptr_t intptr(const StreamHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const EventHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -112,4 +137,17 @@ inline PyObject* py(const StreamHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const EventHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUevent"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast<std::uintptr_t>(*h) : 0;
+  return PyObject_CallFunction(cls, "K", val);
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd
index d92c9627c3..29317dde66 100644
--- a/cuda_core/cuda/core/experimental/_event.pxd
+++ b/cuda_core/cuda/core/experimental/_event.pxd
@@ -3,19 +3,19 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, EventHandle
 
 
 cdef class Event:
 
     cdef:
-        cydriver.CUevent _handle
+        EventHandle _h_event
+        ContextHandle _h_context  # Cached for fast access
         bint _timing_disabled
        bint _busy_waited
         bint _ipc_enabled
         object _ipc_descriptor
         int _device_id
-        ContextHandle _h_context
 
     @staticmethod
     cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free)
diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 2ac284d8c9..763df94fe3 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -9,7 +9,15 @@ from libc.stdint cimport uintptr_t
 from libc.string cimport memcpy
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._context cimport Context
-from cuda.core.experimental._resource_handles cimport ContextHandle, intptr
+from cuda.core.experimental._resource_handles cimport (
+    ContextHandle,
+    EventHandle,
+    create_event_handle,
+    create_event_handle_ipc,
+    intptr,
+    native,
+    py,
+)
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN
@@ -81,8 +89,6 @@ cdef class Event:
     and they should instead be created through a :obj:`~_stream.Stream` object.
 
     """
-    def __cinit__(self):
-        self._handle = <cydriver.CUevent>(NULL)
 
     def __init__(self, *args, **kwargs):
         raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")
@@ -111,23 +117,24 @@
             self._ipc_enabled = True
             if not self._timing_disabled:
                 raise TypeError("IPC-enabled events cannot use timing.")
-        with nogil:
-            HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags))
-        self._device_id = device_id
+        # C++ creates the event and returns owning handle with context dependency
+        cdef EventHandle h_event = create_event_handle(h_context, flags)
+        if not h_event:
+            raise RuntimeError("Failed to create CUDA event")
+        self._h_event = h_event
         self._h_context = h_context
+        self._device_id = device_id
         if opts.ipc_enabled:
             self.get_ipc_descriptor()
         return self
 
     cpdef close(self):
-        """Destroy the event."""
-        if self._handle != NULL:
-            with nogil:
-                HANDLE_RETURN(cydriver.cuEventDestroy(self._handle))
-            self._handle = <cydriver.CUevent>(NULL)
+        """Destroy the event.
 
-    def __dealloc__(self):
-        self.close()
+        Releases the event handle. The underlying CUDA event is destroyed
+        when the last reference is released.
+ """ + self._h_event.reset() def __isub__(self, other): return NotImplemented @@ -139,7 +146,7 @@ cdef class Event: # return self - other (in milliseconds) cdef float timing with nogil: - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + err = cydriver.cuEventElapsedTime(&timing, native((other)._h_event), native(self._h_event)) if err == 0: return timing else: @@ -165,14 +172,14 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((type(self), intptr(self._h_context), (self._handle))) + return hash((type(self), intptr(self._h_context), intptr(self._h_event))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. if not isinstance(other, Event): return NotImplemented cdef Event _other = other - return (self._handle) == (_other._handle) + return intptr(self._h_event) == intptr(_other._h_event) def get_ipc_descriptor(self) -> IPCEventDescriptor: """Export an event allocated for sharing between processes.""" @@ -182,7 +189,7 @@ cdef class Event: raise RuntimeError("Event is not IPC-enabled") cdef cydriver.CUipcEventHandle data with nogil: - HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, (self._handle))) + HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, native(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) return self._ipc_descriptor @@ -193,14 +200,17 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - with nogil: - HANDLE_RETURN(cydriver.cuIpcOpenEventHandle(&self._handle, data)) + # IPC events: the originating process owns the event and its context + cdef EventHandle h_event = create_event_handle_ipc(data) + if not h_event: + raise RuntimeError("Failed to open IPC event handle") + self._h_event = h_event + self._h_context = ContextHandle() self._timing_disabled = True self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor self._device_id = -1 - self._h_context = ContextHandle() return self @property @@ -229,13 +239,13 @@ cdef class Event: """ with nogil: - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuEventSynchronize(native(self._h_event))) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" with nogil: - result = cydriver.cuEventQuery(self._handle) + result = cydriver.cuEventQuery(native(self._h_event)) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: @@ -251,7 +261,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. 
""" - return driver.CUevent((self._handle)) + return py(self._h_event) @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 711b28ffcb..4c33a9f358 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -45,6 +45,21 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Get non-owning handle to the per-thread default stream (no context dependency) StreamHandle get_per_thread_stream() nogil + # ======================================================================== + # Event Handle + # ======================================================================== + ctypedef shared_ptr[const cydriver.CUevent] EventHandle + + # Create an owning event handle via cuEventCreate + # Context handle establishes structural dependency (context outlives event) + # Returns empty handle on error (caller must check) + EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil + + # Create an owning event handle from IPC handle + # The originating process owns the event and its context + # Returns empty handle on error (caller must check) + EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -52,11 +67,14 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # native() - extract the raw CUDA handle cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil + cydriver.CUevent native(EventHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil + uintptr_t intptr(EventHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) + object py(EventHandle h) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index d75e2bef0e..a3f9149d3e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -27,6 +27,7 @@ from cuda.core.experimental._context cimport Context from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, + EventHandle, StreamHandle, create_context_handle_ref, create_stream_handle, @@ -265,7 +266,7 @@ cdef class Stream: "new event by supplying options." 
 )
-        cdef cydriver.CUevent e = (<Event>(event))._handle
+        cdef cydriver.CUevent e = native((<Event>(event))._h_event)
         with nogil:
             HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream)))
         return event
From 1082f5a7302479d0a379222264f003e0479d43dd Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 11 Dec 2025 08:19:28 -0800
Subject: [PATCH 14/38] Clean up Stream.wait() to use EventHandle for
 temporary events

- Simplified branch structure: early return for Event, single path for Stream
- Use native() helper for handle access instead of casting via handle property
- Temporary events now use EventHandle with RAII cleanup (no explicit
  cuEventDestroy)
- Added create_event_handle import
---
 cuda_core/cuda/core/experimental/_stream.pyx | 47 +++++++++++---------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index a3f9149d3e..078497c066 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -30,6 +30,7 @@ from cuda.core.experimental._resource_handles cimport (
     EventHandle,
     StreamHandle,
     create_context_handle_ref,
+    create_event_handle,
     create_stream_handle,
     create_stream_handle_with_owner,
     get_current_context,
@@ -281,32 +282,36 @@ cdef class Stream:
         on the stream and then waiting on it.
 
         """
-        cdef cydriver.CUevent event
-        cdef cydriver.CUstream stream
+        cdef Stream stream
+        cdef EventHandle h_event
 
+        # Handle Event directly
         if isinstance(event_or_stream, Event):
-            event = <cydriver.CUevent><uintptr_t>(event_or_stream.handle)
             with nogil:
                 # TODO: support flags other than 0?
-                HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0))
+                HANDLE_RETURN(cydriver.cuStreamWaitEvent(
+                    native(self._h_stream), native((<Event>event_or_stream)._h_event), 0))
+            return
+
+        # Convert to Stream if needed
+        if isinstance(event_or_stream, Stream):
+            stream = event_or_stream
         else:
-            if isinstance(event_or_stream, Stream):
-                stream = <cydriver.CUstream><uintptr_t>(event_or_stream.handle)
-            else:
-                try:
-                    s = Stream._init(obj=event_or_stream)
-                except Exception as e:
-                    raise ValueError(
-                        "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
-                        f" got {type(event_or_stream)}"
-                    ) from e
-                stream = <cydriver.CUstream><uintptr_t>(s.handle)
-            with nogil:
-                HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
-                HANDLE_RETURN(cydriver.cuEventRecord(event, stream))
-                # TODO: support flags other than 0?
-                HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0))
-                HANDLE_RETURN(cydriver.cuEventDestroy(event))
+            try:
+                stream = Stream._init(obj=event_or_stream)
+            except Exception as e:
+                raise ValueError(
+                    "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
+                    f" got {type(event_or_stream)}"
+                ) from e
+
+        # Wait on stream via temporary event
+        Stream_ensure_ctx(self)
+        h_event = create_event_handle(self._h_context, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream)))
+            # TODO: support flags other than 0?
+ HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) @property def device(self) -> Device: From cd81f485681dc85a0c88af08f77de0606917f656 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 08:34:07 -0800 Subject: [PATCH 15/38] Add create_event_handle overload for temporary events - New overload takes only flags (no ContextHandle) for temporary events - Delegates to existing overload with empty ContextHandle - Updated _stream.pyx and _memoryview.pyx to use simpler overload - Removed unnecessary get_current_context import from _memoryview.pyx - Removed unnecessary Stream_ensure_ctx call from Stream.wait() --- .../experimental/_cpp/resource_handles.cpp | 7 +++++++ .../experimental/_cpp/resource_handles.hpp | 6 ++++++ .../cuda/core/experimental/_memoryview.pyx | 20 ++++++++++++++----- .../core/experimental/_resource_handles.pxd | 5 +++++ cuda_core/cuda/core/experimental/_stream.pyx | 3 +-- 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a236176b95..860aae4857 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -265,6 +265,13 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { return EventHandle(box, &box->resource); } +EventHandle create_event_handle(unsigned int flags) { + // Creates an owning event handle without context dependency. + // Use for temporary events that are created and destroyed in the same scope. + // Returns empty handle on error (caller must check). + return create_event_handle(ContextHandle{}, flags); +} + EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { // Creates an owning event handle from an IPC handle. // The originating process owns the event and its context. diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 44a8dd823a..eb6475f758 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -73,6 +73,12 @@ StreamHandle get_per_thread_stream() noexcept; // Returns empty handle on error (caller must check). EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); +// Create an owning event handle without context dependency. +// Use for temporary events that are created and destroyed in the same scope. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(unsigned int flags); + // Create an owning event handle from an IPC handle. // The originating process owns the event and its context. // When the last reference is released, cuEventDestroy is called automatically. 
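+// Returns empty handle on error (caller must check).
+EventHandle create_event_handle(unsigned int flags);
+
 // Create an owning event handle from an IPC handle.
 // The originating process owns the event and its context.
 // When the last reference is released, cuEventDestroy is called automatically.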
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9e13ebea45..718736e5cf 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -13,7 +13,15 @@ from typing import Optional import numpy +from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ( + EventHandle, + create_event_handle, + native, +) from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core.experimental._utils cimport cuda_utils +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from cuda.core.experimental._memory import Buffer @@ -579,6 +587,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.ptr)) cdef intptr_t producer_s, consumer_s + cdef EventHandle h_event stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") @@ -588,11 +597,12 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - e = handle_return(driver.cuEventCreate( - driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(e, producer_s)) - handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) - handle_return(driver.cuEventDestroy(e)) + h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + with nogil: + HANDLE_RETURN(cydriver.cuEventRecord( + native(h_event), producer_s)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + consumer_s, native(h_event), 0)) return buf diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 4c33a9f358..7c10599f8d 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -55,6 +55,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil + # Create an owning event handle without context dependency + # Use for temporary events that are created and destroyed in the same scope + # Returns empty handle on error (caller must check) + EventHandle create_event_handle(unsigned int flags) nogil + # Create an owning event handle from IPC handle # The originating process owns the event and its context # Returns empty handle on error (caller must check) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 078497c066..4a16399323 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -306,8 +306,7 @@ cdef class Stream: ) from e # Wait on stream via temporary event - Stream_ensure_ctx(self) - h_event = create_event_handle(self._h_context, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? 
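Taken together with the previous commit, a cross-stream wait now reduces to a
small RAII pattern. A rough C++ equivalent of the temporary-event path
(assuming `producer` and `consumer` are existing CUstreams; error handling
elided):

    // Order `consumer` behind work already submitted to `producer`.
    EventHandle h_event = create_event_handle(CU_EVENT_DISABLE_TIMING);
    cuEventRecord(native(h_event), producer);
    cuStreamWaitEvent(consumer, native(h_event), 0);
    // cuEventDestroy runs when h_event drops its last reference.
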
From 2b798f27a2918b5d1898ecf392ccd78d8bdf3c7c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 09:58:09 -0800 Subject: [PATCH 16/38] Convert DeviceMemoryResource to use MemoryPoolHandle C++ layer (resource_handles.hpp/cpp): - Add MemoryPoolHandle = std::shared_ptr - Add create_mempool_handle(props) - owning, calls cuMemPoolDestroy on release - Add create_mempool_handle_ref(pool) - non-owning reference - Add create_mempool_handle_ipc(fd, handle_type) - owning from IPC import - Add get_device_mempool(device_id) - get current pool for device (non-owning) - Add native(), intptr(), py() overloads for MemoryPoolHandle Cython layer: - Update _resource_handles.pxd with new types and functions - Update _device_memory_resource.pxd: replace raw handle with MemoryPoolHandle - Reorder members: _h_pool first (matches Stream/Event pattern) - Update _device_memory_resource.pyx to use new handle functions - Update _ipc.pyx to use create_mempool_handle_ipc for IPC imports - DMR_close now uses RAII (_h_pool.reset()) instead of explicit cuMemPoolDestroy - Consistent member initialization order across __cinit__, init functions, and close --- .../experimental/_cpp/resource_handles.cpp | 90 +++++++++++++++++++ .../experimental/_cpp/resource_handles.hpp | 47 ++++++++++ .../_memory/_device_memory_resource.pxd | 15 ++-- .../_memory/_device_memory_resource.pyx | 77 ++++++++-------- .../cuda/core/experimental/_memory/_ipc.pyx | 24 ++--- .../core/experimental/_resource_handles.pxd | 26 ++++++ 6 files changed, 221 insertions(+), 58 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 860aae4857..8935a358e1 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -296,4 +296,94 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { return EventHandle(box, &box->resource); } +// ============================================================================ +// Memory Pool Handles +// ============================================================================ + +// Internal box structure for MemoryPool +struct MemoryPoolBox { + CUmemoryPool resource; +}; + +// Helper to clear peer access before destroying a memory pool. +// Works around nvbug 5698116: recycled pool handles inherit peer access state. +static void clear_mempool_peer_access(CUmemoryPool pool) { + int device_count = 0; + if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + return; + } + + std::vector clear_access(device_count); + for (int i = 0; i < device_count; ++i) { + clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + clear_access[i].location.id = i; + clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + } + + // Ignore errors - best effort cleanup + cuMemPoolSetAccess(pool, clear_access.data(), device_count); +} + +// Helper to wrap a raw pool in an owning handle. +// The deleter clears peer access (nvbug 5698116 workaround) and destroys the pool. 
+static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { + auto box = std::shared_ptr(new MemoryPoolBox{pool}, [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + cuMemPoolDestroy(b->resource); + delete b; + }); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + // Creates an owning memory pool handle - calls cuMemPoolCreate internally. + // Memory pools are device-scoped (not context-scoped). + // Returns empty handle on error (caller must check). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolCreate(&pool, &props); + } + return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); +} + +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { + // Creates a non-owning handle - pool will NOT be destroyed. + // Use for device default/current pools managed by the driver. + auto box = std::shared_ptr(new MemoryPoolBox{pool}); + + // Use aliasing constructor to expose only CUmemoryPool + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle get_device_mempool(int device_id) noexcept { + // Get the current memory pool for a device. + // Returns a non-owning handle (pool managed by driver). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuDeviceGetMemPool(&pool, device_id); + } + if (err != CUDA_SUCCESS) { + return MemoryPoolHandle(); + } + return create_mempool_handle_ref(pool); +} + +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + // Creates an owning memory pool handle from an IPC import. + // The file descriptor is NOT owned by this handle. + // Returns empty handle on error (caller must check). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolImportFromShareableHandle(&pool, reinterpret_cast(static_cast(fd)), handle_type, 0); + } + return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index eb6475f758..83a68c8b40 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -18,6 +18,7 @@ namespace cuda_core { using ContextHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -85,6 +86,31 @@ EventHandle create_event_handle(unsigned int flags); // Returns empty handle on error (caller must check). EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); +// ============================================================================ +// Memory pool handle functions +// ============================================================================ + +// Create an owning memory pool handle by calling cuMemPoolCreate. +// Memory pools are device-scoped (not context-scoped). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props); + +// Create a non-owning memory pool handle (references existing pool). +// Use for device default/current pools that are managed by the driver. 
+// The pool will NOT be destroyed when the handle is released. +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool); + +// Get non-owning handle to the current memory pool for a device. +// Returns empty handle on error (caller must check). +MemoryPoolHandle get_device_mempool(int device_id) noexcept; + +// Create an owning memory pool handle from an IPC import. +// The file descriptor is NOT owned by this handle (caller manages FD separately). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -102,6 +128,10 @@ inline CUevent native(const EventHandle& h) noexcept { return h ? *h : nullptr; } +inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { + return h ? *h : nullptr; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -115,6 +145,10 @@ inline std::uintptr_t intptr(const EventHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -156,4 +190,17 @@ inline PyObject* py(const EventHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const MemoryPoolHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUmemoryPool"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd index d31ff7b2e1..9b5c384d39 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd @@ -5,17 +5,18 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core.experimental._resource_handles cimport MemoryPoolHandle cdef class DeviceMemoryResource(MemoryResource): cdef: - int _device_id - cydriver.CUmemoryPool _handle - bint _mempool_owned - IPCDataForMR _ipc_data - object _attributes - object _peer_accessible_by - object __weakref__ + MemoryPoolHandle _h_pool + int _device_id + bint _pool_owned + IPCDataForMR _ipc_data + object _attributes + object _peer_accessible_by + object __weakref__ cpdef DMR_mempool_get_access(DeviceMemoryResource, int) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index d06f0b8297..b009408a43 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -13,7 +13,13 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR -from cuda.core.experimental._resource_handles cimport native +from cuda.core.experimental._resource_handles cimport ( + MemoryPoolHandle, + create_mempool_handle, + get_device_mempool, + native, + py, +) from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, @@ -77,7 +83,7 @@ cdef class DeviceMemoryResourceAttributes: cdef DeviceMemoryResource mr = (self._mr_weakref()) if mr is None: raise RuntimeError("DeviceMemoryResource is expired") - cdef cydriver.CUmemoryPool pool_handle = mr._handle + cdef cydriver.CUmemoryPool pool_handle = native(mr._h_pool) with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) return 0 @@ -219,9 +225,9 @@ cdef class DeviceMemoryResource(MemoryResource): """ def __cinit__(self): + # _h_pool is default-initialized (empty shared_ptr) by C++ self._device_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._mempool_owned = False + self._pool_owned = False self._ipc_data = None self._attributes = None self._peer_accessible_by = () @@ -239,9 +245,6 @@ cdef class DeviceMemoryResource(MemoryResource): else: DMR_init_create(self, c_device_id, opts) - def __dealloc__(self): - DMR_close(self) - def close(self): """ Close the device memory resource and destroy the associated memory pool @@ -371,7 +374,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._handle)) + return py(self._h_pool) @property def is_device_accessible(self) -> bool: @@ -381,7 +384,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def is_handle_owned(self) -> bool: """Whether the memory resource handle is owned. 
If False, ``close`` has no effect.""" - return self._mempool_owned + return self._pool_owned @property def is_host_accessible(self) -> bool: @@ -469,7 +472,7 @@ cdef class DeviceMemoryResource(MemoryResource): i += 1 with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count)) + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(native(self._h_pool), access_desc, count)) finally: if access_desc != NULL: free(access_desc) @@ -485,19 +488,20 @@ cdef void DMR_init_current(DeviceMemoryResource self, int device_id): cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + self._h_pool = get_device_mempool(device_id) + if not self._h_pool: + raise RuntimeError("Failed to get device memory pool") self._device_id = device_id - self._mempool_owned = False + self._pool_owned = False with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), device_id)) - # Set a higher release threshold to improve performance when there are # no active allocations. By default, the release threshold is 0, which # means memory is immediately released back to the OS when there are no # active suballocations, causing performance issues. HANDLE_RETURN( cydriver.cuMemPoolGetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold ) @@ -506,7 +510,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int device_id): # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) @@ -530,16 +534,15 @@ cdef void DMR_init_create( properties.win32SecurityAttributes = NULL properties.usage = 0 + self._h_pool = create_mempool_handle(properties) + if not self._h_pool: + raise RuntimeError("Failed to create memory pool") self._device_id = device_id - self._mempool_owned = True - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) - # TODO: should we also set the threshold here? - + self._pool_owned = True if opts.ipc_enabled: alloc_handle = _ipc.DMR_export_mempool(self) self._ipc_data = IPCDataForMR(alloc_handle, False) + # TODO: should we also set the threshold here? # Raise an exception if the given stream is capturing. @@ -554,10 +557,11 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef cydriver.CUmemoryPool pool = native(self._h_pool) cdef cydriver.CUdeviceptr devptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, pool, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None @@ -580,25 +584,18 @@ cdef inline void DMR_deallocate( cdef inline DMR_close(DeviceMemoryResource self): - if self._handle == NULL: + if not self._h_pool: return - # This works around nvbug 5698116. When a memory pool handle is recycled - # the new handle inherits the peer access state of the previous handle. 
- if self._peer_accessible_by: - self.peer_accessible_by = [] - - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) - finally: - self._device_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._attributes = None - self._mempool_owned = False - self._ipc_data = None - self._peer_accessible_by = () + # Reset members in declaration order. + # The RAII deleter handles nvbug 5698116 workaround (clears peer access) + # and calls cuMemPoolDestroy if this is an owning handle. + self._h_pool.reset() + self._device_id = cydriver.CU_DEVICE_INVALID + self._pool_owned = False + self._ipc_data = None + self._attributes = None + self._peer_accessible_by = () # Note: this is referenced in instructions to debug nvbug 5698116. @@ -626,7 +623,7 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): location.id = c_device_id with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, native(dmr._h_pool), &location)) if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: return "rw" diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 7c5a9b0409..4eb062dda0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -8,6 +8,11 @@ from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._resource_handles cimport ( + MemoryPoolHandle, + create_mempool_handle_ipc, + native, +) from cuda.core.experimental._stream cimport default_stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method @@ -185,7 +190,7 @@ cdef Buffer Buffer_from_ipc_descriptor( ) cdef cydriver.CUdeviceptr ptr with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, native(mr._h_pool), &data)) return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) @@ -209,20 +214,17 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl os.close(fd) raise - # Construct a new DMR. + # Construct a new DMR (set members in declaration order). cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) + cdef int ipc_fd = int(alloc_handle) + self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE) + if not self._h_pool: + raise RuntimeError("Failed to import memory pool from IPC handle") from .._device import Device self._device_id = Device(device_id).device_id - self._mempool_owned = True + self._pool_owned = True self._ipc_data = IPCDataForMR(alloc_handle, True) - # Map the mempool into this process. - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) - ) - # Register it. 
if uuid is not None: registered = self.register(uuid) @@ -253,7 +255,7 @@ cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): cdef int fd with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &fd, self._handle, IPC_HANDLE_TYPE, 0) + &fd, native(self._h_pool), IPC_HANDLE_TYPE, 0) ) try: return IPCAllocationHandle._init(fd, uuid.uuid4()) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 7c10599f8d..6bb172e64d 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -65,6 +65,29 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil + # ======================================================================== + # Memory Pool Handle + # ======================================================================== + ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle + + # Create an owning memory pool handle via cuMemPoolCreate + # Memory pools are device-scoped (not context-scoped) + # Returns empty handle on error (caller must check) + MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) nogil + + # Create a non-owning memory pool handle (pool NOT destroyed when released) + # Use for device default/current pools managed by the driver + MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) nogil + + # Get non-owning handle to the current memory pool for a device + # Returns empty handle on error (caller must check) + MemoryPoolHandle get_device_mempool(int device_id) nogil + + # Create an owning memory pool handle from IPC import + # File descriptor NOT owned by this handle (caller manages FD separately) + # Returns empty handle on error (caller must check) + MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -73,13 +96,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil + cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil uintptr_t intptr(EventHandle h) nogil + uintptr_t intptr(MemoryPoolHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) + object py(MemoryPoolHandle h) From 63d263dc17f41c0eb605a26bee140502a29d5190 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 12:52:20 -0800 Subject: [PATCH 17/38] Add DevicePtrHandle for RAII device pointer management Introduce DevicePtrHandle (std::shared_ptr) to manage device pointer lifetimes with automatic deallocation. 
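In outline, each pointer is boxed together with a mutable deallocation stream
that the deleter consults at release time; a minimal sketch using the names
from this patch (error handling and the GIL guard elided):

    struct DevicePtrBox {
        CUdeviceptr resource;
        mutable StreamHandle h_stream;  // consulted by the deleter at release
    };

    // Deleter used for pool allocations: capturing h_pool keeps the pool
    // alive; a NULL stream makes cuMemFreeAsync use the legacy default stream.
    auto deleter = [h_pool](DevicePtrBox* b) {
        cuMemFreeAsync(b->resource, native(b->h_stream));
        delete b;
    };
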
Key features: - Allocation functions: deviceptr_alloc_from_pool, deviceptr_alloc_async, deviceptr_alloc, deviceptr_alloc_host, deviceptr_create_ref - IPC import via deviceptr_import_ipc with error output parameter - Deallocation stream stored in mutable DevicePtrBox, accessible via deallocation_stream() and set_deallocation_stream() - cuMemFreeAsync used for deallocation (NULL stream = legacy default) - Buffer class updated to use DevicePtrHandle instead of raw pointers - Buffer.handle returns integer for backward compatibility with ctypes - IPCBufferDescriptor.payload_ptr() helper to simplify casting Note: IPC-imported pointers do not yet implement reference counting workaround for nvbug 5570902. --- .../experimental/_cpp/resource_handles.cpp | 170 +++++++++++ .../experimental/_cpp/resource_handles.hpp | 73 +++++ .../core/experimental/_memory/_buffer.pxd | 23 +- .../core/experimental/_memory/_buffer.pyx | 266 ++++++------------ .../_memory/_device_memory_resource.pyx | 18 +- .../_memory/_graph_memory_resource.pyx | 21 +- .../cuda/core/experimental/_memory/_ipc.pxd | 2 + .../cuda/core/experimental/_memory/_ipc.pyx | 29 +- .../core/experimental/_resource_handles.pxd | 47 ++++ 9 files changed, 426 insertions(+), 223 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 8935a358e1..b5ccfff105 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -386,4 +386,174 @@ MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType han return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); } +// ============================================================================ +// Device Pointer Handles +// ============================================================================ + +// Internal box structure for DevicePtr. +// The h_stream is mutable to allow updating the deallocation stream after creation. +struct DevicePtrBox { + CUdeviceptr resource; + mutable StreamHandle h_stream; +}; + +// Internal helper to retrieve the box from a handle (for deallocation_stream access). +static DevicePtrBox* get_box(const DevicePtrHandle& h) { + const CUdeviceptr* p = h.get(); + return reinterpret_cast( + reinterpret_cast(const_cast(p)) + - offsetof(DevicePtrBox, resource) + ); +} + +StreamHandle deallocation_stream(const DevicePtrHandle& h) { + return get_box(h)->h_stream; +} + +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { + get_box(h)->h_stream = std::move(h_stream); +} + +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) +{ + // Allocate from pool asynchronously. + // Pool handle is captured in deleter to keep pool alive. 
+ CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + // h_pool destructor runs here, releasing pool reference + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { + // Allocate asynchronously (not from a specific pool). + CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocAsync(&ptr, size, native(h_stream)); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc(size_t size) { + // Allocate synchronously. + CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAlloc(&ptr, size); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + cuMemFree(b->resource); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_host(size_t size) { + // Allocate pinned host memory. + void* ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocHost(&ptr, size); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + cuMemFreeHost(reinterpret_cast(b->resource)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { + // Non-owning reference - pointer will NOT be freed. + auto box = std::shared_ptr(new DevicePtrBox{ptr, StreamHandle{}}); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + CUresult* error_out) +{ + // Import pointer from IPC. + // Note: Does not implement reference counting workaround for nvbug 5570902 yet. 
+ CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolImportPointer(&ptr, *h_pool, + const_cast( + reinterpret_cast(export_data))); + } + if (error_out) { + *error_out = err; + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + // h_pool destructor runs here + } + ); + return DevicePtrHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 83a68c8b40..5ce6671fa9 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -111,6 +111,58 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept; // Returns empty handle on error (caller must check). MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); +// ============================================================================ +// Device pointer handle functions +// ============================================================================ + +using DevicePtrHandle = std::shared_ptr; + +// Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync. +// The pointer structurally depends on the provided pool handle (captured in deleter). +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream); + +// Allocate device memory asynchronously via cuMemAllocAsync. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream); + +// Allocate device memory synchronously via cuMemAlloc. +// When the last reference is released, cuMemFree is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc(size_t size); + +// Allocate pinned host memory via cuMemAllocHost. +// When the last reference is released, cuMemFreeHost is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_host(size_t size); + +// Create a non-owning device pointer handle (references existing pointer). +// Use for foreign pointers (e.g., from external libraries). +// The pointer will NOT be freed when the handle is released. +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); + +// Import a device pointer from IPC via cuMemPoolImportPointer. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Note: Does not yet implement reference counting for nvbug 5570902. +// Error code is written to error_out (caller must check). +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + CUresult* error_out); + +// Access the deallocation stream for a device pointer handle (read-only). +// For non-owning handles, the stream is not used but can still be accessed. +StreamHandle deallocation_stream(const DevicePtrHandle& h); + +// Set the deallocation stream for a device pointer handle. 
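+// The stream set here is the one cuMemFreeAsync uses when the owning
+// handle releases its last reference.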
+void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -132,6 +184,10 @@ inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { return h ? *h : nullptr; } +inline CUdeviceptr native(const DevicePtrHandle& h) noexcept { + return h ? *h : 0; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -149,6 +205,10 @@ inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const DevicePtrHandle& h) noexcept { + return h ? static_cast(*h) : 0; +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -203,4 +263,17 @@ inline PyObject* py(const MemoryPoolHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const DevicePtrHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUdeviceptr"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? static_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index b581dcd293..81653dafd5 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.core.experimental._resource_handles cimport DevicePtrHandle from cuda.core.experimental._stream cimport Stream @@ -15,16 +16,20 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - uintptr_t _ptr - size_t _size - MemoryResource _memory_resource - object _ipc_data - object _owner - object _ptr_obj - Stream _alloc_stream - _MemAttrs _mem_attrs - bint _mem_attrs_inited + DevicePtrHandle _h_ptr + size_t _size + MemoryResource _memory_resource + object _ipc_data cdef class MemoryResource: pass + + +# Helper function to create a Buffer from a DevicePtrHandle +cdef Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = * +) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index b26471ed0e..c7ab15ae95 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -11,6 +11,15 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer from cuda.core.experimental._memory cimport _ipc +from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, + StreamHandle, + deviceptr_create_ref, + intptr, + native, + py, + set_deallocation_stream, +) from cuda.core.experimental._stream cimport Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -43,39 +52,39 @@ cdef class Buffer: self._clear() def 
_clear(self): - self._ptr = 0 + # _h_ptr is default-initialized (empty shared_ptr) by C++ self._size = 0 self._memory_resource = None self._ipc_data = None - self._ptr_obj = None - self._alloc_stream = None - self._owner = None - self._mem_attrs_inited = False def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. " "Please use MemoryResource APIs.") + # Note: _init_from_handle is a cdef inline function, not a method + # See Buffer_init_from_handle below + @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, owner : object | None = None ): + """Legacy init for compatibility - creates a non-owning ref handle. + + Note: The stream parameter is accepted for API compatibility but is + ignored since non-owning refs are never freed by the handle. + """ cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr + self._h_ptr = deviceptr_create_ref((int(ptr))) self._size = size if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None - self._alloc_stream = (stream) if stream is not None else None - self._owner = owner return self - def __dealloc__(self): - self.close(self._alloc_stream) + # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor def __reduce__(self): # Must not serialize the parent's stream! @@ -96,13 +105,14 @@ cdef class Buffer: Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional Memory resource associated with the buffer - owner : object, optional - An object holding external allocation that the ``ptr`` points to. - The reference is kept as long as the buffer is alive. - The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + This creates a non-owning reference. The pointer will NOT be freed + when the Buffer is closed or garbage collected. """ - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr, owner=owner) + cdef DevicePtrHandle h_ptr = deviceptr_create_ref((int(ptr))) + return Buffer_from_deviceptr_handle(h_ptr, size, mr) @classmethod def from_ipc_descriptor( @@ -128,7 +138,7 @@ cdef class Buffer: ---------- stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. + the deallocation stream stored in the handle is used. 
""" Buffer_close(self, stream) @@ -163,14 +173,8 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle - with nogil: - HANDLE_RETURN(cydriver.cuMemcpyAsync( - dst._ptr, - self._ptr, - src_size, - s - )) + err, = driver.cuMemcpyAsync(native(dst._h_ptr), native(self._h_ptr), src_size, stream.handle) + raise_if_driver_error(err) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -194,14 +198,8 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle - with nogil: - HANDLE_RETURN(cydriver.cuMemcpyAsync( - self._ptr, - src._ptr, - dst_size, - s - )) + err, = driver.cuMemcpyAsync(native(self._h_ptr), native(src._h_ptr), dst_size, stream.handle) + raise_if_driver_error(err) def fill(self, value: int, width: int, *, stream: Stream | GraphBuilder): """Fill this buffer with a value pattern asynchronously on the given stream. @@ -222,42 +220,33 @@ cdef class Buffer: or if buffer size is not divisible by width """ - cdef Stream s_stream = Stream_accept(stream) - cdef unsigned char c_value8 - cdef unsigned short c_value16 - cdef unsigned int c_value32 - cdef size_t N + stream = Stream_accept(stream) # Validate width if width not in (1, 2, 4): raise ValueError(f"width must be 1, 2, or 4, got {width}") # Validate buffer size modulus. - cdef size_t buffer_size = self._size + buffer_size = self._size if buffer_size % width != 0: raise ValueError(f"buffer size ({buffer_size}) must be divisible by width ({width})") # Map width (bytes) to bitwidth and validate value - cdef int bitwidth = width * 8 + bitwidth = width * 8 _validate_value_against_bitwidth(bitwidth, value, is_signed=False) # Validate value fits in width and perform fill - cdef cydriver.CUstream s = s_stream._handle + ptr = native(self._h_ptr) if width == 1: - c_value8 = value N = buffer_size - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(self._ptr, c_value8, N, s)) + err, = driver.cuMemsetD8Async(ptr, value, N, stream.handle) elif width == 2: - c_value16 = value N = buffer_size // 2 - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD16Async(self._ptr, c_value16, N, s)) + err, = driver.cuMemsetD16Async(ptr, value, N, stream.handle) else: # width == 4 - c_value32 = value N = buffer_size // 4 - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD32Async(self._ptr, c_value32, N, s)) + err, = driver.cuMemsetD32Async(ptr, value, N, stream.handle) + raise_if_driver_error(err) def __dlpack__( self, @@ -310,9 +299,7 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.device_id + raise NotImplementedError("device_id requires a memory resource") @property def handle(self) -> DevicePointerT: @@ -323,31 +310,23 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. 
""" - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 + # Return raw integer for compatibility with ctypes and other tools + # that expect a raw pointer value + return intptr(self._h_ptr) @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_device_accessible + raise NotImplementedError("is_device_accessible requires a memory resource") @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_host_accessible + raise NotImplementedError("is_host_accessible requires a memory resource") @property def is_mapped(self) -> bool: @@ -365,92 +344,6 @@ cdef class Buffer: """Return the memory size of this buffer.""" return self._size - @property - def owner(self) -> object: - """Return the object holding external allocation.""" - return self._owner - - -# Buffer Implementation -# --------------------- -cdef inline void Buffer_close(Buffer self, stream): - cdef Stream s - if self._ptr: - if self._memory_resource is not None: - s = Stream_accept(stream) if stream is not None else self._alloc_stream - self._memory_resource.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._memory_resource = None - self._owner = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef Buffer_init_mem_attrs(Buffer self): - if not self._mem_attrs_inited: - query_memory_attrs(self._mem_attrs, self._ptr) - self._mem_attrs_inited = True - - -cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: - cdef unsigned int memory_type = 0 - cdef int is_managed = 0 - cdef int device_id = 0 - _query_memory_attrs(memory_type, is_managed, device_id, ptr) - - if memory_type == 0: - # unregistered host pointer - out.is_host_accessible = True - out.is_device_accessible = False - out.device_id = -1 - # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE, - # so we need to check it first not to falsely claim it is not - # host accessible. - elif ( - is_managed - or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST - ): - # For pinned memory allocated with cudaMallocHost or paged-locked - # with cudaHostRegister, the memory_type is - # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST. - # TODO(ktokarski): In some cases, the registered memory requires - # using different ptr for device and host, we could check - # cuMemHostGetDevicePointer and - # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM - # to double check the device accessibility. 
- out.is_host_accessible = True - out.is_device_accessible = True - out.device_id = device_id - elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: - out.is_host_accessible = False - out.is_device_accessible = True - out.device_id = device_id - else: - raise ValueError(f"Unsupported memory type: {memory_type}") - return 0 - - -cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil: - cdef cydriver.CUpointer_attribute attrs[3] - cdef uintptr_t vals[3] - attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED - attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - vals[0] = &memory_type - vals[1] = &is_managed - vals[2] = &device_id - - cdef cydriver.CUresult ret - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: - with cython.gil: - # Device class handles the cuInit call internally - Device() - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - HANDLE_RETURN(ret) - return 0 - - cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -502,39 +395,50 @@ cdef class MemoryResource: ... -# Helper Functions -# ---------------- -cdef void _validate_value_against_bitwidth(int bitwidth, int64_t value, bint is_signed=False) except *: - """Validate that a value fits within the representable range for a given bitwidth. - - Parameters - ---------- - bitwidth : int - Number of bits (e.g., 8, 16, 32) - value : int64_t - Value to validate - is_signed : bool, optional - Whether the value is signed (default: False) - - Raises - ------ - ValueError - If value is outside the representable range for the bitwidth - """ - cdef int max_bits = bitwidth +# Buffer Implementation Helpers +# ----------------------------- +cdef inline Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = None +): + """Create a Buffer from an existing DevicePtrHandle.""" + cdef Buffer buf = Buffer.__new__(Buffer) + buf._h_ptr = h_ptr + buf._size = size + buf._memory_resource = mr + buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + return buf + + +cdef inline void Buffer_close(Buffer self, object stream): + """Close a buffer, freeing its memory.""" + cdef Stream s + if not self._h_ptr: + return + # Update deallocation stream if provided + if stream is not None: + s = Stream_accept(stream) + set_deallocation_stream(self._h_ptr, s._h_stream) + # Reset handle - RAII deleter will free the memory + self._h_ptr.reset() + self._size = 0 + self._memory_resource = None + self._ipc_data = None + + +def _validate_value_against_bitwidth(bitwidth, value, is_signed=False): + """Validate that a value fits within the representable range for a given bitwidth.""" + max_bits = bitwidth assert max_bits < 64, f"bitwidth ({max_bits}) must be less than 64" - cdef int64_t min_value - cdef uint64_t max_value_unsigned - cdef int64_t max_value - if is_signed: - min_value = -(1 << (max_bits - 1)) - max_value = (1 << (max_bits - 1)) - 1 + min_value = -(1 << (max_bits - 1)) + max_value = (1 << (max_bits - 1)) - 1 else: min_value = 0 - max_value_unsigned = (1 << max_bits) - 1 - max_value = max_value_unsigned + max_value = (1 << max_bits) - 1 if not min_value <= value <= max_value: raise ValueError( 
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index b009408a43..2a3e5c2dfe 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -10,12 +10,14 @@ from libc.stdlib cimport malloc, free from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, MemoryPoolHandle, create_mempool_handle, + deviceptr_alloc_from_pool, get_device_mempool, native, py, @@ -557,18 +559,12 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) - cdef cydriver.CUmemoryPool pool = native(self._h_pool) - cdef cydriver.CUdeviceptr devptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, pool, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + cdef DevicePtrHandle h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory from pool") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void DMR_deallocate( diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 5ad9d86c53..981c2830dd 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -7,8 +7,12 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._resource_handles cimport native +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource +from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, + deviceptr_alloc_async, + native, +) from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -188,17 +192,12 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) - cdef cydriver.CUdeviceptr devptr with nogil: check_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + cdef DevicePtrHandle h_ptr = deviceptr_alloc_async(size, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory asynchronously") + return Buffer_from_deviceptr_handle(h_ptr, size, self, 
None) cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 60d96a3b33..5505e92381 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -41,6 +41,8 @@ cdef class IPCBufferDescriptor: bytes _payload size_t _size + cdef const void* payload_ptr(self) noexcept + cdef class IPCAllocationHandle: cdef: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 4eb062dda0..f0bdc22216 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -7,10 +7,13 @@ from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle +from cuda.core.experimental._stream cimport Stream from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, MemoryPoolHandle, create_mempool_handle_ipc, + deviceptr_import_ipc, native, ) from cuda.core.experimental._stream cimport default_stream @@ -92,6 +95,10 @@ cdef class IPCBufferDescriptor: def size(self): return self._size + cdef const void* payload_ptr(self) noexcept: + """Return the payload as a const void* for C API calls.""" + return (self._payload) + cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" @@ -166,7 +173,7 @@ cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): cdef cydriver.CUmemPoolPtrExportData data with nogil: HANDLE_RETURN( - cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + cydriver.cuMemPoolExportPointer(&data, native(self._h_ptr)) ) cdef bytes data_b = cpython.PyBytes_FromStringAndSize( (data.reserved), sizeof(data.reserved) @@ -182,16 +189,16 @@ cdef Buffer Buffer_from_ipc_descriptor( if stream is None: # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy( - data.reserved, - (ipc_descriptor._payload), - sizeof(data.reserved) + cdef Stream s = stream + cdef cydriver.CUresult err + cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( + mr._h_pool, + ipc_descriptor.payload_ptr(), + s._h_stream, + &err ) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, native(mr._h_pool), &data)) - return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) + HANDLE_RETURN(err) + return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) # DeviceMemoryResource IPC Implementation diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 6bb172e64d..416dd8bd5c 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -88,6 +88,50 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + # ======================================================================== + # Device Pointer Handle + # ======================================================================== + ctypedef shared_ptr[const 
cydriver.CUdeviceptr] DevicePtrHandle + + # Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync + # Pool handle is captured in deleter to keep pool alive + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + + # Allocate device memory asynchronously via cuMemAllocAsync + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) nogil + + # Allocate device memory synchronously via cuMemAlloc + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc(size_t size) nogil + + # Allocate pinned host memory via cuMemAllocHost + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_host(size_t size) nogil + + # Create a non-owning device pointer handle (pointer NOT freed when released) + # Use for foreign pointers from external libraries + DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil + + # Import a device pointer from IPC via cuMemPoolImportPointer + # Note: Does not yet implement reference counting for nvbug 5570902 + # Error code is written to error_out (caller must check) + DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + cydriver.CUresult* error_out) nogil + + # Access the deallocation stream for a device pointer handle (read-only) + StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil + + # Set the deallocation stream for a device pointer handle + void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -97,15 +141,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil + cydriver.CUdeviceptr native(DevicePtrHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil uintptr_t intptr(EventHandle h) nogil uintptr_t intptr(MemoryPoolHandle h) nogil + uintptr_t intptr(DevicePtrHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) object py(MemoryPoolHandle h) + object py(DevicePtrHandle h) From ea9a293a18459d0311eee549a11c6362aa19fb78 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 13:56:25 -0800 Subject: [PATCH 18/38] Use intptr_t for all handle integer conversions Change all intptr() overloads to return std::intptr_t (signed) instead of std::uintptr_t per C standard convention for pointer-to-integer conversion. This addresses issue #1342 which requires Buffer.handle to return a signed integer. 
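For illustration only (a minimal sketch, not part of the change): with a
signed intptr_t, an address in the upper half of the address space shows up
as a negative integer but still round-trips exactly, which is the behavior
issue #1342 asks Buffer.handle to expose. The address below is hypothetical.

    #include <cassert>
    #include <cstdint>

    int main() {
        void* p = reinterpret_cast<void*>(0xFFFF800000000000ULL);  // hypothetical high address
        std::intptr_t s = reinterpret_cast<std::intptr_t>(p);      // negative when read as signed
        assert(reinterpret_cast<void*>(s) == p);                   // exact round-trip
        return 0;
    }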
Fixes #1342
---
 .../experimental/_cpp/resource_handles.hpp    | 23 ++++++++++---------
 .../core/experimental/_resource_handles.pxd   | 15 ++++++------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 5ce6671fa9..7ef7ca153c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -188,25 +188,26 @@ inline CUdeviceptr native(const DevicePtrHandle& h) noexcept {
     return h ? *h : 0;
 }
 
-// intptr() - extract handle as uintptr_t for Python interop
-inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+// intptr() - extract handle as intptr_t for Python interop
+// Using signed intptr_t per C standard convention and issue #1342
+inline std::intptr_t intptr(const ContextHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const StreamHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const StreamHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const EventHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const EventHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const MemoryPoolHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const DevicePtrHandle& h) noexcept {
-    return h ? static_cast<std::uintptr_t>(*h) : 0;
+inline std::intptr_t intptr(const DevicePtrHandle& h) noexcept {
+    return h ? static_cast<std::intptr_t>(*h) : 0;
 }
 
 // py() - convert handle to Python driver wrapper object
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 416dd8bd5c..ea0841ed27 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport intptr_t
 from libcpp.memory cimport shared_ptr
 
 from cuda.bindings cimport cydriver
@@ -143,12 +143,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil
     cydriver.CUdeviceptr native(DevicePtrHandle h) nogil
 
-    # intptr() - extract handle as uintptr_t for Python interop
-    uintptr_t intptr(ContextHandle h) nogil
-    uintptr_t intptr(StreamHandle h) nogil
-    uintptr_t intptr(EventHandle h) nogil
-    uintptr_t intptr(MemoryPoolHandle h) nogil
-    uintptr_t intptr(DevicePtrHandle h) nogil
+    # intptr() - extract handle as intptr_t for Python interop
+    # Using signed intptr_t per C standard convention and issue #1342
+    intptr_t intptr(ContextHandle h) nogil
+    intptr_t intptr(StreamHandle h) nogil
+    intptr_t intptr(EventHandle h) nogil
+    intptr_t intptr(MemoryPoolHandle h) nogil
+    intptr_t intptr(DevicePtrHandle h) nogil
 
     # py() - convert handle to Python driver wrapper object (requires GIL)
     object py(ContextHandle h)

From 92fa76bc32c3328389bf5fcb7169e833cdd70c98 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 11 Dec 2025 14:15:25 -0800
Subject: [PATCH 19/38] Add thread-local error handling for resource handle
 functions

Implement a systematic error handling approach for C++ resource handle
functions using thread-local storage, similar to cudaGetLastError().

API:
- get_last_error(): Returns and clears the last CUDA error
- peek_last_error(): Returns without clearing
- clear_last_error(): Explicitly clears the error

All functions that can fail now set the thread-local error before
returning an empty handle. This allows callers to retrieve specific
CUDA error codes for proper exception propagation.

Updated deviceptr_import_ipc to use this pattern instead of an output
parameter.
---
 .../experimental/_cpp/resource_handles.cpp    | 372 +++++++-----------
 .../experimental/_cpp/resource_handles.hpp    |  18 +-
 .../cuda/core/experimental/_memory/_ipc.pyx   |   8 +-
 .../core/experimental/_resource_handles.pxd   |  12 +-
 4 files changed, 164 insertions(+), 246 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index b5ccfff105..bc663f8228 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -10,6 +10,31 @@
 
 namespace cuda_core {
 
+// ============================================================================
+// Thread-local error handling
+// ============================================================================
+
+// Thread-local status of the most recent CUDA API call in this module.
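+// Each call below assigns to `err` inline, e.g. `if (CUDA_SUCCESS != (err = ...))`,
+// then returns an empty handle on failure; the caller retrieves the code with
+// get_last_error(), mirroring the cudaGetLastError() idiom.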
+thread_local CUresult err = CUDA_SUCCESS; + +CUresult get_last_error() noexcept { + CUresult e = err; + err = CUDA_SUCCESS; + return e; +} + +CUresult peek_last_error() noexcept { + return err; +} + +void clear_last_error() noexcept { + err = CUDA_SUCCESS; +} + +// ============================================================================ +// GIL management helpers +// ============================================================================ + // Helper to release the GIL while calling into the CUDA driver. // This guard is *conditional*: if the caller already dropped the GIL, // we avoid calling PyEval_SaveThread (which requires holding the GIL). @@ -46,7 +71,6 @@ class GILReleaseGuard { // Helper to acquire the GIL when we might not hold it. // Use in C++ destructors that need to manipulate Python objects. -// Symmetric counterpart to GILReleaseGuard. class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { @@ -64,7 +88,6 @@ class GILAcquireGuard { } } - // Check if GIL was successfully acquired (for conditional operations) bool acquired() const { return acquired_; } // Non-copyable, non-movable @@ -76,78 +99,63 @@ class GILAcquireGuard { bool acquired_; }; -// Internal box structure for Context (kept private to this TU) +// ============================================================================ +// Context Handles +// ============================================================================ + struct ContextBox { CUcontext resource; }; ContextHandle create_context_handle_ref(CUcontext ctx) { - // Creates a non-owning handle that references an existing context - // (e.g., primary context managed by CUDA driver) - - // Use default deleter - it will delete the box, but not touch the CUcontext - // CUcontext lifetime is managed externally (e.g., by CUDA driver) - auto box = std::shared_ptr(new ContextBox{ctx}); - - // Use aliasing constructor to create handle that exposes only CUcontext - // The handle's reference count is tied to box, but it points to &box->resource + auto box = std::make_shared(ContextBox{ctx}); return ContextHandle(box, &box->resource); } -// Thread-local storage for primary context cache -// Each thread maintains its own cache of primary contexts indexed by device ID +// Thread-local cache of primary contexts indexed by device ID thread_local std::vector primary_context_cache; ContextHandle get_primary_context(int device_id) noexcept { // Check thread-local cache if (static_cast(device_id) < primary_context_cache.size()) { - auto cached = primary_context_cache[device_id]; - if (cached.get() != nullptr) { - return cached; // Cache hit + if (auto cached = primary_context_cache[device_id]) { + return cached; } } // Cache miss - acquire primary context from driver + GILReleaseGuard gil; CUcontext ctx; - CUresult err; - { - GILReleaseGuard gil; - err = cuDevicePrimaryCtxRetain(&ctx, device_id); + if (CUDA_SUCCESS != (err = cuDevicePrimaryCtxRetain(&ctx, device_id))) { + return {}; } - if (err != CUDA_SUCCESS) { - // Return empty handle on error (caller must check) - return ContextHandle(); - } - - // Create owning handle with custom deleter that releases the primary context - auto box = std::shared_ptr(new ContextBox{ctx}, [device_id](const ContextBox* b) { - GILReleaseGuard gil; - cuDevicePrimaryCtxRelease(device_id); - delete b; - }); - // Use aliasing constructor to expose only CUcontext - auto h_context = ContextHandle(box, &box->resource); + auto box = std::shared_ptr( + new ContextBox{ctx}, + [device_id](const ContextBox* b) { + 
GILReleaseGuard gil; + cuDevicePrimaryCtxRelease(device_id); + delete b; + } + ); + auto h = ContextHandle(box, &box->resource); - // Resize cache if needed + // Update cache if (static_cast(device_id) >= primary_context_cache.size()) { primary_context_cache.resize(device_id + 1); } - primary_context_cache[device_id] = h_context; - - return h_context; + primary_context_cache[device_id] = h; + return h; } ContextHandle get_current_context() noexcept { + GILReleaseGuard gil; CUcontext ctx = nullptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuCtxGetCurrent(&ctx); + if (CUDA_SUCCESS != (err = cuCtxGetCurrent(&ctx))) { + return {}; } - if (err != CUDA_SUCCESS || ctx == nullptr) { - // Return empty handle if no current context or error - return ContextHandle(); + if (!ctx) { + return {}; // No current context (not an error) } return create_context_handle_ref(ctx); } @@ -156,76 +164,54 @@ ContextHandle get_current_context() noexcept { // Stream Handles // ============================================================================ -// Internal box structure for Stream struct StreamBox { CUstream resource; }; StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { - // Creates an owning stream handle - calls cuStreamCreateWithPriority internally. - // The context handle is captured in the deleter to ensure context outlives the stream. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUstream stream; - CUresult err; - { - GILReleaseGuard gil; - err = cuStreamCreateWithPriority(&stream, flags, priority); - } - if (err != CUDA_SUCCESS) { - return StreamHandle(); + if (CUDA_SUCCESS != (err = cuStreamCreateWithPriority(&stream, flags, priority))) { + return {}; } - // Capture h_ctx in lambda - shared_ptr control block keeps it alive - auto box = std::shared_ptr(new StreamBox{stream}, [h_ctx](const StreamBox* b) { - GILReleaseGuard gil; - cuStreamDestroy(b->resource); - delete b; - // h_ctx destructor runs here when last stream reference is released - }); - - // Use aliasing constructor to expose only CUstream + auto box = std::shared_ptr( + new StreamBox{stream}, + [h_ctx](const StreamBox* b) { + GILReleaseGuard gil; + cuStreamDestroy(b->resource); + delete b; + } + ); return StreamHandle(box, &box->resource); } StreamHandle create_stream_handle_ref(CUstream stream) { - // Creates a non-owning handle - stream will NOT be destroyed. - // Caller is responsible for keeping the stream's context alive. - auto box = std::shared_ptr(new StreamBox{stream}); - - // Use aliasing constructor to expose only CUstream + auto box = std::make_shared(StreamBox{stream}); return StreamHandle(box, &box->resource); } StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { - // Creates a non-owning handle that prevents a Python owner from being GC'd. - // The owner's refcount is incremented here and decremented when handle is released. - // The owner is responsible for keeping the stream's context alive. Py_XINCREF(owner); - - auto box = std::shared_ptr(new StreamBox{stream}, [owner](const StreamBox* b) { - // Safely decrement owner refcount (GILAcquireGuard handles finalization check) - { + auto box = std::shared_ptr( + new StreamBox{stream}, + [owner](const StreamBox* b) { GILAcquireGuard gil; if (gil.acquired()) { Py_XDECREF(owner); } + delete b; } - delete b; - }); - + ); return StreamHandle(box, &box->resource); } StreamHandle get_legacy_stream() noexcept { - // Return non-owning handle to the legacy default stream. 
- // Use function-local static for efficient repeated access. static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY); return handle; } StreamHandle get_per_thread_stream() noexcept { - // Return non-owning handle to the per-thread default stream. - // Use function-local static for efficient repeated access. static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD); return handle; } @@ -234,65 +220,47 @@ StreamHandle get_per_thread_stream() noexcept { // Event Handles // ============================================================================ -// Internal box structure for Event struct EventBox { CUevent resource; }; EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { - // Creates an owning event handle - calls cuEventCreate internally. - // The context handle is captured in the deleter to ensure context outlives the event. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUevent event; - CUresult err; - { - GILReleaseGuard gil; - err = cuEventCreate(&event, flags); - } - if (err != CUDA_SUCCESS) { - return EventHandle(); + if (CUDA_SUCCESS != (err = cuEventCreate(&event, flags))) { + return {}; } - // Capture h_ctx in lambda - shared_ptr control block keeps it alive - auto box = std::shared_ptr(new EventBox{event}, [h_ctx](const EventBox* b) { - GILReleaseGuard gil; - cuEventDestroy(b->resource); - delete b; - // h_ctx destructor runs here when last event reference is released - }); - - // Use aliasing constructor to expose only CUevent + auto box = std::shared_ptr( + new EventBox{event}, + [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + } + ); return EventHandle(box, &box->resource); } EventHandle create_event_handle(unsigned int flags) { - // Creates an owning event handle without context dependency. - // Use for temporary events that are created and destroyed in the same scope. - // Returns empty handle on error (caller must check). return create_event_handle(ContextHandle{}, flags); } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { - // Creates an owning event handle from an IPC handle. - // The originating process owns the event and its context. - // Returns empty handle on error (caller must check). 
+ GILReleaseGuard gil; CUevent event; - CUresult err; - { - GILReleaseGuard gil; - err = cuIpcOpenEventHandle(&event, ipc_handle); - } - if (err != CUDA_SUCCESS) { - return EventHandle(); + if (CUDA_SUCCESS != (err = cuIpcOpenEventHandle(&event, ipc_handle))) { + return {}; } - auto box = std::shared_ptr(new EventBox{event}, [](const EventBox* b) { - GILReleaseGuard gil; - cuEventDestroy(b->resource); - delete b; - }); - - // Use aliasing constructor to expose only CUevent + auto box = std::shared_ptr( + new EventBox{event}, + [](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + } + ); return EventHandle(box, &box->resource); } @@ -300,7 +268,6 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { // Memory Pool Handles // ============================================================================ -// Internal box structure for MemoryPool struct MemoryPoolBox { CUmemoryPool resource; }; @@ -319,90 +286,68 @@ static void clear_mempool_peer_access(CUmemoryPool pool) { clear_access[i].location.id = i; clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; } - - // Ignore errors - best effort cleanup - cuMemPoolSetAccess(pool, clear_access.data(), device_count); + cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort } -// Helper to wrap a raw pool in an owning handle. -// The deleter clears peer access (nvbug 5698116 workaround) and destroys the pool. static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { - auto box = std::shared_ptr(new MemoryPoolBox{pool}, [](const MemoryPoolBox* b) { - GILReleaseGuard gil; - clear_mempool_peer_access(b->resource); - cuMemPoolDestroy(b->resource); - delete b; - }); + auto box = std::shared_ptr( + new MemoryPoolBox{pool}, + [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + cuMemPoolDestroy(b->resource); + delete b; + } + ); return MemoryPoolHandle(box, &box->resource); } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { - // Creates an owning memory pool handle - calls cuMemPoolCreate internally. - // Memory pools are device-scoped (not context-scoped). - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolCreate(&pool, &props); + if (CUDA_SUCCESS != (err = cuMemPoolCreate(&pool, &props))) { + return {}; } - return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); + return wrap_mempool_owned(pool); } MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { - // Creates a non-owning handle - pool will NOT be destroyed. - // Use for device default/current pools managed by the driver. - auto box = std::shared_ptr(new MemoryPoolBox{pool}); - - // Use aliasing constructor to expose only CUmemoryPool + auto box = std::make_shared(MemoryPoolBox{pool}); return MemoryPoolHandle(box, &box->resource); } MemoryPoolHandle get_device_mempool(int device_id) noexcept { - // Get the current memory pool for a device. - // Returns a non-owning handle (pool managed by driver). 
+ GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuDeviceGetMemPool(&pool, device_id); - } - if (err != CUDA_SUCCESS) { - return MemoryPoolHandle(); + if (CUDA_SUCCESS != (err = cuDeviceGetMemPool(&pool, device_id))) { + return {}; } return create_mempool_handle_ref(pool); } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { - // Creates an owning memory pool handle from an IPC import. - // The file descriptor is NOT owned by this handle. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolImportFromShareableHandle(&pool, reinterpret_cast(static_cast(fd)), handle_type, 0); + auto handle_ptr = reinterpret_cast(static_cast(fd)); + if (CUDA_SUCCESS != (err = cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) { + return {}; } - return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); + return wrap_mempool_owned(pool); } // ============================================================================ // Device Pointer Handles // ============================================================================ -// Internal box structure for DevicePtr. -// The h_stream is mutable to allow updating the deallocation stream after creation. struct DevicePtrBox { CUdeviceptr resource; mutable StreamHandle h_stream; }; -// Internal helper to retrieve the box from a handle (for deallocation_stream access). static DevicePtrBox* get_box(const DevicePtrHandle& h) { const CUdeviceptr* p = h.get(); return reinterpret_cast( - reinterpret_cast(const_cast(p)) - - offsetof(DevicePtrBox, resource) + reinterpret_cast(const_cast(p)) - offsetof(DevicePtrBox, resource) ); } @@ -414,53 +359,35 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { get_box(h)->h_stream = std::move(h_stream); } -DevicePtrHandle deviceptr_alloc_from_pool( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) -{ - // Allocate from pool asynchronously. - // Pool handle is captured in deleter to keep pool alive. +DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; - // h_pool destructor runs here, releasing pool reference } ); return DevicePtrHandle(box, &box->resource); } DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { - // Allocate asynchronously (not from a specific pool). 
+ GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocAsync(&ptr, size, native(h_stream)); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocAsync(&ptr, size, native(h_stream)))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } @@ -469,15 +396,10 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc(size_t size) { - // Allocate synchronously. + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAlloc(&ptr, size); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAlloc(&ptr, size))) { + return {}; } auto box = std::shared_ptr( @@ -492,15 +414,10 @@ DevicePtrHandle deviceptr_alloc(size_t size) { } DevicePtrHandle deviceptr_alloc_host(size_t size) { - // Allocate pinned host memory. + GILReleaseGuard gil; void* ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocHost(&ptr, size); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocHost(&ptr, size))) { + return {}; } auto box = std::shared_ptr( @@ -515,42 +432,25 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) { } DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { - // Non-owning reference - pointer will NOT be freed. - auto box = std::shared_ptr(new DevicePtrBox{ptr, StreamHandle{}}); + auto box = std::make_shared(DevicePtrBox{ptr, StreamHandle{}}); return DevicePtrHandle(box, &box->resource); } -DevicePtrHandle deviceptr_import_ipc( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream, - CUresult* error_out) -{ - // Import pointer from IPC. - // Note: Does not implement reference counting workaround for nvbug 5570902 yet. 
+DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolImportPointer(&ptr, *h_pool, - const_cast( - reinterpret_cast(export_data))); - } - if (error_out) { - *error_out = err; - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + auto data = const_cast( + reinterpret_cast(export_data)); + if (CUDA_SUCCESS != (err = cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; - // h_pool destructor runs here } ); return DevicePtrHandle(box, &box->resource); diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 7ef7ca153c..fc62c9aa2c 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -11,6 +11,19 @@ namespace cuda_core { +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Get and clear the last CUDA error (like cudaGetLastError) +CUresult get_last_error() noexcept; + +// Get the last CUDA error without clearing it (like cudaPeekAtLastError) +CUresult peek_last_error() noexcept; + +// Explicitly clear the last error +void clear_last_error() noexcept; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ @@ -149,12 +162,11 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); // Import a device pointer from IPC via cuMemPoolImportPointer. // When the last reference is released, cuMemFreeAsync is called on the stored stream. // Note: Does not yet implement reference counting for nvbug 5570902. -// Error code is written to error_out (caller must check). +// On error, returns empty handle and sets thread-local error (use get_last_error()). DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream, - CUresult* error_out); + StreamHandle h_stream); // Access the deallocation stream for a device pointer handle (read-only). // For non-owning handles, the stream is not used but can still be accessed. 
diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index f0bdc22216..414c134601 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -14,6 +14,7 @@ from cuda.core.experimental._resource_handles cimport ( MemoryPoolHandle, create_mempool_handle_ipc, deviceptr_import_ipc, + get_last_error, native, ) from cuda.core.experimental._stream cimport default_stream @@ -190,14 +191,13 @@ cdef Buffer Buffer_from_ipc_descriptor( # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() cdef Stream s = stream - cdef cydriver.CUresult err cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( mr._h_pool, ipc_descriptor.payload_ptr(), - s._h_stream, - &err + s._h_stream ) - HANDLE_RETURN(err) + if not h_ptr: + HANDLE_RETURN(get_last_error()) return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index ea0841ed27..2aeff68cd8 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -9,6 +9,13 @@ from cuda.bindings cimport cydriver # Declare the C++ namespace and types cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # ======================================================================== + # Thread-local error handling + # ======================================================================== + cydriver.CUresult get_last_error() nogil + cydriver.CUresult peek_last_error() nogil + void clear_last_error() nogil + # ======================================================================== # Context Handle # ======================================================================== @@ -119,12 +126,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Import a device pointer from IPC via cuMemPoolImportPointer # Note: Does not yet implement reference counting for nvbug 5570902 - # Error code is written to error_out (caller must check) + # On error, returns empty handle and sets thread-local error (use get_last_error()) DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream, - cydriver.CUresult* error_out) nogil + StreamHandle h_stream) nogil # Access the deallocation stream for a device pointer handle (read-only) StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil From f05d45a7a81c1c55c2d7377fac2fa63d58d8ff65 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 15:40:34 -0800 Subject: [PATCH 20/38] Add IPC pointer cache to fix duplicate import issue (nvbug 5570902) IPC-imported device pointers are not correctly reference counted by the driver - the first cuMemFreeAsync incorrectly unmaps the memory even when the pointer was imported multiple times. Work around this by caching imported pointers and returning the same handle for duplicate imports. The cache uses weak_ptr so entries are automatically cleaned up when all references are released. The workaround can be easily bypassed via use_ipc_ptr_cache() when a driver fix becomes available. 
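To sketch the caching scheme in isolation (illustrative only; Key, Box, and
import_dedup below are stand-ins for CUdeviceptr, DevicePtrBox, and
deviceptr_import_ipc, and the real deleter calls cuMemFreeAsync):

    #include <memory>
    #include <mutex>
    #include <unordered_map>

    using Key = unsigned long long;  // stand-in for CUdeviceptr
    struct Box { Key resource; };    // stand-in for DevicePtrBox

    static std::mutex cache_mutex;
    static std::unordered_map<Key, std::weak_ptr<Box>> cache;

    std::shared_ptr<Box> import_dedup(Key ptr) {
        std::lock_guard<std::mutex> lock(cache_mutex);
        auto it = cache.find(ptr);
        if (it != cache.end()) {
            if (auto box = it->second.lock())
                return box;          // duplicate import: share the live box
            cache.erase(it);         // entry expired; fall through and remake it
        }
        auto box = std::shared_ptr<Box>(new Box{ptr}, [ptr](Box* b) {
            {
                // Drop the cache entry before freeing the resource.
                std::lock_guard<std::mutex> lock(cache_mutex);
                cache.erase(ptr);
            }
            delete b;                // the real deleter frees the memory here
        });
        cache[ptr] = box;            // weak_ptr, so the cache adds no ownership
        return box;
    }

Because the map holds weak_ptr, the cache never extends a pointer's lifetime;
a failed lock() on an expired entry is what distinguishes a stale slot from a
live duplicate import.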
---
 .../experimental/_cpp/resource_handles.cpp    | 65 ++++++++++++--
 .../memory_ipc/test_ipc_duplicate_import.py   | 89 +++++++++++++++++++
 2 files changed, 146 insertions(+), 8 deletions(-)
 create mode 100644 cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index bc663f8228..557b0af74c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -6,6 +6,8 @@
 #include "resource_handles.hpp"
 
 #include <cuda.h>
+#include <mutex>
+#include <unordered_map>
 #include <vector>
 
 namespace cuda_core {
@@ -436,6 +438,22 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
     return DevicePtrHandle(box, &box->resource);
 }
 
+// ============================================================================
+// IPC Pointer Cache (workaround for nvbug 5570902)
+// ============================================================================
+// IPC-imported pointers are not correctly reference counted by the driver.
+// The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
+// was imported multiple times. We work around this by caching imported pointers
+// and returning the same handle for duplicate imports.
+
+// TODO: When driver fix is available, add version check here to bypass cache.
+static bool use_ipc_ptr_cache() {
+    return true;
+}
+
+static std::mutex ipc_ptr_cache_mutex;
+static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
+
 DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
@@ -445,15 +463,46 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         return {};
     }
 
-    auto box = std::shared_ptr<DevicePtrBox>(
-        new DevicePtrBox{ptr, h_stream},
-        [h_pool](DevicePtrBox* b) {
-            GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
-            delete b;
+    if (use_ipc_ptr_cache()) {
+        std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
+
+        // Check for existing handle
+        auto it = ipc_ptr_cache.find(ptr);
+        if (it != ipc_ptr_cache.end()) {
+            if (auto box = it->second.lock()) {
+                return DevicePtrHandle(box, &box->resource);
+            }
+            ipc_ptr_cache.erase(it);  // Expired entry
         }
-    );
-    return DevicePtrHandle(box, &box->resource);
+
+        // Create new handle with cache-clearing deleter
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [h_pool, ptr](DevicePtrBox* b) {
+                {
+                    std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
+                    ipc_ptr_cache.erase(ptr);
+                }
+                GILReleaseGuard gil;
+                cuMemFreeAsync(b->resource, native(b->h_stream));
+                delete b;
+            }
+        );
+        ipc_ptr_cache[ptr] = box;
+        return DevicePtrHandle(box, &box->resource);
+
+    } else {
+        // No caching - simple handle creation
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [h_pool](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                cuMemFreeAsync(b->resource, native(b->h_stream));
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
 }
 
 } // namespace cuda_core
diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
new file mode 100644
index 0000000000..a08c48d567
--- /dev/null
+++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Test for duplicate IPC buffer imports.
+ +Verifies that importing the same buffer descriptor multiple times returns the +same underlying handle, and that closing all imports works correctly without +crashing. This tests the workaround for nvbug 5570902 where IPC-imported +pointers are not correctly reference counted by the driver. +""" + +import multiprocessing as mp + +import pytest + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from helpers.logging import TimestampedLogger + +CHILD_TIMEOUT_SEC = 20 +NBYTES = 64 +POOL_SIZE = 2097152 + +ENABLE_LOGGING = False # Set True for test debugging and development + + +def child_main(log, queue): + log.prefix = " child: " + log("ready") + device = Device() + device.set_current() + mr = queue.get() + buffer_desc1 = queue.get() + buffer_desc2 = queue.get() + + # Import the same buffer twice - should return same handle due to cache + buffer1 = Buffer.from_ipc_descriptor(mr, buffer_desc1) + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc2) + + log(f"buffer1.handle = {buffer1.handle}") + log(f"buffer2.handle = {buffer2.handle}") + log(f"same handle: {buffer1.handle == buffer2.handle}") + + # Close both - should not crash + buffer1.close() + log("buffer1 closed") + + buffer2.close() + log("buffer2 closed") + + device.sync() + log("done") + + +class TestIpcDuplicateImport: + """Test that duplicate IPC imports return the same handle and close safely.""" + + @pytest.fixture(autouse=True) + def _set_start_method(self): + # Ensure spawn is used for multiprocessing + try: + mp.set_start_method("spawn", force=True) + except RuntimeError: + pass # Already set + + def test_main(self, ipc_device, ipc_memory_resource): + log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) + device = ipc_device + mr = ipc_memory_resource + + log("allocating buffer") + buffer = mr.allocate(NBYTES) + + # Start the child process. + log("starting child") + queue = mp.Queue() + process = mp.Process(target=child_main, args=(log, queue)) + process.start() + + # Send the memory resource and buffer descriptor twice. 
+ log("sending mr and buffer descriptors") + queue.put(mr) + queue.put(buffer.get_ipc_descriptor()) + queue.put(buffer.get_ipc_descriptor()) + + log("waiting for child") + process.join(timeout=CHILD_TIMEOUT_SEC) + log(f"child exit code: {process.exitcode}") + assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" + log("done") From 937428b7d6fc97079c79929856f9008626a7f407 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 16:07:55 -0800 Subject: [PATCH 21/38] Fix lint issues: remove unused imports and variables --- cuda_core/cuda/core/experimental/_context.pyx | 4 - cuda_core/cuda/core/experimental/_event.pyx | 2 - .../core/experimental/_memory/_buffer.pxd | 3 + .../core/experimental/_memory/_buffer.pyx | 99 ++++++++++++++++--- .../_memory/_device_memory_resource.pyx | 1 - .../cuda/core/experimental/_memory/_ipc.pyx | 3 - .../cuda/core/experimental/_memoryview.pyx | 1 - cuda_core/cuda/core/experimental/_stream.pyx | 6 +- .../memory_ipc/test_ipc_duplicate_import.py | 10 +- cuda_core/tests/test_comparable.py | 1 - cuda_core/tests/test_hashable.py | 1 - 11 files changed, 95 insertions(+), 36 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 0504778207..2a7434c62b 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,15 +4,11 @@ from dataclasses import dataclass -from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( ContextHandle, - create_context_handle_ref, intptr, - native, py, ) -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 763df94fe3..72618e40d6 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,7 +5,6 @@ from __future__ import annotations cimport cpython -from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context @@ -30,7 +29,6 @@ from typing import TYPE_CHECKING, Optional from cuda.core.experimental._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, - driver, ) if TYPE_CHECKING: import cuda.bindings diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index 81653dafd5..dda12622f4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -20,6 +20,9 @@ cdef class Buffer: size_t _size MemoryResource _memory_resource object _ipc_data + object _owner + _MemAttrs _mem_attrs + bint _mem_attrs_inited cdef class MemoryResource: diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index c7ab15ae95..6568d6271f 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -5,7 +5,7 @@ from __future__ import annotations cimport cython -from libc.stdint cimport uintptr_t, int64_t, uint64_t +from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource @@ -13,15 +13,16 @@ from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataFor from 
cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - StreamHandle, deviceptr_create_ref, intptr, native, - py, set_deallocation_stream, ) from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, + HANDLE_RETURN, +) import abc from typing import TypeVar, Union @@ -56,6 +57,8 @@ cdef class Buffer: self._size = 0 self._memory_resource = None self._ipc_data = None + self._owner = None + self._mem_attrs_inited = False def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. " @@ -71,7 +74,7 @@ cdef class Buffer: owner : object | None = None ): """Legacy init for compatibility - creates a non-owning ref handle. - + Note: The stream parameter is accepted for API compatibility but is ignored since non-owning refs are never freed by the handle. """ @@ -82,6 +85,8 @@ cdef class Buffer: raise ValueError("owner and memory resource cannot be both specified together") self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + self._owner = owner + self._mem_attrs_inited = False return self # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor @@ -105,14 +110,17 @@ cdef class Buffer: Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional Memory resource associated with the buffer + owner : object, optional + An object holding external allocation that the ``ptr`` points to. + The reference is kept as long as the buffer is alive. + The ``owner`` and ``mr`` cannot be specified together. Note ---- This creates a non-owning reference. The pointer will NOT be freed when the Buffer is closed or garbage collected. 
""" - cdef DevicePtrHandle h_ptr = deviceptr_create_ref((int(ptr))) - return Buffer_from_deviceptr_handle(h_ptr, size, mr) + return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod def from_ipc_descriptor( @@ -159,7 +167,6 @@ cdef class Buffer: """ stream = Stream_accept(stream) - cdef Stream s_stream = stream cdef size_t src_size = self._size if dst is None: @@ -190,7 +197,6 @@ cdef class Buffer: """ stream = Stream_accept(stream) - cdef Stream s_stream = stream cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -299,7 +305,8 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - raise NotImplementedError("device_id requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.device_id @property def handle(self) -> DevicePointerT: @@ -319,14 +326,16 @@ cdef class Buffer: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - raise NotImplementedError("is_device_accessible requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.is_device_accessible @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - raise NotImplementedError("is_host_accessible requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.is_host_accessible @property def is_mapped(self) -> bool: @@ -344,6 +353,71 @@ cdef class Buffer: """Return the memory size of this buffer.""" return self._size + @property + def owner(self) -> object: + """Return the object holding external allocation.""" + return self._owner + + +# Memory Attribute Query Helpers +# ------------------------------ +cdef inline _init_mem_attrs(Buffer self): + """Initialize memory attributes by querying the pointer.""" + if not self._mem_attrs_inited: + _query_memory_attrs(self._mem_attrs, native(self._h_ptr)) + self._mem_attrs_inited = True + + +cdef inline int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr +) except -1 nogil: + """Query memory attributes for a device pointer.""" + cdef unsigned int memory_type = 0 + cdef int is_managed = 0 + cdef int device_id = 0 + cdef cydriver.CUpointer_attribute attrs[3] + cdef uintptr_t vals[3] + + attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED + attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + vals[0] = &memory_type + vals[1] = &is_managed + vals[2] = &device_id + + cdef cydriver.CUresult ret + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: + with cython.gil: + # Device class handles the cuInit call internally + Device() + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + HANDLE_RETURN(ret) + + if memory_type == 0: + # unregistered host pointer + out.is_host_accessible = True + out.is_device_accessible = False + out.device_id = -1 + elif ( + is_managed + or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST + ): + # Managed memory or pinned host memory + out.is_host_accessible = True + out.is_device_accessible = True + out.device_id = device_id + elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: + out.is_host_accessible = False + 
out.is_device_accessible = True + out.device_id = device_id + else: + with cython.gil: + raise ValueError(f"Unsupported memory type: {memory_type}") + return 0 + + cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -426,6 +500,7 @@ cdef inline void Buffer_close(Buffer self, object stream): self._size = 0 self._memory_resource = None self._ipc_data = None + self._owner = None def _validate_value_against_bitwidth(bitwidth, value, is_signed=False): diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 2a3e5c2dfe..e80afe45ac 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -15,7 +15,6 @@ from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - MemoryPoolHandle, create_mempool_handle, deviceptr_alloc_from_pool, get_device_mempool, diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 414c134601..5b301750c4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,15 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport uintptr_t -from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle from cuda.core.experimental._stream cimport Stream from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - MemoryPoolHandle, create_mempool_handle_ipc, deviceptr_import_ipc, get_last_error, diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 718736e5cf..9f1119894a 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -20,7 +20,6 @@ from cuda.core.experimental._resource_handles cimport ( native, ) from cuda.core.experimental._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._utils cimport cuda_utils from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 4a16399323..87d78eba17 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -41,9 +41,6 @@ from cuda.core.experimental._resource_handles cimport ( py, ) from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._utils.cuda_utils import ( - driver, -) @dataclass @@ -423,8 +420,7 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: """Ensure the stream's context and device_id are populated.""" - cdef ContextHandle h_curr_context - cdef cydriver.CUcontext target_ctx, curr_ctx, ctx + cdef cydriver.CUcontext ctx cdef cydriver.CUdevice target_dev cdef bint switch_context diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index a08c48d567..096b3a2abd 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ 
b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -9,11 +9,11 @@ pointers are not correctly reference counted by the driver. """ +import contextlib import multiprocessing as mp import pytest - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental import Buffer, Device from helpers.logging import TimestampedLogger CHILD_TIMEOUT_SEC = 20 @@ -57,14 +57,12 @@ class TestIpcDuplicateImport: @pytest.fixture(autouse=True) def _set_start_method(self): # Ensure spawn is used for multiprocessing - try: + with contextlib.suppress(RuntimeError): mp.set_start_method("spawn", force=True) - except RuntimeError: - pass # Already set def test_main(self, ipc_device, ipc_memory_resource): log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) - device = ipc_device + ipc_device.set_current() mr = ipc_memory_resource log("allocating buffer") diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index 2c05932dcc..8f62db8b49 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,7 +9,6 @@ """ from cuda.core.experimental import Device, Stream -from cuda.core.experimental._event import Event from cuda.core.experimental._stream import StreamOptions # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 1ecf8cdedd..bdad435c6f 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -13,7 +13,6 @@ """ from cuda.core.experimental import Device -from cuda.core.experimental._event import Event from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ From b629ec68c99e2b8265284dee55916768c65bb655 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 12 Dec 2025 09:13:19 -0800 Subject: [PATCH 22/38] Add deviceptr_create_with_owner for handle-based owner tracking Implements handle-based owner tracking for device pointers, consistent with the pattern used for streams (create_stream_handle_with_owner). 
Changes:
- Add deviceptr_create_with_owner() - creates non-owning handle that keeps
  a Python owner alive via Py_INCREF/DECREF (lambda capture)
- If owner is nullptr, delegates to deviceptr_create_ref
- Buffer._owner field tracks owner in Python for property access
- Buffer._init() simplified to always call deviceptr_create_with_owner
---
 .../experimental/_cpp/resource_handles.cpp    | 18 ++++++++++++++++++
 .../experimental/_cpp/resource_handles.hpp    |  6 ++++++
 .../cuda/core/experimental/_memory/_buffer.pyx | 13 +++++--------
 .../core/experimental/_resource_handles.pxd   |  5 +++++
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 557b0af74c..c4b574f6a7 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -438,6 +438,24 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
     return DevicePtrHandle(box, &box->resource);
 }
 
+DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
+    if (!owner) {
+        return deviceptr_create_ref(ptr);
+    }
+    Py_INCREF(owner);
+    auto box = std::shared_ptr<DevicePtrBox>(
+        new DevicePtrBox{ptr, StreamHandle{}},
+        [owner](DevicePtrBox* b) {
+            GILAcquireGuard gil;
+            if (gil.acquired()) {
+                Py_DECREF(owner);
+            }
+            delete b;
+        }
+    );
+    return DevicePtrHandle(box, &box->resource);
+}
+
 // ============================================================================
 // IPC Pointer Cache (workaround for nvbug 5570902)
 // ============================================================================
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index fc62c9aa2c..7649788fdd 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -159,6 +159,12 @@ DevicePtrHandle deviceptr_alloc_host(size_t size);
 // The pointer will NOT be freed when the handle is released.
 DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr);
 
+// Create a non-owning device pointer handle that prevents a Python owner from being GC'd.
+// The owner's refcount is incremented; decremented when the handle is released.
+// The pointer will NOT be freed when the handle is released.
+// If owner is nullptr, equivalent to deviceptr_create_ref.
+DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner);
+
 // Import a device pointer from IPC via cuMemPoolImportPointer.
 // When the last reference is released, cuMemFreeAsync is called on the stored stream.
 // Note: Does not yet implement reference counting for nvbug 5570902.
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index 6568d6271f..bca2a21ff0 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -13,7 +13,7 @@ from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._resource_handles cimport (
     DevicePtrHandle,
-    deviceptr_create_ref,
+    deviceptr_create_with_owner,
     intptr,
     native,
     set_deallocation_stream,
 )
@@ -64,9 +64,6 @@ cdef class Buffer:
         raise RuntimeError("Buffer objects cannot be instantiated directly. 
" "Please use MemoryResource APIs.") - # Note: _init_from_handle is a cdef inline function, not a method - # See Buffer_init_from_handle below - @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, @@ -78,11 +75,11 @@ cdef class Buffer: Note: The stream parameter is accepted for API compatibility but is ignored since non-owning refs are never freed by the handle. """ - cdef Buffer self = Buffer.__new__(cls) - self._h_ptr = deviceptr_create_ref((int(ptr))) - self._size = size if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") + cdef Buffer self = Buffer.__new__(cls) + self._h_ptr = deviceptr_create_with_owner((int(ptr)), owner) + self._size = size self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None self._owner = owner @@ -495,7 +492,7 @@ cdef inline void Buffer_close(Buffer self, object stream): if stream is not None: s = Stream_accept(stream) set_deallocation_stream(self._h_ptr, s._h_stream) - # Reset handle - RAII deleter will free the memory + # Reset handle - RAII deleter will free the memory (and release owner ref in C++) self._h_ptr.reset() self._size = 0 self._memory_resource = None diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 2aeff68cd8..6aa204efc6 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -124,6 +124,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Use for foreign pointers from external libraries DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil + # Create non-owning handle that prevents Python owner from being GC'd + # Pointer NOT freed when released; owner's refcount decremented on release + # If owner is None, equivalent to deviceptr_create_ref + DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) + # Import a device pointer from IPC via cuMemPoolImportPointer # Note: Does not yet implement reference counting for nvbug 5570902 # On error, returns empty handle and sets thread-local error (use get_last_error()) From cce5e9fa27551a0ef34e57891ca767eb6795a945 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 12:55:42 -0800 Subject: [PATCH 23/38] Add resource handles _CXX_API capsule and lazy driver loading Expose a full C++ handles function table via PyCapsule so extensions can dispatch without RTLD_GLOBAL, and switch resource_handles.cpp to load libcuda symbols at runtime to support CPU-only imports. 
--- cuda_core/build_hooks.py | 28 +- .../experimental/_cpp/resource_handles.cpp | 320 ++++++++++++++++-- .../_cpp/resource_handles_cxx_api.hpp | 80 +++++ cuda_core/cuda/core/experimental/_event.pyx | 27 +- .../core/experimental/_resource_handles.pyx | 20 ++ .../_resource_handles_cxx_api.pxd | 69 ++++ 6 files changed, 494 insertions(+), 50 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp create mode 100644 cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index a20407488e..80a96e0bc2 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -12,6 +12,7 @@ import os import re import subprocess +import sys from Cython.Build import cythonize from setuptools import Extension @@ -84,18 +85,6 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH - @functools.cache - def get_cuda_library_dirs(): - """Return library search paths for CUDA driver runtime.""" - - libdirs = [] - for root in get_cuda_paths(): - for subdir in ("lib64", "lib"): - candidate = os.path.join(root, subdir) - if os.path.isdir(candidate): - libdirs.append(candidate) - return libdirs - def get_sources(mod_name): """Get source files for a module, including any .cpp files.""" sources = [f"cuda/core/experimental/{mod_name}.pyx"] @@ -108,16 +97,15 @@ def get_sources(mod_name): return sources def get_extension_kwargs(mod_name): - """Return Extension kwargs (libraries, library_dirs) per module.""" + """Return Extension kwargs (libraries, etc.) per module.""" - # Modules that use CUDA driver APIs need to link against libcuda - # _resource_handles: contains the C++ implementation that calls CUDA driver - # _context, _stream, _event, _device: use resource handles and may call CUDA driver directly - cuda_users = {"_resource_handles", "_context", "_stream", "_event", "_device"} kwargs = {} - if mod_name in cuda_users: - kwargs["libraries"] = ["cuda"] - kwargs["library_dirs"] = get_cuda_library_dirs() + + # _resource_handles.cpp uses dlopen/dlsym on Linux, which requires -ldl on glibc < 2.34. + # (On Windows it uses LoadLibrary/GetProcAddress; on macOS dlopen is in libSystem.) 
+    if sys.platform.startswith("linux") and mod_name == "_resource_handles":
+        kwargs["libraries"] = ["dl"]
+
         return kwargs
 
     ext_modules = tuple(
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index c4b574f6a7..3ad8ea5dc4 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -5,13 +5,148 @@
 #include
 #include "resource_handles.hpp"
+#include "resource_handles_cxx_api.hpp"
 #include
 #include
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include <atomic>
+#include <mutex>
 #include
 #include
 
 namespace cuda_core {
 
+// ============================================================================
+// CUDA driver dynamic loading (CPU-only import + MVC compatibility)
+// ============================================================================
+
+namespace {
+
+#if defined(_WIN32)
+using LibHandle = HMODULE;
+
+static LibHandle open_libcuda() noexcept {
+    // CUDA driver DLL
+    return LoadLibraryA("nvcuda.dll");
+}
+
+static void* get_symbol(LibHandle lib, const char* name) noexcept {
+    return reinterpret_cast<void*>(GetProcAddress(lib, name));
+}
+#else
+using LibHandle = void*;
+
+static LibHandle open_libcuda() noexcept {
+    // Prefer the soname; fall back to the linker name.
+    LibHandle lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
+    if (!lib) {
+        lib = dlopen("libcuda.so", RTLD_NOW | RTLD_LOCAL);
+    }
+    return lib;
+}
+
+static void* get_symbol(LibHandle lib, const char* name) noexcept {
+    return dlsym(lib, name);
+}
+#endif
+
+static std::once_flag driver_load_once;
+static std::atomic<bool> driver_loaded{false};
+static LibHandle libcuda = nullptr;
+
+#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr
+
+DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain);
+DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease);
+DECLARE_DRIVER_FN(cuCtxGetCurrent);
+
+DECLARE_DRIVER_FN(cuStreamCreateWithPriority);
+DECLARE_DRIVER_FN(cuStreamDestroy);
+
+DECLARE_DRIVER_FN(cuEventCreate);
+DECLARE_DRIVER_FN(cuEventDestroy);
+DECLARE_DRIVER_FN(cuIpcOpenEventHandle);
+
+DECLARE_DRIVER_FN(cuDeviceGetCount);
+
+DECLARE_DRIVER_FN(cuMemPoolSetAccess);
+DECLARE_DRIVER_FN(cuMemPoolDestroy);
+DECLARE_DRIVER_FN(cuMemPoolCreate);
+DECLARE_DRIVER_FN(cuDeviceGetMemPool);
+DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle);
+
+DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync);
+DECLARE_DRIVER_FN(cuMemAllocAsync);
+DECLARE_DRIVER_FN(cuMemAlloc);
+DECLARE_DRIVER_FN(cuMemAllocHost);
+
+DECLARE_DRIVER_FN(cuMemFreeAsync);
+DECLARE_DRIVER_FN(cuMemFree);
+DECLARE_DRIVER_FN(cuMemFreeHost);
+
+DECLARE_DRIVER_FN(cuMemPoolImportPointer);
+
+#undef DECLARE_DRIVER_FN
+
+template <typename T>
+static bool load_symbol(const char* sym, T& fn) noexcept {
+    fn = reinterpret_cast<T>(get_symbol(libcuda, sym));
+    return fn != nullptr;
+}
+
+static bool load_driver_api() noexcept {
+    libcuda = open_libcuda();
+    if (!libcuda) {
+        return false;
+    }
+
+    bool ok = true;
+    ok &= load_symbol("cuDevicePrimaryCtxRetain", p_cuDevicePrimaryCtxRetain);
+    ok &= load_symbol("cuDevicePrimaryCtxRelease", p_cuDevicePrimaryCtxRelease);
+    ok &= load_symbol("cuCtxGetCurrent", p_cuCtxGetCurrent);
+
+    ok &= load_symbol("cuStreamCreateWithPriority", p_cuStreamCreateWithPriority);
+    ok &= load_symbol("cuStreamDestroy", p_cuStreamDestroy);
+
+    ok &= load_symbol("cuEventCreate", p_cuEventCreate);
+    ok &= load_symbol("cuEventDestroy", p_cuEventDestroy);
+    ok &= load_symbol("cuIpcOpenEventHandle", 
p_cuIpcOpenEventHandle);
+
+    ok &= load_symbol("cuDeviceGetCount", p_cuDeviceGetCount);
+
+    ok &= load_symbol("cuMemPoolSetAccess", p_cuMemPoolSetAccess);
+    ok &= load_symbol("cuMemPoolDestroy", p_cuMemPoolDestroy);
+    ok &= load_symbol("cuMemPoolCreate", p_cuMemPoolCreate);
+    ok &= load_symbol("cuDeviceGetMemPool", p_cuDeviceGetMemPool);
+    ok &= load_symbol("cuMemPoolImportFromShareableHandle", p_cuMemPoolImportFromShareableHandle);
+
+    ok &= load_symbol("cuMemAllocFromPoolAsync", p_cuMemAllocFromPoolAsync);
+    ok &= load_symbol("cuMemAllocAsync", p_cuMemAllocAsync);
+    ok &= load_symbol("cuMemAlloc", p_cuMemAlloc);
+    ok &= load_symbol("cuMemAllocHost", p_cuMemAllocHost);
+
+    ok &= load_symbol("cuMemFreeAsync", p_cuMemFreeAsync);
+    ok &= load_symbol("cuMemFree", p_cuMemFree);
+    ok &= load_symbol("cuMemFreeHost", p_cuMemFreeHost);
+
+    ok &= load_symbol("cuMemPoolImportPointer", p_cuMemPoolImportPointer);
+
+    return ok;
+}
+
+static bool ensure_driver_loaded() noexcept {
+    std::call_once(driver_load_once, []() { driver_loaded.store(load_driver_api()); });
+    return driver_loaded.load();
+}
+
+} // namespace
+
 // ============================================================================
 // Thread-local error handling
 // ============================================================================
@@ -118,6 +253,10 @@ ContextHandle create_context_handle_ref(CUcontext ctx) {
 thread_local std::vector<ContextHandle> primary_context_cache;
 
 ContextHandle get_primary_context(int device_id) noexcept {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     // Check thread-local cache
     if (static_cast<size_t>(device_id) < primary_context_cache.size()) {
         if (auto cached = primary_context_cache[device_id]) {
@@ -128,7 +267,7 @@ ContextHandle get_primary_context(int device_id) noexcept {
     // Cache miss - acquire primary context from driver
     GILReleaseGuard gil;
     CUcontext ctx;
-    if (CUDA_SUCCESS != (err = cuDevicePrimaryCtxRetain(&ctx, device_id))) {
+    if (CUDA_SUCCESS != (err = p_cuDevicePrimaryCtxRetain(&ctx, device_id))) {
         return {};
     }
 
@@ -136,7 +275,9 @@
         new ContextBox{ctx},
         [device_id](const ContextBox* b) {
             GILReleaseGuard gil;
-            cuDevicePrimaryCtxRelease(device_id);
+            if (ensure_driver_loaded()) {
+                p_cuDevicePrimaryCtxRelease(device_id);
+            }
             delete b;
         }
     );
@@ -151,9 +292,13 @@
 }
 
 ContextHandle get_current_context() noexcept {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUcontext ctx = nullptr;
-    if (CUDA_SUCCESS != (err = cuCtxGetCurrent(&ctx))) {
+    if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) {
         return {};
     }
     if (!ctx) {
@@ -171,9 +316,13 @@
 struct StreamBox {
 };
 
 StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUstream stream;
-    if (CUDA_SUCCESS != (err = cuStreamCreateWithPriority(&stream, flags, priority))) {
+    if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) {
         return {};
     }
 
@@ -181,7 +330,9 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int p
         new StreamBox{stream},
         [h_ctx](const StreamBox* b) {
             GILReleaseGuard gil;
-            cuStreamDestroy(b->resource);
+            if (ensure_driver_loaded()) {
+                p_cuStreamDestroy(b->resource);
+            }
             delete b;
         }
     );
@@ -227,9 +378,13 @@
 struct EventBox {
 };
 
 EventHandle 
create_event_handle(ContextHandle h_ctx, unsigned int flags) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUevent event; - if (CUDA_SUCCESS != (err = cuEventCreate(&event, flags))) { + if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { return {}; } @@ -237,7 +392,9 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { new EventBox{event}, [h_ctx](const EventBox* b) { GILReleaseGuard gil; - cuEventDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuEventDestroy(b->resource); + } delete b; } ); @@ -249,9 +406,13 @@ EventHandle create_event_handle(unsigned int flags) { } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUevent event; - if (CUDA_SUCCESS != (err = cuIpcOpenEventHandle(&event, ipc_handle))) { + if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { return {}; } @@ -259,7 +420,9 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { new EventBox{event}, [](const EventBox* b) { GILReleaseGuard gil; - cuEventDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuEventDestroy(b->resource); + } delete b; } ); @@ -277,8 +440,11 @@ struct MemoryPoolBox { // Helper to clear peer access before destroying a memory pool. // Works around nvbug 5698116: recycled pool handles inherit peer access state. static void clear_mempool_peer_access(CUmemoryPool pool) { + if (!ensure_driver_loaded()) { + return; + } int device_count = 0; - if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { return; } @@ -288,7 +454,7 @@ static void clear_mempool_peer_access(CUmemoryPool pool) { clear_access[i].location.id = i; clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; } - cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort + p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort } static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { @@ -297,7 +463,9 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { [](const MemoryPoolBox* b) { GILReleaseGuard gil; clear_mempool_peer_access(b->resource); - cuMemPoolDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuMemPoolDestroy(b->resource); + } delete b; } ); @@ -305,9 +473,13 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUmemoryPool pool; - if (CUDA_SUCCESS != (err = cuMemPoolCreate(&pool, &props))) { + if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { return {}; } return wrap_mempool_owned(pool); @@ -319,19 +491,27 @@ MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { } MemoryPoolHandle get_device_mempool(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUmemoryPool pool; - if (CUDA_SUCCESS != (err = cuDeviceGetMemPool(&pool, device_id))) { + if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { return {}; } return create_mempool_handle_ref(pool); } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + if (!ensure_driver_loaded()) { + err = 
CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUmemoryPool pool;
     auto handle_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(fd));
-    if (CUDA_SUCCESS != (err = cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) {
+    if (CUDA_SUCCESS != (err = p_cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) {
         return {};
     }
     return wrap_mempool_owned(pool);
@@ -362,9 +542,13 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) {
 }
 
 DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) {
         return {};
     }
 
@@ -372,7 +556,9 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
         new DevicePtrBox{ptr, h_stream},
         [h_pool](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -380,9 +566,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocAsync(&ptr, size, native(h_stream)))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, native(h_stream)))) {
         return {};
     }
 
@@ -390,7 +580,9 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
         new DevicePtrBox{ptr, h_stream},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -398,9 +590,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc(size_t size) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAlloc(&ptr, size))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
         return {};
     }
 
@@ -408,7 +604,9 @@ DevicePtrHandle deviceptr_alloc(size_t size) {
         new DevicePtrBox{ptr, StreamHandle{}},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFree(b->resource);
+            if (ensure_driver_loaded()) {
+                p_cuMemFree(b->resource);
+            }
             delete b;
         }
     );
@@ -416,9 +614,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc_host(size_t size) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     void* ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocHost(&ptr, size))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
         return {};
     }
 
@@ -426,7 +628,9 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) {
         new DevicePtrBox{reinterpret_cast<CUdeviceptr>(ptr), StreamHandle{}},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeHost(reinterpret_cast<void*>(b->resource));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeHost(reinterpret_cast<void*>(b->resource));
+            }
             delete b;
         }
     );
@@ -473,11 +677,15 @@
 static std::mutex ipc_ptr_cache_mutex;
 static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
 
 DevicePtrHandle 
deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
-    if (CUDA_SUCCESS != (err = cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+    if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
        return {};
     }
 
@@ -502,7 +710,9 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
                 ipc_ptr_cache.erase(ptr);
             }
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -515,7 +725,9 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         new DevicePtrBox{ptr, h_stream},
         [h_pool](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -523,4 +735,60 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
     }
 }
 
+// ============================================================================
+// Capsule C++ API table
+// ============================================================================
+
+const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept {
+    static const ResourceHandlesCxxApiV1 table = []() {
+        ResourceHandlesCxxApiV1 t{};
+        t.abi_version = RESOURCE_HANDLES_CXX_API_VERSION;
+        t.struct_size = static_cast<std::uint32_t>(sizeof(ResourceHandlesCxxApiV1));
+
+        // Error handling
+        t.get_last_error = &get_last_error;
+        t.peek_last_error = &peek_last_error;
+        t.clear_last_error = &clear_last_error;
+
+        // Context
+        t.create_context_handle_ref = &create_context_handle_ref;
+        t.get_primary_context = &get_primary_context;
+        t.get_current_context = &get_current_context;
+
+        // Stream
+        t.create_stream_handle = &create_stream_handle;
+        t.create_stream_handle_ref = &create_stream_handle_ref;
+        t.create_stream_handle_with_owner = &create_stream_handle_with_owner;
+        t.get_legacy_stream = &get_legacy_stream;
+        t.get_per_thread_stream = &get_per_thread_stream;
+
+        // Event (resolve overloads explicitly)
+        t.create_event_handle =
+            static_cast<EventHandle (*)(ContextHandle, unsigned int)>(&create_event_handle);
+        t.create_event_handle_noctx =
+            static_cast<EventHandle (*)(unsigned int)>(&create_event_handle);
+        t.create_event_handle_ipc = &create_event_handle_ipc;
+
+        // Memory pool
+        t.create_mempool_handle = &create_mempool_handle;
+        t.create_mempool_handle_ref = &create_mempool_handle_ref;
+        t.get_device_mempool = &get_device_mempool;
+        t.create_mempool_handle_ipc = &create_mempool_handle_ipc;
+
+        // Device pointer
+        t.deviceptr_alloc_from_pool = &deviceptr_alloc_from_pool;
+        t.deviceptr_alloc_async = &deviceptr_alloc_async;
+        t.deviceptr_alloc = &deviceptr_alloc;
+        t.deviceptr_alloc_host = &deviceptr_alloc_host;
+        t.deviceptr_create_ref = &deviceptr_create_ref;
+        t.deviceptr_create_with_owner = &deviceptr_create_with_owner;
+        t.deviceptr_import_ipc = &deviceptr_import_ipc;
+        t.deallocation_stream = &deallocation_stream;
+        t.set_deallocation_stream = &set_deallocation_stream;
+
+        return t;
+    }();
+    return &table;
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp
new file mode 100644
index 0000000000..5436b761f5
--- /dev/null
+++ 
b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp
@@ -0,0 +1,80 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+
+#include "resource_handles.hpp"
+
+namespace cuda_core {
+
+// C++ capsule API for cross-extension-module calls.
+//
+// The function-pointer table is exported from the Python extension module
+// `cuda.core.experimental._resource_handles` as a PyCapsule named:
+//
+//     "cuda.core.experimental._resource_handles._CXX_API"
+//
+// Other extension modules import the capsule and dispatch through the table to
+// ensure there is a single owner of all correctness-critical static/thread_local
+// state in resource_handles.cpp (caches, last-error state, etc.).
+
+static constexpr std::uint32_t RESOURCE_HANDLES_CXX_API_VERSION = 1;
+
+struct ResourceHandlesCxxApiV1 {
+    std::uint32_t abi_version;
+    std::uint32_t struct_size;
+
+    // Thread-local error handling
+    CUresult (*get_last_error)() noexcept;
+    CUresult (*peek_last_error)() noexcept;
+    void (*clear_last_error)() noexcept;
+
+    // Context handles
+    ContextHandle (*create_context_handle_ref)(CUcontext ctx);
+    ContextHandle (*get_primary_context)(int device_id) noexcept;
+    ContextHandle (*get_current_context)() noexcept;
+
+    // Stream handles
+    StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority);
+    StreamHandle (*create_stream_handle_ref)(CUstream stream);
+    StreamHandle (*create_stream_handle_with_owner)(CUstream stream, PyObject* owner);
+    StreamHandle (*get_legacy_stream)() noexcept;
+    StreamHandle (*get_per_thread_stream)() noexcept;
+
+    // Event handles
+    EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags);
+    EventHandle (*create_event_handle_noctx)(unsigned int flags);
+    EventHandle (*create_event_handle_ipc)(const CUipcEventHandle& ipc_handle);
+
+    // Memory pool handles
+    MemoryPoolHandle (*create_mempool_handle)(const CUmemPoolProps& props);
+    MemoryPoolHandle (*create_mempool_handle_ref)(CUmemoryPool pool);
+    MemoryPoolHandle (*get_device_mempool)(int device_id) noexcept;
+    MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, CUmemAllocationHandleType handle_type);
+
+    // Device pointer handles
+    DevicePtrHandle (*deviceptr_alloc_from_pool)(
+        size_t size,
+        MemoryPoolHandle h_pool,
+        StreamHandle h_stream);
+    DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream);
+    DevicePtrHandle (*deviceptr_alloc)(size_t size);
+    DevicePtrHandle (*deviceptr_alloc_host)(size_t size);
+    DevicePtrHandle (*deviceptr_create_ref)(CUdeviceptr ptr);
+    DevicePtrHandle (*deviceptr_create_with_owner)(CUdeviceptr ptr, PyObject* owner);
+    DevicePtrHandle (*deviceptr_import_ipc)(
+        MemoryPoolHandle h_pool,
+        const void* export_data,
+        StreamHandle h_stream);
+    StreamHandle (*deallocation_stream)(const DevicePtrHandle& h);
+    void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream);
+};
+
+// Return pointer to a process-wide singleton table. 
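+// The table is a function-local static in resource_handles.cpp, so its
+// initialization on first use is thread-safe (C++11 magic statics).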
+const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; + +} // namespace cuda_core + diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 72618e40d6..2305199ffc 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,14 +5,14 @@ from __future__ import annotations cimport cpython +from cpython.pycapsule cimport PyCapsule_Import from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context +from cuda.core.experimental._resource_handles_cxx_api cimport ResourceHandlesCxxApiV1 from cuda.core.experimental._resource_handles cimport ( ContextHandle, EventHandle, - create_event_handle, - create_event_handle_ipc, intptr, native, py, @@ -34,6 +34,23 @@ if TYPE_CHECKING: import cuda.bindings +cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" +cdef const ResourceHandlesCxxApiV1* _handles = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles() except NULL: + global _handles + if _handles == NULL: + _handles = PyCapsule_Import(_CXX_API_NAME, 0) + if _handles == NULL: + raise ImportError("Failed to import cuda.core.experimental._resource_handles._CXX_API capsule") + if _handles.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles.struct_size < cython.sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles + + @dataclass cdef class EventOptions: """Customizable :obj:`~_event.Event` options. @@ -116,7 +133,8 @@ cdef class Event: if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") # C++ creates the event and returns owning handle with context dependency - cdef EventHandle h_event = create_event_handle(h_context, flags) + cdef const ResourceHandlesCxxApiV1* handles = _get_handles() + cdef EventHandle h_event = handles.create_event_handle(h_context, flags) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event @@ -199,7 +217,8 @@ cdef class Event: memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) # IPC events: the originating process owns the event and its context - cdef EventHandle h_event = create_event_handle_ipc(data) + cdef const ResourceHandlesCxxApiV1* handles = _get_handles() + cdef EventHandle h_event = handles.create_event_handle_ipc(data) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 6395f21e2a..47fc1dc1c4 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -5,3 +5,23 @@ # This module exists to compile _cpp/resource_handles.cpp into a shared library. # The helper functions (native, intptr, py) are implemented as inline C++ functions # in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. + +from cpython.pycapsule cimport PyCapsule_New + +from cuda.core.experimental._resource_handles_cxx_api cimport ( + ResourceHandlesCxxApiV1, + get_resource_handles_cxx_api_v1, +) + + +cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" + +# Export the C++ handles dispatch table as a PyCapsule. 
+# Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. +cdef const ResourceHandlesCxxApiV1* _handles_table = get_resource_handles_cxx_api_v1() +if _handles_table == NULL: + raise RuntimeError("Failed to initialize resource handles C++ API table") + +_CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) +if _CXX_API is None: + raise RuntimeError("Failed to create _CXX_API capsule") diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd new file mode 100644 index 0000000000..f8a4874908 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uint32_t +from libc.stddef cimport size_t + +from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + DevicePtrHandle, + EventHandle, + MemoryPoolHandle, + StreamHandle, +) + + +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + From 
ab164569718066f8e34d1474c55a9418556663de Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 16 Dec 2025 14:31:54 -0800
Subject: [PATCH 24/38] Resolve CUDA driver entrypoints via cuda-bindings
 cuGetProcAddress

Use a lazy PyCapsule in _resource_handles to resolve and cache required
CUDA driver entrypoints via cuda.bindings.driver.cuGetProcAddress, and have
resource_handles.cpp consume that table on first use. This avoids
duplicating driver pathfinding logic and removes dlopen/dlsym linkage
requirements.
---
 cuda_core/build_hooks.py                      |  10 +-
 .../experimental/_cpp/resource_handles.cpp    | 192 +++++++++++-------
 .../core/experimental/_resource_handles.pyx   | 110 ++++++++++
 3 files changed, 229 insertions(+), 83 deletions(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 80a96e0bc2..1f1197e9bc 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -12,7 +12,6 @@
 import os
 import re
 import subprocess
-import sys
 
 from Cython.Build import cythonize
 from setuptools import Extension
@@ -99,14 +98,7 @@ def get_sources(mod_name):
     def get_extension_kwargs(mod_name):
         """Return Extension kwargs (libraries, etc.) per module."""
 
-        kwargs = {}
-
-        # _resource_handles.cpp uses dlopen/dlsym on Linux, which requires -ldl on glibc < 2.34.
-        # (On Windows it uses LoadLibrary/GetProcAddress; on macOS dlopen is in libSystem.)
-        if sys.platform.startswith("linux") and mod_name == "_resource_handles":
-            kwargs["libraries"] = ["dl"]
-
-        return kwargs
+        return {}
 
     ext_modules = tuple(
         Extension(
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 3ad8ea5dc4..a4f872566b 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -7,15 +7,7 @@
 #include "resource_handles.hpp"
 #include "resource_handles_cxx_api.hpp"
 #include
-#include
-
-#if defined(_WIN32)
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#include <atomic>
+#include <cstdint>
 #include
 #include
 
@@ -23,42 +15,13 @@ namespace cuda_core {
 
 // ============================================================================
-// CUDA driver dynamic loading (CPU-only import + MVC compatibility)
+// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC)
 // ============================================================================
 
 namespace {
 
-#if defined(_WIN32)
-using LibHandle = HMODULE;
-
-static LibHandle open_libcuda() noexcept {
-    // CUDA driver DLL
-    return LoadLibraryA("nvcuda.dll");
-}
-
-static void* get_symbol(LibHandle lib, const char* name) noexcept {
-    return reinterpret_cast<void*>(GetProcAddress(lib, name));
-}
-#else
-using LibHandle = void*;
-
-static LibHandle open_libcuda() noexcept {
-    // Prefer the soname; fall back to the linker name.
-    LibHandle lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
-    if (!lib) {
-        lib = dlopen("libcuda.so", RTLD_NOW | RTLD_LOCAL);
-    }
-    return lib;
-}
-
-static void* get_symbol(LibHandle lib, const char* name) noexcept {
-    return dlsym(lib, name);
-}
-#endif
-
 static std::once_flag driver_load_once;
-static std::atomic<bool> driver_loaded{false};
-static LibHandle libcuda = nullptr;
+static bool driver_loaded = false;
 
 #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr
 
@@ -94,55 +57,136 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer);
 
 #undef DECLARE_DRIVER_FN
 
-template <typename T>
-static bool load_symbol(const char* sym, T& fn) noexcept {
-    fn = reinterpret_cast<T>(get_symbol(libcuda, sym));
-    return fn != nullptr;
-}
-
 static bool load_driver_api() noexcept {
-    libcuda = open_libcuda();
-    if (!libcuda) {
+    if (!Py_IsInitialized() || _Py_IsFinalizing()) {
+        return false;
+    }
+
+    struct CudaDriverApiV1 {
+        std::uint32_t abi_version;
+        std::uint32_t struct_size;
+
+        std::uintptr_t cuDevicePrimaryCtxRetain;
+        std::uintptr_t cuDevicePrimaryCtxRelease;
+        std::uintptr_t cuCtxGetCurrent;
+
+        std::uintptr_t cuStreamCreateWithPriority;
+        std::uintptr_t cuStreamDestroy;
+
+        std::uintptr_t cuEventCreate;
+        std::uintptr_t cuEventDestroy;
+        std::uintptr_t cuIpcOpenEventHandle;
+
+        std::uintptr_t cuDeviceGetCount;
+
+        std::uintptr_t cuMemPoolSetAccess;
+        std::uintptr_t cuMemPoolDestroy;
+        std::uintptr_t cuMemPoolCreate;
+        std::uintptr_t cuDeviceGetMemPool;
+        std::uintptr_t cuMemPoolImportFromShareableHandle;
+
+        std::uintptr_t cuMemAllocFromPoolAsync;
+        std::uintptr_t cuMemAllocAsync;
+        std::uintptr_t cuMemAlloc;
+        std::uintptr_t cuMemAllocHost;
+
+        std::uintptr_t cuMemFreeAsync;
+        std::uintptr_t cuMemFree;
+        std::uintptr_t cuMemFreeHost;
+
+        std::uintptr_t cuMemPoolImportPointer;
+    };
+
+    static constexpr const char* capsule_name =
+        "cuda.core.experimental._resource_handles._CUDA_DRIVER_API_V1";
+
+    PyGILState_STATE gstate = PyGILState_Ensure();
+
+    // `_resource_handles` is already loaded (it exports the handle API capsule),
+    // so avoid import machinery and just grab the module object. 
+    PyObject* mod = PyImport_AddModule("cuda.core.experimental._resource_handles");  // borrowed
+    if (!mod) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+
+    PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule");  // new ref
+    if (!fn) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+
+    PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr);
+    Py_DECREF(fn);
+    if (!cap) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
         return false;
     }
 
-    bool ok = true;
-    ok &= load_symbol("cuDevicePrimaryCtxRetain", p_cuDevicePrimaryCtxRetain);
-    ok &= load_symbol("cuDevicePrimaryCtxRelease", p_cuDevicePrimaryCtxRelease);
-    ok &= load_symbol("cuCtxGetCurrent", p_cuCtxGetCurrent);
+    const auto* api = static_cast<const CudaDriverApiV1*>(PyCapsule_GetPointer(cap, capsule_name));
+    Py_DECREF(cap);
 
-    ok &= load_symbol("cuStreamCreateWithPriority", p_cuStreamCreateWithPriority);
-    ok &= load_symbol("cuStreamDestroy", p_cuStreamDestroy);
+    if (!api) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+    if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) {
+        PyGILState_Release(gstate);
+        return false;
+    }
 
-    ok &= load_symbol("cuEventCreate", p_cuEventCreate);
-    ok &= load_symbol("cuEventDestroy", p_cuEventDestroy);
-    ok &= load_symbol("cuIpcOpenEventHandle", p_cuIpcOpenEventHandle);
+#define LOAD_ADDR(name)                                    \
+    do {                                                   \
+        if (api->name == 0) {                              \
+            PyGILState_Release(gstate);                    \
+            return false;                                  \
+        }                                                  \
+        p_##name = reinterpret_cast<name##_t>(api->name);  \
+    } while (0)
 
-    ok &= load_symbol("cuDeviceGetCount", p_cuDeviceGetCount);
+    LOAD_ADDR(cuDevicePrimaryCtxRetain);
+    LOAD_ADDR(cuDevicePrimaryCtxRelease);
+    LOAD_ADDR(cuCtxGetCurrent);
 
-    ok &= load_symbol("cuMemPoolSetAccess", p_cuMemPoolSetAccess);
-    ok &= load_symbol("cuMemPoolDestroy", p_cuMemPoolDestroy);
-    ok &= load_symbol("cuMemPoolCreate", p_cuMemPoolCreate);
-    ok &= load_symbol("cuDeviceGetMemPool", p_cuDeviceGetMemPool);
-    ok &= load_symbol("cuMemPoolImportFromShareableHandle", p_cuMemPoolImportFromShareableHandle);
+    LOAD_ADDR(cuStreamCreateWithPriority);
+    LOAD_ADDR(cuStreamDestroy);
 
-    ok &= load_symbol("cuMemAllocFromPoolAsync", p_cuMemAllocFromPoolAsync);
-    ok &= load_symbol("cuMemAllocAsync", p_cuMemAllocAsync);
-    ok &= load_symbol("cuMemAlloc", p_cuMemAlloc);
-    ok &= load_symbol("cuMemAllocHost", p_cuMemAllocHost);
+    LOAD_ADDR(cuEventCreate);
+    LOAD_ADDR(cuEventDestroy);
+    LOAD_ADDR(cuIpcOpenEventHandle);
 
-    ok &= load_symbol("cuMemFreeAsync", p_cuMemFreeAsync);
-    ok &= load_symbol("cuMemFree", p_cuMemFree);
-    ok &= load_symbol("cuMemFreeHost", p_cuMemFreeHost);
+    LOAD_ADDR(cuDeviceGetCount);
 
-    ok &= load_symbol("cuMemPoolImportPointer", p_cuMemPoolImportPointer);
+    LOAD_ADDR(cuMemPoolSetAccess);
+    LOAD_ADDR(cuMemPoolDestroy);
+    LOAD_ADDR(cuMemPoolCreate);
+    LOAD_ADDR(cuDeviceGetMemPool);
+    LOAD_ADDR(cuMemPoolImportFromShareableHandle);
 
-    return ok;
+    LOAD_ADDR(cuMemAllocFromPoolAsync);
+    LOAD_ADDR(cuMemAllocAsync);
+    LOAD_ADDR(cuMemAlloc);
+    LOAD_ADDR(cuMemAllocHost);
+
+    LOAD_ADDR(cuMemFreeAsync);
+    LOAD_ADDR(cuMemFree);
+    LOAD_ADDR(cuMemFreeHost);
+
+    LOAD_ADDR(cuMemPoolImportPointer);
+
+#undef LOAD_ADDR
+
+    PyGILState_Release(gstate);
+    return true;
 }
 
 static bool ensure_driver_loaded() noexcept {
-    std::call_once(driver_load_once, []() { driver_loaded.store(load_driver_api()); });
-    return driver_loaded.load();
+    std::call_once(driver_load_once, []() { driver_loaded = load_driver_api(); });
+    return driver_loaded;
 }
 
 } // namespace
diff --git 
a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 47fc1dc1c4..8ddb44c175 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -7,14 +7,18 @@ # in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. from cpython.pycapsule cimport PyCapsule_New +from libc.stdint cimport uint32_t, uint64_t, uintptr_t from cuda.core.experimental._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) +import cython + cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" +cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core.experimental._resource_handles._CUDA_DRIVER_API_V1" # Export the C++ handles dispatch table as a PyCapsule. # Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. @@ -25,3 +29,109 @@ if _handles_table == NULL: _CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) if _CXX_API is None: raise RuntimeError("Failed to create _CXX_API capsule") + + +cdef struct CudaDriverApiV1: + uint32_t abi_version + uint32_t struct_size + + uintptr_t cuDevicePrimaryCtxRetain + uintptr_t cuDevicePrimaryCtxRelease + uintptr_t cuCtxGetCurrent + + uintptr_t cuStreamCreateWithPriority + uintptr_t cuStreamDestroy + + uintptr_t cuEventCreate + uintptr_t cuEventDestroy + uintptr_t cuIpcOpenEventHandle + + uintptr_t cuDeviceGetCount + + uintptr_t cuMemPoolSetAccess + uintptr_t cuMemPoolDestroy + uintptr_t cuMemPoolCreate + uintptr_t cuDeviceGetMemPool + uintptr_t cuMemPoolImportFromShareableHandle + + uintptr_t cuMemAllocFromPoolAsync + uintptr_t cuMemAllocAsync + uintptr_t cuMemAlloc + uintptr_t cuMemAllocHost + + uintptr_t cuMemFreeAsync + uintptr_t cuMemFree + uintptr_t cuMemFreeHost + + uintptr_t cuMemPoolImportPointer + + +cdef CudaDriverApiV1 _cuda_driver_api_v1 +cdef bint _cuda_driver_api_v1_inited = False + + +cdef inline uintptr_t _as_addr(object pfn) except 0: + return int(pfn) + + +cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: + err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) + if int(err) != 0 or pfn is None: + raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") + return _as_addr(pfn) + + +def _get_cuda_driver_api_v1_capsule(): + """Return a PyCapsule containing cached CUDA driver entrypoints. + + This is evaluated lazily on first use so cuda-core remains importable on + CPU-only machines. 
+ """ + global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 + if not _cuda_driver_api_v1_inited: + import cuda.bindings.driver as d + + err, ver = d.cuDriverGetVersion() + if int(err) != 0: + raise RuntimeError(f"cuDriverGetVersion failed: {err}") + driver_ver = int(ver) + + flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT + + _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") + _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") + _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") + + _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") + _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") + + _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") + _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") + _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") + + _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") + + _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") + _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") + _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") + _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") + _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( + d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" + ) + + _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") + _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") + _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, b"cuMemAlloc") + _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") + + _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") + _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") + _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") + + _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") + + _cuda_driver_api_v1.abi_version = 1 + _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) + _cuda_driver_api_v1_inited = True + + return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) From 3fafe926952699b825b4db316c5696550640a453 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 15:21:09 -0800 Subject: [PATCH 25/38] Centralize resource handles capsule dispatch in _resource_handles.pxd Hide _CXX_API capsule import/version checks behind inline pxd wrappers so call sites stay clean, and remove redundant ensure_driver_loaded() checks in C++ deleters. 
--- .../experimental/_cpp/resource_handles.cpp | 47 +-- cuda_core/cuda/core/experimental/_event.pyx | 27 +- .../cuda/core/experimental/_memoryview.pyx | 4 +- .../core/experimental/_resource_handles.pxd | 344 +++++++++++------- cuda_core/cuda/core/experimental/_stream.pyx | 7 +- 5 files changed, 227 insertions(+), 202 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a4f872566b..9f4e21ca24 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -319,9 +319,7 @@ ContextHandle get_primary_context(int device_id) noexcept { new ContextBox{ctx}, [device_id](const ContextBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuDevicePrimaryCtxRelease(device_id); - } + p_cuDevicePrimaryCtxRelease(device_id); delete b; } ); @@ -374,9 +372,7 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int p new StreamBox{stream}, [h_ctx](const StreamBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuStreamDestroy(b->resource); - } + p_cuStreamDestroy(b->resource); delete b; } ); @@ -436,9 +432,7 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { new EventBox{event}, [h_ctx](const EventBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuEventDestroy(b->resource); - } + p_cuEventDestroy(b->resource); delete b; } ); @@ -464,9 +458,7 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { new EventBox{event}, [](const EventBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuEventDestroy(b->resource); - } + p_cuEventDestroy(b->resource); delete b; } ); @@ -484,9 +476,6 @@ struct MemoryPoolBox { // Helper to clear peer access before destroying a memory pool. // Works around nvbug 5698116: recycled pool handles inherit peer access state. 
static void clear_mempool_peer_access(CUmemoryPool pool) { - if (!ensure_driver_loaded()) { - return; - } int device_count = 0; if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { return; @@ -507,9 +496,7 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { [](const MemoryPoolBox* b) { GILReleaseGuard gil; clear_mempool_peer_access(b->resource); - if (ensure_driver_loaded()) { - p_cuMemPoolDestroy(b->resource); - } + p_cuMemPoolDestroy(b->resource); delete b; } ); @@ -600,9 +587,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -624,9 +609,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { new DevicePtrBox{ptr, h_stream}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -648,9 +631,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) { new DevicePtrBox{ptr, StreamHandle{}}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFree(b->resource); - } + p_cuMemFree(b->resource); delete b; } ); @@ -672,9 +653,7 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) { new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeHost(reinterpret_cast(b->resource)); - } + p_cuMemFreeHost(reinterpret_cast(b->resource)); delete b; } ); @@ -754,9 +733,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export ipc_ptr_cache.erase(ptr); } GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -769,9 +746,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 2305199ffc..72618e40d6 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,14 +5,14 @@ from __future__ import annotations cimport cpython -from cpython.pycapsule cimport PyCapsule_Import from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context -from cuda.core.experimental._resource_handles_cxx_api cimport ResourceHandlesCxxApiV1 from cuda.core.experimental._resource_handles cimport ( ContextHandle, EventHandle, + create_event_handle, + create_event_handle_ipc, intptr, native, py, @@ -34,23 +34,6 @@ if TYPE_CHECKING: import cuda.bindings -cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" -cdef const ResourceHandlesCxxApiV1* _handles = NULL - - -cdef inline const ResourceHandlesCxxApiV1* _get_handles() except NULL: - global _handles - if _handles == NULL: - _handles = PyCapsule_Import(_CXX_API_NAME, 0) - if _handles == NULL: - raise ImportError("Failed to import 
cuda.core.experimental._resource_handles._CXX_API capsule") - if _handles.abi_version != 1: - raise ImportError("Unsupported resource handles C++ API version") - if _handles.struct_size < cython.sizeof(ResourceHandlesCxxApiV1): - raise ImportError("Resource handles C++ API table is too small") - return _handles - - @dataclass cdef class EventOptions: """Customizable :obj:`~_event.Event` options. @@ -133,8 +116,7 @@ cdef class Event: if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") # C++ creates the event and returns owning handle with context dependency - cdef const ResourceHandlesCxxApiV1* handles = _get_handles() - cdef EventHandle h_event = handles.create_event_handle(h_context, flags) + cdef EventHandle h_event = create_event_handle(h_context, flags) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event @@ -217,8 +199,7 @@ cdef class Event: memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) # IPC events: the originating process owns the event and its context - cdef const ResourceHandlesCxxApiV1* handles = _get_handles() - cdef EventHandle h_event = handles.create_event_handle_ipc(data) + cdef EventHandle h_event = create_event_handle_ipc(data) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9f1119894a..443929b27e 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -16,7 +16,7 @@ import numpy from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( EventHandle, - create_event_handle, + create_event_handle_noctx, native, ) from cuda.core.experimental._utils.cuda_utils import handle_return, driver @@ -596,7 +596,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord( native(h_event), producer_s)) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 6aa204efc6..5bfc1821e4 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -2,169 +2,237 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t +from libc.stddef cimport size_t +from libc.stdint cimport intptr_t, uint32_t from libcpp.memory cimport shared_ptr +from cpython.pycapsule cimport PyCapsule_Import + from cuda.bindings cimport cydriver -# Declare the C++ namespace and types +# Declare the C++ namespace and types (inline helpers live in the header). 
cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": - # ======================================================================== - # Thread-local error handling - # ======================================================================== - cydriver.CUresult get_last_error() nogil - cydriver.CUresult peek_last_error() nogil - void clear_last_error() nogil - - # ======================================================================== - # Context Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle - - # Function to create a non-owning context handle (references existing context) - ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil - - # Context acquisition functions (pure C++, nogil-safe with thread-local caching) - ContextHandle get_primary_context(int device_id) nogil - ContextHandle get_current_context() nogil - - # ======================================================================== - # Stream Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUstream] StreamHandle - - # Create an owning stream handle via cuStreamCreateWithPriority - # Context handle establishes structural dependency (context outlives stream) - # Returns empty handle on error (caller must check) - StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) nogil - - # Create a non-owning stream handle (stream NOT destroyed when handle released) - # Caller is responsible for keeping the stream's context alive - StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil - - # Create non-owning handle that prevents Python owner from being GC'd - # Owner is responsible for keeping the stream's context alive - StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) - - # Get non-owning handle to the legacy default stream (no context dependency) - StreamHandle get_legacy_stream() nogil - - # Get non-owning handle to the per-thread default stream (no context dependency) - StreamHandle get_per_thread_stream() nogil - - # ======================================================================== - # Event Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUevent] EventHandle - - # Create an owning event handle via cuEventCreate - # Context handle establishes structural dependency (context outlives event) - # Returns empty handle on error (caller must check) - EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil - - # Create an owning event handle without context dependency - # Use for temporary events that are created and destroyed in the same scope - # Returns empty handle on error (caller must check) - EventHandle create_event_handle(unsigned int flags) nogil - - # Create an owning event handle from IPC handle - # The originating process owns the event and its context - # Returns empty handle on error (caller must check) - EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil - - # ======================================================================== - # Memory Pool Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle - - # Create an owning memory pool handle via cuMemPoolCreate - # Memory pools are device-scoped (not context-scoped) - # 
Returns empty handle on error (caller must check) - MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) nogil - - # Create a non-owning memory pool handle (pool NOT destroyed when released) - # Use for device default/current pools managed by the driver - MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) nogil - - # Get non-owning handle to the current memory pool for a device - # Returns empty handle on error (caller must check) - MemoryPoolHandle get_device_mempool(int device_id) nogil - - # Create an owning memory pool handle from IPC import - # File descriptor NOT owned by this handle (caller manages FD separately) - # Returns empty handle on error (caller must check) - MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil - - # ======================================================================== - # Device Pointer Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle - # Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync - # Pool handle is captured in deleter to keep pool alive - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_from_pool( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) nogil - - # Allocate device memory asynchronously via cuMemAllocAsync - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) nogil - - # Allocate device memory synchronously via cuMemAlloc - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc(size_t size) nogil - - # Allocate pinned host memory via cuMemAllocHost - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_host(size_t size) nogil - - # Create a non-owning device pointer handle (pointer NOT freed when released) - # Use for foreign pointers from external libraries - DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil - - # Create non-owning handle that prevents Python owner from being GC'd - # Pointer NOT freed when released; owner's refcount decremented on release - # If owner is None, equivalent to deviceptr_create_ref - DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) - - # Import a device pointer from IPC via cuMemPoolImportPointer - # Note: Does not yet implement reference counting for nvbug 5570902 - # On error, returns empty handle and sets thread-local error (use get_last_error()) - DevicePtrHandle deviceptr_import_ipc( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream) nogil - - # Access the deallocation stream for a device pointer handle (read-only) - StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil - - # Set the deallocation stream for a device pointer handle - void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) nogil - - # ======================================================================== - # Overloaded helper functions (C++ handles dispatch by type) - # ======================================================================== - - # native() - extract the raw CUDA handle + # native() - extract the raw CUDA handle (inline C++) cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil cydriver.CUmemoryPool 
native(MemoryPoolHandle h) nogil cydriver.CUdeviceptr native(DevicePtrHandle h) nogil - # intptr() - extract handle as intptr_t for Python interop - # Using signed intptr_t per C standard convention and issue #1342 + # intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t intptr(ContextHandle h) nogil intptr_t intptr(StreamHandle h) nogil intptr_t intptr(EventHandle h) nogil intptr_t intptr(MemoryPoolHandle h) nogil intptr_t intptr(DevicePtrHandle h) nogil - # py() - convert handle to Python driver wrapper object (requires GIL) + # py() - convert handle to Python driver wrapper object (inline C++; requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) object py(MemoryPoolHandle h) object py(DevicePtrHandle h) + + +# The resource handles API table is exported from `cuda.core.experimental._resource_handles` +# as a PyCapsule named: +# +# "cuda.core.experimental._resource_handles._CXX_API" +# +# Consumers dispatch through this table to avoid relying on RTLD_GLOBAL and to +# ensure a single owner of correctness-critical static/thread_local state. +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + + +cdef const 
ResourceHandlesCxxApiV1* _handles_table = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogil: + global _handles_table + if _handles_table == NULL: + with gil: + if _handles_table == NULL: + _handles_table = PyCapsule_Import( + b"cuda.core.experimental._resource_handles._CXX_API", 0 + ) + if _handles_table == NULL: + raise ImportError("Failed to import cuda.core.experimental._resource_handles._CXX_API capsule") + if _handles_table.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles_table.struct_size < sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles_table + + +# ----------------------------------------------------------------------------- +# Dispatch wrappers (hide capsule init from consumers) +# ----------------------------------------------------------------------------- + +cdef inline cydriver.CUresult get_last_error() except * nogil: + return _get_handles_table().get_last_error() + + +cdef inline cydriver.CUresult peek_last_error() except * nogil: + return _get_handles_table().peek_last_error() + + +cdef inline void clear_last_error() except * nogil: + _get_handles_table().clear_last_error() + + +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except * nogil: + return _get_handles_table().create_context_handle_ref(ctx) + + +cdef inline ContextHandle get_primary_context(int device_id) except * nogil: + return _get_handles_table().get_primary_context(device_id) + + +cdef inline ContextHandle get_current_context() except * nogil: + return _get_handles_table().get_current_context() + + +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) except * nogil: + return _get_handles_table().create_stream_handle(h_ctx, flags, priority) + + +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) except * nogil: + return _get_handles_table().create_stream_handle_ref(stream) + + +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) except *: + return _get_handles_table().create_stream_handle_with_owner(stream, owner) + + +cdef inline StreamHandle get_legacy_stream() except * nogil: + return _get_handles_table().get_legacy_stream() + + +cdef inline StreamHandle get_per_thread_stream() except * nogil: + return _get_handles_table().get_per_thread_stream() + + +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) except * nogil: + return _get_handles_table().create_event_handle(h_ctx, flags) + + +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) except * nogil: + return _get_handles_table().create_event_handle_noctx(flags) + + +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) except * nogil: + return _get_handles_table().create_event_handle_ipc(ipc_handle) + + +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) except * nogil: + return _get_handles_table().create_mempool_handle(props) + + +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) except * nogil: + return _get_handles_table().create_mempool_handle_ref(pool) + + +cdef inline MemoryPoolHandle get_device_mempool(int device_id) except * nogil: + return _get_handles_table().get_device_mempool(device_id) + + +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, 
cydriver.CUmemAllocationHandleType handle_type) except * nogil: + return _get_handles_table().create_mempool_handle_ipc(fd, handle_type) + + +cdef inline DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_alloc_from_pool(size, h_pool, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_alloc_async(size, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) except * nogil: + return _get_handles_table().deviceptr_alloc(size) + + +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) except * nogil: + return _get_handles_table().deviceptr_alloc_host(size) + + +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except * nogil: + return _get_handles_table().deviceptr_create_ref(ptr) + + +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except *: + return _get_handles_table().deviceptr_create_with_owner(ptr, owner) + + +cdef inline DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_import_ipc(h_pool, export_data, h_stream) + + +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) except * nogil: + return _get_handles_table().deallocation_stream(h) + + +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) except * nogil: + _get_handles_table().set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 87d78eba17..6fb4c79bd5 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -30,7 +30,7 @@ from cuda.core.experimental._resource_handles cimport ( EventHandle, StreamHandle, create_context_handle_ref, - create_event_handle, + create_event_handle_noctx, create_stream_handle, create_stream_handle_with_owner, get_current_context, @@ -303,7 +303,7 @@ cdef class Stream: ) from e # Wait on stream via temporary event - h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? @@ -414,7 +414,8 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: cdef cydriver.CUcontext ctx if not self._h_context: HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) - self._h_context = create_context_handle_ref(ctx) + with gil: + self._h_context = create_context_handle_ref(ctx) return 0 From ba139f34976e806be48f68251565e9d3bd2151a9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 15:25:15 -0800 Subject: [PATCH 26/38] Drop RTLD_GLOBAL import for _resource_handles Resource handle consumers now dispatch through the exported PyCapsule table, so _resource_handles no longer needs to be loaded with RTLD_GLOBAL. 
--- cuda_core/cuda/core/experimental/__init__.py | 20 +++++-------------- .../_cpp/resource_handles_cxx_api.hpp | 1 - .../_resource_handles_cxx_api.pxd | 1 - 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index ac0627222b..ead15c92e2 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,21 +14,11 @@ import importlib import sys -# Load _resource_handles with RTLD_GLOBAL so its C++ symbols are available -# to other extension modules that depend on them (_context, _device, etc.) -# This must happen before importing any dependent modules. -if sys.platform != "win32": - import os - - _old_dlopen_flags = sys.getdlopenflags() - sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_NOW) - try: - from cuda.core.experimental import _resource_handles # noqa: F401 - finally: - sys.setdlopenflags(_old_dlopen_flags) - del _old_dlopen_flags, os -else: - from cuda.core.experimental import _resource_handles # noqa: F401 +# Import the resource handles module early. +# +# Other extension modules access its functionality via the exported PyCapsule +# dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). +from cuda.core.experimental import _resource_handles # noqa: F401 subdir = f"cu{cuda_major}" try: diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp index 5436b761f5..11e458603b 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp @@ -77,4 +77,3 @@ struct ResourceHandlesCxxApiV1 { const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; } // namespace cuda_core - diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd index f8a4874908..f14fa7e730 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -66,4 +66,3 @@ cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil - From bac302b4a07e07ad6a9f2808e5a18614c30e199f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 16:20:15 -0800 Subject: [PATCH 27/38] Fix Python 3.13 finalization check Use public Py_IsFinalizing() API instead of removed _Py_IsFinalizing(). 
--- cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 9f4e21ca24..5c2a70eced 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -58,7 +58,7 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return false; } @@ -225,7 +225,7 @@ class GILReleaseGuard { public: GILReleaseGuard() : tstate_(nullptr), released_(false) { // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return; } // PyGILState_Check() returns 1 if the GIL is held by this thread. @@ -256,7 +256,7 @@ class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return; } gstate_ = PyGILState_Ensure(); From 9d5a010ba37ea394b0f9c331f1568aafbefe004b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 06:57:03 -0800 Subject: [PATCH 28/38] Fix finalization check across Python versions Use Py_IsFinalizing() on Python 3.13+ and fall back to _Py_IsFinalizing() on older versions. --- .../experimental/_cpp/resource_handles.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 5c2a70eced..62c5b0eff7 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -23,6 +23,20 @@ namespace { static std::once_flag driver_load_once; static bool driver_loaded = false; +#if PY_VERSION_HEX < 0x030D0000 +extern "C" int _Py_IsFinalizing(void); +#endif + +static inline bool py_is_finalizing() noexcept { +#if PY_VERSION_HEX >= 0x030D0000 + return Py_IsFinalizing(); +#else + // Python < 3.13 does not expose Py_IsFinalizing() publicly. Use the private + // API that exists in those versions. + return _Py_IsFinalizing() != 0; +#endif +} + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -58,7 +72,7 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return false; } @@ -225,7 +239,7 @@ class GILReleaseGuard { public: GILReleaseGuard() : tstate_(nullptr), released_(false) { // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return; } // PyGILState_Check() returns 1 if the GIL is held by this thread. 
@@ -256,7 +270,7 @@ class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return; } gstate_ = PyGILState_Ensure(); From 3b45f7c4c48a27d06a0cd26082d1730568779b9f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 07:53:36 -0800 Subject: [PATCH 29/38] Fix circular import for _resource_handles Use a relative import in cuda.core.experimental.__init__ to avoid failing imports from partially-initialized packages during test collection. --- cuda_core/cuda/core/experimental/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 7eb3611a03..008426740e 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -18,7 +18,11 @@ # # Other extension modules access its functionality via the exported PyCapsule # dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). -from cuda.core.experimental import _resource_handles # noqa: F401 +# +# Use a relative import to avoid circular-import issues when `cuda.core.experimental` +# is still being initialized (e.g. when importing submodules like +# `cuda.core.experimental._utils.cuda_utils`). +from . import _resource_handles # noqa: F401 subdir = f"cu{cuda_major}" try: From dd07ea88ff4cf1853aaa0b39327f23c6e6e96f95 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 11:21:30 -0800 Subject: [PATCH 30/38] Fix circular import in _resource_handles module Use relative cimports instead of fully-qualified cimports to prevent Cython from generating code that imports the parent package during module initialization, which caused circular import errors. --- cuda_core/cuda/core/experimental/_resource_handles.pyx | 2 +- cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 8ddb44c175..ed6d286abe 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -9,7 +9,7 @@ from cpython.pycapsule cimport PyCapsule_New from libc.stdint cimport uint32_t, uint64_t, uintptr_t -from cuda.core.experimental._resource_handles_cxx_api cimport ( +from ._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd index f14fa7e730..da3d8d4fd3 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -6,7 +6,7 @@ from libc.stdint cimport uint32_t from libc.stddef cimport size_t from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport ( +from ._resource_handles cimport ( ContextHandle, DevicePtrHandle, EventHandle, From 280665f2147cc42a9c4c739a3aa8cc4dbb09aebd Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 14:00:33 -0800 Subject: [PATCH 31/38] Fix circular import by using importlib.import_module Replace relative import `from . 
import _resource_handles` with `importlib.import_module("cuda.core.experimental._resource_handles")` to avoid circular import issues during package initialization. The relative import can fail with "partially initialized module" errors on some Python versions (e.g., Python 3.10) when the package is still being initialized. Using importlib.import_module with an absolute path bypasses the relative import machinery and avoids this issue. --- cuda_core/cuda/core/experimental/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 008426740e..b6ed4df302 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,15 +14,14 @@ import importlib import sys -# Import the resource handles module early. +# The _resource_handles module exports a PyCapsule dispatch table that other +# extension modules access via PyCapsule_Import. We import it here to ensure +# it's loaded before other modules try to use it. # -# Other extension modules access its functionality via the exported PyCapsule -# dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). -# -# Use a relative import to avoid circular-import issues when `cuda.core.experimental` -# is still being initialized (e.g. when importing submodules like -# `cuda.core.experimental._utils.cuda_utils`). -from . import _resource_handles # noqa: F401 +# We use importlib.import_module with the full path to avoid triggering +# circular import issues that can occur with relative imports during +# package initialization. +_resource_handles = importlib.import_module("cuda.core.experimental._resource_handles") subdir = f"cu{cuda_major}" try: From 0f89baa92770402a20802b6560017d01cee1d915 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 10:17:49 -0800 Subject: [PATCH 32/38] Fix wheel merge script to keep _resource_handles module The wheel merge script was removing _resource_handles.cpython-*.so during the merge process because it only kept a small set of files at the cuda/core/ top level. However, _resource_handles is shared code (not CUDA-version-specific) and must remain at the top level because it's imported early in __init__.py before versioned code. Also keep _cpp/ directory for Cython development headers. --- ci/tools/merge_cuda_core_wheels.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 87e2df13a7..e5320e9142 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -150,15 +150,21 @@ def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool "__init__.py", "_version.py", "_include", + "_cpp", # Headers for Cython development "cu12", "cu13", ) + # _resource_handles is shared (not CUDA-version-specific) and must stay + # at top level. It's imported early in __init__.py before versioned code. 
+    items_to_keep_prefix = ("_resource_handles",)
     all_items = os.scandir(base_wheel / base_dir)
     removed_count = 0
     for f in all_items:
         f_abspath = f.path
         if f.name in items_to_keep:
             continue
+        if any(f.name.startswith(prefix) for prefix in items_to_keep_prefix):
+            continue
         if f.is_dir():
             print(f"  Removing directory: {f.name}", file=sys.stderr)
             shutil.rmtree(f_abspath)

From 5e437b294fa2e527dadc63d8cd9b146cbd232f72 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 10:38:29 -0800
Subject: [PATCH 33/38] Fix IPC pointer cache to use export data as key

The cache was using the returned pointer as the key, but checking the
cache after calling cuMemPoolImportPointer. This caused duplicate
imports to fail with CUDA_ERROR_ALREADY_MAPPED before the cache check.

Fix by using the export_data bytes (CUmemPoolPtrExportData) as the
cache key and checking the cache BEFORE calling cuMemPoolImportPointer.
---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 62 +++++++++++++++----
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 4f660fe8ef..538c220f98 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -8,6 +8,7 @@
 #include "resource_handles_cxx_api.hpp"
 #include
 #include
+#include <cstring>
 #include
 #include
 #include
@@ -704,58 +705,97 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
 // The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
 // was imported multiple times. We work around this by caching imported pointers
 // and returning the same handle for duplicate imports.
+//
+// The cache key is the export_data bytes (CUmemPoolPtrExportData), not the
+// returned pointer, because we must check the cache BEFORE calling
+// cuMemPoolImportPointer (which fails with CUDA_ERROR_ALREADY_MAPPED if
+// the pointer is already imported).
 // TODO: When driver fix is available, add version check here to bypass cache.
 static bool use_ipc_ptr_cache() {
     return true;
 }
 
+// Wrapper for CUmemPoolPtrExportData to use as map key
+struct ExportDataKey {
+    CUmemPoolPtrExportData data;
+
+    bool operator==(const ExportDataKey& other) const {
+        return std::memcmp(&data, &other.data, sizeof(data)) == 0;
+    }
+};
+
+struct ExportDataKeyHash {
+    std::size_t operator()(const ExportDataKey& key) const {
+        // Simple hash of the bytes
+        std::size_t h = 0;
+        const auto* bytes = reinterpret_cast<const unsigned char*>(&key.data);
+        for (std::size_t i = 0; i < sizeof(key.data); ++i) {
+            h = h * 31 + bytes[i];
+        }
+        return h;
+    }
+};
+
 static std::mutex ipc_ptr_cache_mutex;
-static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
+static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
 
 DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
     if (!ensure_driver_loaded()) {
         err = CUDA_ERROR_NOT_INITIALIZED;
         return {};
     }
-    GILReleaseGuard gil;
-    CUdeviceptr ptr;
+
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
-    if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
-        return {};
-    }
 
     if (use_ipc_ptr_cache()) {
+        // Check cache BEFORE calling cuMemPoolImportPointer
+        ExportDataKey key;
+        std::memcpy(&key.data, data, sizeof(key.data));
+
         std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-        // Check for existing handle
-        auto it = ipc_ptr_cache.find(ptr);
+        auto it = ipc_ptr_cache.find(key);
         if (it != ipc_ptr_cache.end()) {
             if (auto box = it->second.lock()) {
+                // Cache hit - return existing handle
                 return DevicePtrHandle(box, &box->resource);
             }
             ipc_ptr_cache.erase(it);  // Expired entry
         }
 
+        // Cache miss - import the pointer
+        GILReleaseGuard gil;
+        CUdeviceptr ptr;
+        if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+            return {};
+        }
+
         // Create new handle with cache-clearing deleter
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
-            [h_pool, ptr](DevicePtrBox* b) {
+            [h_pool, key](DevicePtrBox* b) {
                 {
                     std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-                    ipc_ptr_cache.erase(ptr);
+                    ipc_ptr_cache.erase(key);
                 }
                 GILReleaseGuard gil;
                 p_cuMemFreeAsync(b->resource, native(b->h_stream));
                 delete b;
             }
         );
-        ipc_ptr_cache[ptr] = box;
+        ipc_ptr_cache[key] = box;
         return DevicePtrHandle(box, &box->resource);
     } else {
         // No caching - simple handle creation
+        GILReleaseGuard gil;
+        CUdeviceptr ptr;
+        if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+            return {};
+        }
+
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
             [h_pool](DevicePtrBox* b) {

From 9bde6a27347227c0c39c753e2a410eda6dbda86b Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 11:14:00 -0800
Subject: [PATCH 34/38] Improve IPC pointer cache comments and fix race condition

- Clarify that the cache handles two different memory type behaviors:
  memory pool allocations (nvbug 5570902) and pinned memory (ALREADY_MAPPED)
- Fix race condition in deleter: only erase cache entry if expired,
  avoiding erasure of a new entry created by another thread
- Move GILReleaseGuard before mutex acquisition in deleter
---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 538c220f98..88a3f1ff05 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -699,19 +699,31 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
 }
 // ============================================================================
-// IPC Pointer Cache (workaround for nvbug 5570902)
+// IPC Pointer Cache
 // ============================================================================
-// IPC-imported pointers are not correctly reference counted by the driver.
-// The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
-// was imported multiple times. We work around this by caching imported pointers
-// and returning the same handle for duplicate imports.
+// This cache handles duplicate IPC imports, which behave differently depending
+// on the memory type:
+//
+// 1. Memory pool allocations (DeviceMemoryResource):
+//    Multiple imports of the same allocation succeed and return duplicate
+//    pointers. However, the driver has a reference counting bug (nvbug 5570902)
+//    where the first cuMemFreeAsync incorrectly unmaps the memory even when
+//    imported multiple times. A driver fix is expected.
+//
+// 2. Pinned memory allocations (PinnedMemoryResource):
+//    Duplicate imports result in CUDA_ERROR_ALREADY_MAPPED.
+//
+// The cache solves both issues by checking the cache before calling
+// cuMemPoolImportPointer and returning the existing handle for duplicate
+// imports. This provides a consistent user experience where the same IPC
+// descriptor can be imported multiple times regardless of memory type.
 //
 // The cache key is the export_data bytes (CUmemPoolPtrExportData), not the
-// returned pointer, because we must check the cache BEFORE calling
-// cuMemPoolImportPointer (which fails with CUDA_ERROR_ALREADY_MAPPED if
-// the pointer is already imported).
+// returned pointer, because we must check before calling the driver API.
 
-// TODO: When driver fix is available, add version check here to bypass cache.
+// TODO: When driver fix for nvbug 5570902 is available, consider whether
+// the cache is still needed for memory pool allocations (it will still be
+// needed for pinned memory).
 static bool use_ipc_ptr_cache() {
     return true;
 }
@@ -750,7 +762,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
 
     if (use_ipc_ptr_cache()) {
-        // Check cache BEFORE calling cuMemPoolImportPointer
+        // Check cache before calling cuMemPoolImportPointer
         ExportDataKey key;
         std::memcpy(&key.data, data, sizeof(key.data));
 
@@ -776,11 +788,16 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
             [h_pool, key](DevicePtrBox* b) {
+                GILReleaseGuard gil;
                 {
                     std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-                    ipc_ptr_cache.erase(key);
+                    // Only erase if expired - avoids race where another thread
+                    // replaced the entry with a new import before we acquired the lock.
+                    auto it = ipc_ptr_cache.find(key);
+                    if (it != ipc_ptr_cache.end() && it->second.expired()) {
+                        ipc_ptr_cache.erase(it);
+                    }
                 }
-                GILReleaseGuard gil;
                 p_cuMemFreeAsync(b->resource, native(b->h_stream));
                 delete b;
             }

From 90ab0a59d8ba1de814492f2f05e187b142a18bf1 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 11:29:29 -0800
Subject: [PATCH 35/38] Refactor load_driver_api to use RAII GIL guard

Replace raw PyGILState_Ensure/Release calls with a simple GILGuard
class, eliminating manual release on each early return path.
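
The resulting pattern in load_driver_api() is roughly

    GILGuard gil;
    if (!gil.acquired()) {
        return false;
    }
    // ... every subsequent early return releases the GIL via ~GILGuard()

instead of pairing each early return with an explicit
PyGILState_Release(gstate).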
--- cuda_core/cuda/core/_cpp/resource_handles.cpp | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 88a3f1ff05..3d35b0b498 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -38,6 +38,30 @@ static inline bool py_is_finalizing() noexcept { #endif } +// Simple RAII guard to acquire the GIL. Used in load_driver_api. +class GILGuard { +public: + GILGuard() : acquired_(false) { + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + ~GILGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + bool acquired() const { return acquired_; } + GILGuard(const GILGuard&) = delete; + GILGuard& operator=(const GILGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -73,10 +97,6 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || py_is_finalizing()) { - return false; - } - struct CudaDriverApiV1 { std::uint32_t abi_version; std::uint32_t struct_size; @@ -115,21 +135,22 @@ static bool load_driver_api() noexcept { static constexpr const char* capsule_name = "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - PyGILState_STATE gstate = PyGILState_Ensure(); + GILGuard gil; + if (!gil.acquired()) { + return false; + } // `_resource_handles` is already loaded (it exports the handle API capsule), // so avoid import machinery and just grab the module object. 
PyObject* mod = PyImport_AddModule("cuda.core._resource_handles"); // borrowed if (!mod) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule"); // new ref if (!fn) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } @@ -137,7 +158,6 @@ static bool load_driver_api() noexcept { Py_DECREF(fn); if (!cap) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } @@ -146,18 +166,15 @@ static bool load_driver_api() noexcept { if (!api) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) { - PyGILState_Release(gstate); return false; } #define LOAD_ADDR(name) \ do { \ if (api->name == 0) { \ - PyGILState_Release(gstate); \ return false; \ } \ p_##name = reinterpret_cast(api->name); \ @@ -195,7 +212,6 @@ static bool load_driver_api() noexcept { #undef LOAD_ADDR - PyGILState_Release(gstate); return true; } From 675ca24071ef9ef9b62eb034d12223db0ef18df6 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 14:38:07 -0800 Subject: [PATCH 36/38] Add DESIGN.md and optimize GIL usage in resource handle wrappers and CUDA operations - Change resource handle wrapper functions from `except * nogil` to `noexcept nogil` to avoid GIL acquisition on every call - Add `_init_handles_table()` for consumers to initialize at module level - Move CUDA operations into nogil blocks: cuMemcpyAsync, deviceptr_alloc_*, create_event_handle_noctx - Add Buffer._clear() to properly reset the handle shared_ptr - Add DESIGN.md documenting the resource handles architecture --- cuda_core/cuda/core/_cpp/DESIGN.md | 286 ++++++++++++++++++ cuda_core/cuda/core/_device.pyx | 3 + cuda_core/cuda/core/_event.pyx | 3 + cuda_core/cuda/core/_memory/_buffer.pyx | 21 +- .../core/_memory/_graph_memory_resource.pyx | 6 +- cuda_core/cuda/core/_memory/_ipc.pyx | 3 + cuda_core/cuda/core/_memory/_memory_pool.pyx | 6 +- cuda_core/cuda/core/_memoryview.pyx | 5 +- cuda_core/cuda/core/_resource_handles.pxd | 118 ++++---- cuda_core/cuda/core/_stream.pyx | 5 +- .../memory_ipc/test_ipc_duplicate_import.py | 2 +- 11 files changed, 390 insertions(+), 68 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/DESIGN.md diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md new file mode 100644 index 0000000000..003dcfd945 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -0,0 +1,286 @@ +# Resource Handles Design + +This document describes the resource handle abstraction in cuda.core, which provides +robust lifetime management for CUDA resources. + +## Overview + +The cuda-core Python library provides a high-level interface to CUDA resources such as +Context, Device, Stream, and Event. These objects correspond to resources managed by +the CUDA Driver API, each having explicit creation and destruction routines. Several +of these CUDA resources also participate in non-trivial ownership hierarchies (e.g., +a stream belongs to a context), and releasing them may require additional arguments +or other resources (e.g., a device pointer freed through a specific stream). + +### Goals + +The goal of the handle abstraction is to provide a robust, explicit, and Python-agnostic +layer for ownership and lifetime management of CUDA resources. The intent is to use +handles as the backbone of the cuda-core resource hierarchy, enabling cuda-core Python +objects to manipulate handles rather than work directly with raw CUDA resources. 
+
+While Python-facing objects expose convenient APIs and additional behaviors, the handle
+layer isolates all concerns related to resource lifetime. By cleanly separating these
+responsibilities, we achieve:
+
+- **Clearer architecture** with minimal cross-layer coupling
+- **Safe transfer of resource ownership** between Python and other domains, including C++
+- **Ability to preserve resource validity** independent of Python
+- **Well-specified semantics** for immutability, ownership, and reachability
+- **Simplified reasoning about resource lifetimes**, especially with nested or dependent resources
+
+### Handle Semantics
+
+Resource handles provide **referentially transparent** wrappers around CUDA resources:
+
+- **No rebinding**: A handle always refers to the same resource.
+- **No invalidation**: If a handle exists, its resource is valid.
+- **Structural dependencies**: If resource A depends on resource B, A's handle
+  embeds B's handle, automatically extending B's lifetime.
+
+This eliminates global lifetime analysis. Correctness is enforced structurally: if you
+have a handle, you have a valid resource.
+
+## Handle Types
+
+All handles are `std::shared_ptr` aliases that expose only the raw CUDA resource:
+
+```cpp
+using ContextHandle = std::shared_ptr<const CUcontext>;
+using StreamHandle = std::shared_ptr<const CUstream>;
+using EventHandle = std::shared_ptr<const CUevent>;
+using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
+using DevicePtrHandle = std::shared_ptr<const CUdeviceptr>;
+```
+
+Internally, handles use **shared pointer aliasing**: the actual managed object is a
+"box" containing the resource, its dependencies, and any state needed for destruction.
+The public handle points only to the raw resource field, keeping the API minimal.
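+
+A minimal sketch of the box-and-alias pattern (illustrative only; the real boxes
+carry additional destruction state, and the helper name below is not part of the
+API):
+
+```cpp
+struct StreamBox {
+    CUstream resource;
+    ContextHandle h_context;  // structural dependency keeps the context alive
+};
+
+StreamHandle make_stream_handle_sketch(CUstream s, ContextHandle h_ctx) {
+    auto box = std::shared_ptr<StreamBox>(
+        new StreamBox{s, h_ctx},
+        [](StreamBox* b) {
+            // Destruction logic is captured here (e.g., cuStreamDestroy).
+            delete b;
+        });
+    // Aliasing constructor: share ownership of the box, expose only the resource.
+    return StreamHandle(box, &box->resource);
+}
+```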
+
+Example usage from Cython:
+
+```cython
+# Get raw handle for CUDA API calls
+cdef CUstream raw_stream = native(h_stream)  # cuda.bindings.cydriver.CUstream
+
+# Get as integer for other use cases
+return hash(intptr(h_stream))
+
+# Get Python wrapper for returning to user
+return py(h_stream)  # cuda.bindings.driver.CUstream
+```
+
+## Code Structure
+
+### Directory Layout
+
+```
+cuda/core/
+├── _resource_handles.pyx   # Cython module (compiles resource_handles.cpp)
+├── _resource_handles.pxd   # Cython declarations and dispatch wrappers
+└── _cpp/
+    ├── resource_handles.hpp          # C++ API declarations
+    ├── resource_handles.cpp          # C++ implementation
+    └── resource_handles_cxx_api.hpp  # Capsule struct definition
+```
+
+### Build Implications
+
+The `_cpp/` subdirectory contains C++ source files that are compiled into the
+`_resource_handles` extension module. Other Cython modules in cuda.core do **not**
+link against this code directly—they access it through a capsule mechanism
+(explained below).
+
+## Capsule Architecture
+
+The implementation uses **two separate capsule mechanisms** for different purposes:
+
+### Capsule 1: C++ API Table (`_CXX_API`)
+
+**Problem**: Cython extension modules compile independently. If multiple modules
+(`_memory.pyx`, `_ipc.pyx`, etc.) each linked `resource_handles.cpp`, they would
+each have their own copies of:
+
+- Static driver function pointers
+- Thread-local error state
+- Other static data, including global caches
+
+**Solution**: Only `_resource_handles.so` links the C++ code. It exports a capsule
+containing function pointers:
+
+```cpp
+struct ResourceHandlesCxxApiV1 {
+    uint32_t abi_version;
+    uint32_t struct_size;
+
+    // Thread-local error handling
+    CUresult (*get_last_error)() noexcept;
+    CUresult (*peek_last_error)() noexcept;
+    void (*clear_last_error)() noexcept;
+
+    // Handle creation functions
+    ContextHandle (*get_primary_context)(int device_id) noexcept;
+    StreamHandle (*create_stream_handle)(...) noexcept;
+    // ... etc
+};
+```
+
+Other Cython modules import this capsule at runtime and call through the function
+pointers. The `.pxd` file provides inline wrappers that hide this indirection:
+
+```cython
+cdef inline StreamHandle create_stream_handle(...) noexcept nogil:
+    return _handles_table.create_stream_handle(...)
+```
+
+Importing modules are expected to call `_init_handles_table()` (typically at module
+level) before calling any wrapper functions.
+
+### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`)
+
+**Problem**: cuda.core cannot directly call CUDA driver functions because:
+
+1. We don't want to link against `libcuda.so` at build time.
+2. The driver symbols must be resolved dynamically through cuda-bindings.
+
+**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver
+function pointers obtained from cuda-bindings:
+
+```cpp
+struct CudaDriverApiV1 {
+    uint32_t abi_version;
+    uint32_t struct_size;
+
+    uintptr_t cuDevicePrimaryCtxRetain;
+    uintptr_t cuDevicePrimaryCtxRelease;
+    uintptr_t cuStreamCreateWithPriority;
+    uintptr_t cuStreamDestroy;
+    // ... etc
+};
+```
+
+The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the
+function pointers for subsequent use.
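+
+Schematically, the consuming side looks like the following. This is a simplified
+sketch of the `DECLARE_DRIVER_FN`/`LOAD_ADDR` pattern used in `resource_handles.cpp`;
+the `destroy_stream` wrapper is hypothetical and shown only to illustrate dispatch:
+
+```cpp
+// One cached pointer per driver entry point, typed via decltype:
+using cuStreamDestroy_t = decltype(&cuStreamDestroy);
+static cuStreamDestroy_t p_cuStreamDestroy = nullptr;
+
+// After load_driver_api() copies the capsule's uintptr_t value into the
+// pointer, every driver call dispatches through the resolved symbol:
+static CUresult destroy_stream(CUstream s) noexcept {
+    return p_cuStreamDestroy(s);
+}
+```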
+
+### Why Two Capsules?
+
+| Capsule | Direction | Purpose |
+|---------|-----------|---------|
+| `_CXX_API` | C++ → Cython | Share handle functions across modules |
+| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols |
+
+## Key Implementation Details
+
+### Structural Dependencies
+
+When a resource depends on another, its handle embeds the dependency:
+
+```cpp
+struct StreamBox {
+    CUstream resource;
+    ContextHandle h_context;  // Keeps context alive
+};
+```
+
+The shared pointer's custom deleter captures any additional state needed for
+destruction. This ensures resources are always destroyed in the correct order.
+
+### GIL Management
+
+Handle destructors may run from any thread. The implementation includes RAII guards
+(`GILReleaseGuard`, `GILAcquireGuard`) that:
+
+- Release the GIL before calling CUDA APIs (for parallelism)
+- Handle Python finalization gracefully (avoid GIL operations during shutdown)
+- Ensure Python object manipulation happens with the GIL held
+
+The handle API functions are safe to call with or without the GIL held. They
+will release the GIL (if necessary) before calling CUDA driver API functions.
+
+### Error Handling
+
+Handle API functions do not raise Python exceptions. Instead, they return an empty
+handle (null `shared_ptr`) on failure and store the error code in thread-local state.
+Callers should check for failure and retrieve the error using `get_last_error()`:
+
+```cython
+cdef StreamHandle h = create_stream_handle(h_ctx, flags, priority)
+cdef CUresult err
+if not h:
+    # Handle creation failed - get the CUDA error code
+    err = get_last_error()
+    # ... handle error (e.g., raise Python exception)
+```
+
+This design allows handle functions to be called from `nogil` blocks without requiring
+GIL acquisition for exception handling on the success path. The error state is
+thread-local, so concurrent calls from different threads do not interfere.
+
+Related functions:
+- `get_last_error()`: Returns and clears the most recent error
+- `peek_last_error()`: Returns the error without clearing it
+- `clear_last_error()`: Clears the error state
+
+## Usage from Cython
+
+```cython
+from cuda.bindings cimport cydriver
+from cuda.core._resource_handles cimport (
+    StreamHandle,
+    create_stream_handle,
+    native,
+    intptr,
+    py,
+    get_last_error,
+    _init_handles_table,
+)
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+_init_handles_table()  # prerequisite before calling handle API functions
+
+# Create a stream
+cdef StreamHandle h_stream = create_stream_handle(h_ctx, flags, priority)
+if not h_stream:
+    HANDLE_RETURN(get_last_error())
+
+# Use in CUDA API
+cydriver.cuStreamSynchronize(native(h_stream))
+
+# Return to Python
+return py(h_stream)
+```
+
+## Summary
+
+The resource handle design:
+
+1. **Separates resource management** into its own layer, independent of Python objects.
+2. **Encodes lifetimes structurally** via embedded handle dependencies.
+3. **Uses capsules** to solve two distinct problems:
+   - Sharing C++ code across Cython modules without duplicate statics.
+   - Resolving CUDA driver symbols dynamically through cuda-bindings.
+4. **Provides overloaded accessors** (`native`, `intptr`, `py`) since handles cannot
+   have attributes without unnecessary Python object wrappers.
+
+This architecture ensures CUDA resources are managed correctly regardless of Python
+garbage collection timing, interpreter shutdown, or cross-language usage patterns.
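+
+## Appendix: Aliasing Sketch
+
+For concreteness, the box-plus-aliasing pattern behind points 1 and 2 of the Summary
+can be written as follows. This is a hedged sketch, not the library code: the real
+implementation routes `cuStreamDestroy` through the driver capsule and a GIL release
+guard, and the `wrap_stream` name is hypothetical.
+
+```cpp
+StreamHandle wrap_stream(CUstream s, ContextHandle h_ctx) {
+    // The box owns the raw stream plus its context dependency.
+    auto box = std::shared_ptr<StreamBox>(
+        new StreamBox{s, std::move(h_ctx)},
+        [](StreamBox* b) {
+            cuStreamDestroy(b->resource);  // context is still alive via b->h_context
+            delete b;                      // dropping the box releases the context ref
+        });
+    // Aliasing constructor: share ownership of the box, expose only the raw field.
+    return StreamHandle(box, &box->resource);
+}
+```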
diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index a8387273c7..014b7dae78 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -17,10 +17,13 @@ from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._resource_handles cimport ( ContextHandle, + _init_handles_table, create_context_handle_ref, get_primary_context, native, ) + +_init_handles_table() from cuda.core._graph import GraphBuilder from cuda.core._stream import IsStreamT, Stream, StreamOptions from cuda.core._utils.clear_error_support import assert_type diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index faff47bec9..1dec487665 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -11,12 +11,15 @@ from cuda.core._context cimport Context from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, + _init_handles_table, create_event_handle, create_event_handle_ipc, intptr, native, py, ) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index d7e6252ab4..3e2c4c4d05 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -16,11 +16,14 @@ from cuda.core._memory cimport _ipc from cuda.core._resource_handles cimport ( DevicePtrHandle, StreamHandle, + _init_handles_table, deviceptr_create_with_owner, intptr, native, set_deallocation_stream, ) + +_init_handles_table() from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -61,7 +64,7 @@ cdef class Buffer: self._clear() def _clear(self): - # _h_ptr is default-initialized (empty shared_ptr) by C++ + self._h_ptr.reset() # Release the handle self._size = 0 self._memory_resource = None self._ipc_data = None @@ -171,22 +174,23 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) + cdef Stream s = Stream_accept(stream) cdef size_t src_size = self._size if dst is None: if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._memory_resource.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, s) cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - err, = driver.cuMemcpyAsync(native(dst._h_ptr), native(self._h_ptr), src_size, stream.handle) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemcpyAsync( + native(dst._h_ptr), native(self._h_ptr), src_size, native(s._h_stream))) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -201,7 +205,7 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) + cdef Stream s = Stream_accept(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -209,8 +213,9 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - err, = driver.cuMemcpyAsync(native(self._h_ptr), native(src._h_ptr), dst_size, stream.handle) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemcpyAsync( + native(self._h_ptr), native(src._h_ptr), dst_size, native(s._h_stream))) def 
fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): """Fill this buffer with a repeating byte pattern. diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 3ad20fdabb..daa38a1216 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -10,9 +10,12 @@ from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core._resource_handles cimport ( DevicePtrHandle, + _init_handles_table, deviceptr_alloc_async, native, ) + +_init_handles_table() from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -192,9 +195,10 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_capturing(s) - cdef DevicePtrHandle h_ptr = deviceptr_alloc_async(size, stream._h_stream) + h_ptr = deviceptr_alloc_async(size, stream._h_stream) if not h_ptr: raise RuntimeError("Failed to allocate memory asynchronously") return Buffer_from_deviceptr_handle(h_ptr, size, self, None) diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx index 3c0eee3300..99608f55db 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -10,11 +10,14 @@ from cuda.core._memory._memory_pool cimport _MemPool from cuda.core._stream cimport Stream from cuda.core._resource_handles cimport ( DevicePtrHandle, + _init_handles_table, create_mempool_handle_ipc, deviceptr_import_ipc, get_last_error, native, ) + +_init_handles_table() from cuda.core._stream cimport default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import check_multiprocessing_start_method diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index db994c09b0..7a255ebb3d 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -16,6 +16,7 @@ from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._resource_handles cimport ( MemoryPoolHandle, DevicePtrHandle, + _init_handles_table, create_mempool_handle, create_mempool_handle_ref, get_device_mempool, @@ -23,6 +24,8 @@ from cuda.core._resource_handles cimport ( native, py, ) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -424,9 +427,10 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_not_capturing(s) - cdef DevicePtrHandle h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) if not h_ptr: raise RuntimeError("Failed to allocate memory from pool") return Buffer_from_deviceptr_handle(h_ptr, size, self, None) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 082fc7b130..41321c8722 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -16,9 +16,12 @@ import numpy 
from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( EventHandle, + _init_handles_table, create_event_handle_noctx, native, ) + +_init_handles_table() from cuda.core._utils.cuda_utils import handle_return, driver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -608,8 +611,8 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) HANDLE_RETURN(cydriver.cuEventRecord( native(h_event), producer_s)) HANDLE_RETURN(cydriver.cuStreamWaitEvent( diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 06d41bf170..801d354958 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -121,118 +121,126 @@ cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogi # ----------------------------------------------------------------------------- -# Dispatch wrappers (hide capsule init from consumers) +# Dispatch wrappers +# +# These wrappers assume _handles_table has been initialized. Consumers must call +# _init_handles_table() at module level before using these functions in nogil blocks. # ----------------------------------------------------------------------------- -cdef inline cydriver.CUresult get_last_error() except * nogil: - return _get_handles_table().get_last_error() +cdef inline void _init_handles_table() except *: + """Initialize the handles table. Call at module level before using wrappers.""" + _get_handles_table() + + +cdef inline cydriver.CUresult get_last_error() noexcept nogil: + return _handles_table.get_last_error() -cdef inline cydriver.CUresult peek_last_error() except * nogil: - return _get_handles_table().peek_last_error() +cdef inline cydriver.CUresult peek_last_error() noexcept nogil: + return _handles_table.peek_last_error() -cdef inline void clear_last_error() except * nogil: - _get_handles_table().clear_last_error() +cdef inline void clear_last_error() noexcept nogil: + _handles_table.clear_last_error() -cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except * nogil: - return _get_handles_table().create_context_handle_ref(ctx) +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil: + return _handles_table.create_context_handle_ref(ctx) -cdef inline ContextHandle get_primary_context(int device_id) except * nogil: - return _get_handles_table().get_primary_context(device_id) +cdef inline ContextHandle get_primary_context(int device_id) noexcept nogil: + return _handles_table.get_primary_context(device_id) -cdef inline ContextHandle get_current_context() except * nogil: - return _get_handles_table().get_current_context() +cdef inline ContextHandle get_current_context() noexcept nogil: + return _handles_table.get_current_context() -cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) except * nogil: - return _get_handles_table().create_stream_handle(h_ctx, flags, priority) +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil: + return _handles_table.create_stream_handle(h_ctx, flags, priority) -cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) except * nogil: - 
return _get_handles_table().create_stream_handle_ref(stream) +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil: + return _handles_table.create_stream_handle_ref(stream) -cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) except *: - return _get_handles_table().create_stream_handle_with_owner(stream, owner) +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner): + return _handles_table.create_stream_handle_with_owner(stream, owner) -cdef inline StreamHandle get_legacy_stream() except * nogil: - return _get_handles_table().get_legacy_stream() +cdef inline StreamHandle get_legacy_stream() noexcept nogil: + return _handles_table.get_legacy_stream() -cdef inline StreamHandle get_per_thread_stream() except * nogil: - return _get_handles_table().get_per_thread_stream() +cdef inline StreamHandle get_per_thread_stream() noexcept nogil: + return _handles_table.get_per_thread_stream() -cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) except * nogil: - return _get_handles_table().create_event_handle(h_ctx, flags) +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle(h_ctx, flags) -cdef inline EventHandle create_event_handle_noctx(unsigned int flags) except * nogil: - return _get_handles_table().create_event_handle_noctx(flags) +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle_noctx(flags) -cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) except * nogil: - return _get_handles_table().create_event_handle_ipc(ipc_handle) +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil: + return _handles_table.create_event_handle_ipc(ipc_handle) -cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) except * nogil: - return _get_handles_table().create_mempool_handle(props) +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) noexcept nogil: + return _handles_table.create_mempool_handle(props) -cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) except * nogil: - return _get_handles_table().create_mempool_handle_ref(pool) +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil: + return _handles_table.create_mempool_handle_ref(pool) -cdef inline MemoryPoolHandle get_device_mempool(int device_id) except * nogil: - return _get_handles_table().get_device_mempool(device_id) +cdef inline MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil: + return _handles_table.get_device_mempool(device_id) -cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) except * nogil: - return _get_handles_table().create_mempool_handle_ipc(fd, handle_type) +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil: + return _handles_table.create_mempool_handle_ipc(fd, handle_type) cdef inline DevicePtrHandle deviceptr_alloc_from_pool( size_t size, MemoryPoolHandle h_pool, - StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_alloc_from_pool(size, h_pool, h_stream) + StreamHandle h_stream) noexcept nogil: + 
return _handles_table.deviceptr_alloc_from_pool(size, h_pool, h_stream) -cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_alloc_async(size, h_stream) +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_async(size, h_stream) -cdef inline DevicePtrHandle deviceptr_alloc(size_t size) except * nogil: - return _get_handles_table().deviceptr_alloc(size) +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc(size) -cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) except * nogil: - return _get_handles_table().deviceptr_alloc_host(size) +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc_host(size) -cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except * nogil: - return _get_handles_table().deviceptr_create_ref(ptr) +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil: + return _handles_table.deviceptr_create_ref(ptr) -cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except *: - return _get_handles_table().deviceptr_create_with_owner(ptr, owner) +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner): + return _handles_table.deviceptr_create_with_owner(ptr, owner) cdef inline DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_import_ipc(h_pool, export_data, h_stream) + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_import_ipc(h_pool, export_data, h_stream) -cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) except * nogil: - return _get_handles_table().deallocation_stream(h) +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil: + return _handles_table.deallocation_stream(h) -cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) except * nogil: - _get_handles_table().set_deallocation_stream(h, h_stream) +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil: + _handles_table.set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 4626fbf109..aecf24b06e 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -29,6 +29,7 @@ from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, StreamHandle, + _init_handles_table, create_context_handle_ref, create_event_handle_noctx, create_stream_handle, @@ -40,6 +41,8 @@ from cuda.core._resource_handles cimport ( native, py, ) + +_init_handles_table() from cuda.core._graph import GraphBuilder @@ -303,8 +306,8 @@ cdef class Stream: ) from e # Wait on stream via temporary event - h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? 
HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 096b3a2abd..ca4ecc0749 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -13,7 +13,7 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Buffer, Device +from cuda.core import Buffer, Device from helpers.logging import TimestampedLogger CHILD_TIMEOUT_SEC = 20 From ccf9a3b217242773b9e9996a6ca1825b7e1140dc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 15:23:48 -0800 Subject: [PATCH 37/38] linter fix --- cuda_core/cuda/core/_memory/_buffer.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 3e2c4c4d05..32fe28bab4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -25,10 +25,7 @@ from cuda.core._resource_handles cimport ( _init_handles_table() from cuda.core._stream cimport Stream_accept, Stream -from cuda.core._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, - HANDLE_RETURN, -) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import sys from typing import TypeVar, Union From 6c82cb65feece3b09d3c09e17faa585f98e8c734 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 16:30:05 -0800 Subject: [PATCH 38/38] Consolidate GIL helper classes at top of resource_handles.cpp Move GILReleaseGuard and GILAcquireGuard to the top of the file before first use, and remove redundant GILGuard class that duplicated GILAcquireGuard functionality. --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 127 +++++++----------- 1 file changed, 52 insertions(+), 75 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 3d35b0b498..5ffc84145c 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -38,30 +38,75 @@ static inline bool py_is_finalizing() noexcept { #endif } -// Simple RAII guard to acquire the GIL. Used in load_driver_api. -class GILGuard { +// ============================================================================ +// GIL management helpers +// ============================================================================ + +// Helper to release the GIL while calling into the CUDA driver. +// This guard is *conditional*: if the caller already dropped the GIL, +// we avoid calling PyEval_SaveThread (which requires holding the GIL). +// It also handles the case where Python is finalizing and GIL operations +// are no longer safe. +class GILReleaseGuard { +public: + GILReleaseGuard() : tstate_(nullptr), released_(false) { + // Don't try to manipulate GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + // PyGILState_Check() returns 1 if the GIL is held by this thread. 
+ if (PyGILState_Check()) { + tstate_ = PyEval_SaveThread(); + released_ = true; + } + } + + ~GILReleaseGuard() { + if (released_) { + PyEval_RestoreThread(tstate_); + } + } + + // Non-copyable, non-movable + GILReleaseGuard(const GILReleaseGuard&) = delete; + GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; + +private: + PyThreadState* tstate_; + bool released_; +}; + +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +class GILAcquireGuard { public: - GILGuard() : acquired_(false) { + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing if (!Py_IsInitialized() || py_is_finalizing()) { return; } gstate_ = PyGILState_Ensure(); acquired_ = true; } - ~GILGuard() { + + ~GILAcquireGuard() { if (acquired_) { PyGILState_Release(gstate_); } } + bool acquired() const { return acquired_; } - GILGuard(const GILGuard&) = delete; - GILGuard& operator=(const GILGuard&) = delete; + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; private: PyGILState_STATE gstate_; bool acquired_; }; + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -135,7 +180,7 @@ static bool load_driver_api() noexcept { static constexpr const char* capsule_name = "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - GILGuard gil; + GILAcquireGuard gil; if (!gil.acquired()) { return false; } @@ -243,74 +288,6 @@ void clear_last_error() noexcept { err = CUDA_SUCCESS; } -// ============================================================================ -// GIL management helpers -// ============================================================================ - -// Helper to release the GIL while calling into the CUDA driver. -// This guard is *conditional*: if the caller already dropped the GIL, -// we avoid calling PyEval_SaveThread (which requires holding the GIL). -// It also handles the case where Python is finalizing and GIL operations -// are no longer safe. -class GILReleaseGuard { -public: - GILReleaseGuard() : tstate_(nullptr), released_(false) { - // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || py_is_finalizing()) { - return; - } - // PyGILState_Check() returns 1 if the GIL is held by this thread. - if (PyGILState_Check()) { - tstate_ = PyEval_SaveThread(); - released_ = true; - } - } - - ~GILReleaseGuard() { - if (released_) { - PyEval_RestoreThread(tstate_); - } - } - - // Non-copyable, non-movable - GILReleaseGuard(const GILReleaseGuard&) = delete; - GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; - -private: - PyThreadState* tstate_; - bool released_; -}; - -// Helper to acquire the GIL when we might not hold it. -// Use in C++ destructors that need to manipulate Python objects. 
-class GILAcquireGuard { -public: - GILAcquireGuard() : acquired_(false) { - // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || py_is_finalizing()) { - return; - } - gstate_ = PyGILState_Ensure(); - acquired_ = true; - } - - ~GILAcquireGuard() { - if (acquired_) { - PyGILState_Release(gstate_); - } - } - - bool acquired() const { return acquired_; } - - // Non-copyable, non-movable - GILAcquireGuard(const GILAcquireGuard&) = delete; - GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; - -private: - PyGILState_STATE gstate_; - bool acquired_; -}; - // ============================================================================ // Context Handles // ============================================================================