From 20d33da87641d6463711f86ef39e98f0f4623f51 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 5 Dec 2025 15:22:00 -0800 Subject: [PATCH 01/38] Centralize context management code - Create _context.pxd with Context class declaration - Move get_primary_context from _device.pyx to _context.pyx - Add Cython-level context functions: get_current_context, set_current_context, get_stream_context - Update _device.pyx to use context module functions - Update _stream.pyx to use context module functions - Remove push_context and pop_context (unused, replaced with direct CUDA API calls) - Reorganize _context.pyx according to style guide (principal class first) --- cuda_core/cuda/core/experimental/_context.pxd | 22 ++++ cuda_core/cuda/core/experimental/_context.pyx | 109 ++++++++++++++++-- cuda_core/cuda/core/experimental/_device.pyx | 64 +++++----- cuda_core/cuda/core/experimental/_stream.pyx | 21 ++-- 4 files changed, 163 insertions(+), 53 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_context.pxd diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd new file mode 100644 index 0000000000..0e0df83831 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + +cdef class Context: + """Cython declaration for Context class. + + This class provides access to CUDA contexts. Context objects cannot be + instantiated directly - use factory methods or Device/Stream APIs. + """ + + cdef: + readonly object _handle + int _device_id + +# Cython-level context operations +cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL +cdef cydriver.CUcontext get_current_context() except?NULL nogil +cdef void set_current_context(cydriver.CUcontext ctx) except * +cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index f9858c1710..244109584d 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,19 +4,23 @@ from dataclasses import dataclass -from cuda.core.experimental._utils.cuda_utils import driver +import threading +from libc.stdint cimport uintptr_t +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils import driver, CUDAError +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN -@dataclass -class ContextOptions: - pass # TODO + +__all__ = ['Context', 'ContextOptions'] cdef class Context: + """CUDA context wrapper. - cdef: - readonly object _handle - int _device_id + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. + """ def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") @@ -36,3 +40,94 @@ cdef class Context: def __hash__(self) -> int: return hash(int(self._handle)) + + +@dataclass +class ContextOptions: + """Options for context creation. + + Currently unused, reserved for future use. + """ + pass # TODO + + +cdef cydriver.CUcontext get_current_context() except?NULL nogil: + """Get the current CUDA context. 
+
+    Returns
+    -------
+    CUcontext
+        Current context handle, or NULL if no context is bound
+    """
+    cdef cydriver.CUcontext ctx = NULL
+    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    return ctx
+
+
+cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
+    """Get the primary context for a device.
+
+    Uses thread-local storage to cache primary contexts per device.
+    The primary context is lazily initialized on first access.
+
+    Parameters
+    ----------
+    dev_id : int
+        Device ID
+
+    Returns
+    -------
+    CUcontext
+        Primary context handle for the device, or NULL on error
+    """
+    cdef int total = 0
+    cdef cydriver.CUcontext ctx
+
+    try:
+        primary_ctxs = _tls.primary_ctxs
+    except AttributeError:
+        # Initialize primary context cache
+        with nogil:
+            HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
+        primary_ctxs = _tls.primary_ctxs = [0] * total
+
+    ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
+    if ctx == NULL:
+        with nogil:
+            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
+        primary_ctxs[dev_id] = <uintptr_t>(ctx)
+    return ctx
+
+
+cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil:
+    """Get the context associated with a stream.
+
+    Parameters
+    ----------
+    stream : CUstream
+        Stream handle
+
+    Returns
+    -------
+    CUcontext
+        Context handle associated with the stream, or NULL on error
+    """
+    cdef cydriver.CUcontext ctx = NULL
+    HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx))
+    return ctx
+
+
+cdef void set_current_context(cydriver.CUcontext ctx) except *:
+    """Set the current CUDA context.
+
+    Parameters
+    ----------
+    ctx : CUcontext
+        Context handle to set as current
+    """
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxSetCurrent(ctx))
+
+
+# Thread-local storage for primary context cache
+_tls = threading.local()
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index cd802943a5..c6efa21ac7 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -11,7 +11,13 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 import threading
 from typing import Optional, TYPE_CHECKING, Union
 
-from cuda.core.experimental._context import Context, ContextOptions
+from cuda.core.experimental._context cimport (
+    Context,
+    get_primary_context,
+    get_current_context,
+    set_current_context,
+)
+from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
@@ -908,20 +914,6 @@ cdef class DeviceProperties:
     )
 
 
-cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL:
-    try:
-        primary_ctxs = _tls.primary_ctxs
-    except AttributeError:
-        total = len(_tls.devices)
-        primary_ctxs = _tls.primary_ctxs = [0] * total
-    cdef cydriver.CUcontext ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
-    if ctx == NULL:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
-        primary_ctxs[dev_id] = <uintptr_t>(ctx)
-    return ctx
-
-
 class Device:
     """Represent a GPU and act as an entry point for cuda.core features.
 
@@ -973,8 +965,7 @@ class Device:
             if err == cydriver.CUresult.CUDA_SUCCESS:
                 device_id = int(dev)
             elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT:
-                with nogil:
-                    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+                ctx = get_current_context()
                 assert <void*>(ctx) == NULL
                 device_id = 0  # cudart behavior
             else:
@@ -1010,19 +1001,6 @@ class Device:
                 f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
             )
 
-    def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext:
-        cdef cydriver.CUcontext ctx
-        cdef cydriver.CUdevice dev
-        cdef cydriver.CUdevice this_dev = self._id
-        with nogil:
-            HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
-        if check_consistency:
-            HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-            if dev != this_dev:
-                raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return driver.CUcontext(<uintptr_t>(ctx))
 
     @property
     def device_id(self) -> int:
@@ -1136,8 +1114,16 @@ class Device:
         """
         self._check_context_initialized()
-        ctx = self._get_current_context(check_consistency=True)
-        return Context._from_ctx(ctx, self._id)
+        cdef cydriver.CUcontext ctx
+        cdef cydriver.CUdevice dev
+        with nogil:
+            ctx = get_current_context()
+            HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
+        if ctx == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        if dev != self._id:
+            raise CUDAError("Internal error (current device is not equal to Device.device_id)")
+        return Context._from_ctx(<uintptr_t>(ctx), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1241,6 +1227,7 @@ class Device:
             )
             # prev_ctx is the previous context
             curr_ctx = <cydriver.CUcontext><uintptr_t>(ctx._handle)
+            prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
@@ -1249,9 +1236,8 @@ class Device:
             return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
         else:
             # use primary ctx
-            curr_ctx = _get_primary_context(self._id)
-            with nogil:
-                HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx))
+            curr_ctx = get_primary_context(self._id)
+            set_current_context(curr_ctx)
         self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
@@ -1324,8 +1310,12 @@ class Device:
         """
         self._check_context_initialized()
-        ctx = self._get_current_context()
-        return Event._init(self._id, ctx, options, True)
+        cdef cydriver.CUcontext ctx
+        with nogil:
+            ctx = get_current_context()
+        if ctx == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        return Event._init(self._id, <uintptr_t>(ctx), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
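[Note] The user-visible flow backed by these helpers is unchanged by this commit; a minimal
sketch of how the refactored entry points are exercised (assumes one CUDA-capable device;
`Device`, `set_current()`, and the `context` property are exactly as in the hunks above):

    from cuda.core.experimental import Device

    dev = Device(0)
    dev.set_current()           # binds the primary context via get_primary_context()
    ctx = dev.context           # wraps the current CUcontext in a Context object
    assert ctx == dev.context   # Context.__eq__ compares the underlying handle values
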
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index 87ec4a691a..440130f679 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -25,7 +25,11 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union
 if TYPE_CHECKING:
     import cuda.bindings
     from cuda.core.experimental._device import Device
-from cuda.core.experimental._context import Context
+from cuda.core.experimental._context cimport (
+    Context,
+    get_stream_context,
+    get_current_context,
+)
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
@@ -317,19 +321,18 @@ cdef class Stream:
 
     cdef int _get_context(self) except?-1 nogil:
         if self._ctx_handle == CU_CONTEXT_INVALID:
-            HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle)))
+            self._ctx_handle = get_stream_context(self._handle)
         return 0
 
     cdef int _get_device_and_context(self) except?-1:
         cdef cydriver.CUcontext curr_ctx
         if self._device_id == cydriver.CU_DEVICE_INVALID:
-            with nogil:
-                # Get the current context
-                HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx))
-                # Get the stream's context (self.ctx_handle is populated)
-                self._get_context()
-                # Get the stream's device (may require a context-switching dance)
-                self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
+            # Get the current context
+            curr_ctx = get_current_context()
+            # Get the stream's context (self.ctx_handle is populated)
+            self._get_context()
+            # Get the stream's device (may require a context-switching dance)
+            self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
         return 0
 
     @property
From 758f9f9aede579d59d93f74b069083c21f58244a Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Sat, 6 Dec 2025 08:13:17 -0800
Subject: [PATCH 02/38] Integrate resource handles into Context class

- Replace Context._handle (object) with ContextHandle (shared_ptr) resource
  handle
- Add handle property to Context returning driver.CUcontext
- Update Context._from_ctx to create ContextHandle using create_context_handle_ref()
- Update Context.__eq__ to compare actual CUcontext values (not shared_ptr
  addresses)
- Update Context.__hash__ to include type(self) and handle value with NULL
  safety
- Update _device.pyx to use ctx._resource_handle.get()[0] for direct access
- Update _graph.py to use context.handle property
- Add _cpp/resource_handles.{hpp,cpp} with the C++ handle implementation,
  using a default deleter (simplifies code)
- Update .gitignore to allow *_impl.cpp files
- Fix all test files to use context.handle instead of context._handle
---
 .gitattributes                                |  3 +
 .gitignore                                    |  1 +
 cuda_core/build_hooks.py                      | 27 ++++++++-
 cuda_core/cuda/core/experimental/_context.pxd |  3 +-
 cuda_core/cuda/core/experimental/_context.pyx | 28 +++++++--
 .../experimental/_cpp/resource_handles.cpp    | 58 +++++++++++++++++++
 .../experimental/_cpp/resource_handles.hpp    | 30 ++++++++++
 cuda_core/cuda/core/experimental/_device.pyx  |  2 +-
 cuda_core/cuda/core/experimental/_graph.py    |  2 +-
 .../core/experimental/_resource_handles.pxd   | 15 +++++
 .../core/experimental/_resource_handles.pyx   |  6 ++
 cuda_core/tests/test_comparable.py            |  2 +-
 cuda_core/tests/test_hashable.py              |  4 +-
 cuda_core/tests/test_stream.py                |  2 +-
 14 files changed, 170 insertions(+), 13 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
 create mode 100644 
cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp create mode 100644 cuda_core/cuda/core/experimental/_resource_handles.pxd create mode 100644 cuda_core/cuda/core/experimental/_resource_handles.pyx diff --git a/.gitattributes b/.gitattributes index 6a3ee0fe72..68492b15c9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,9 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# Exception: headers we own (cuda_core C++ implementation) +cuda_core/cuda/core/experimental/_cpp/*.h -binary text diff +cuda_core/cuda/core/experimental/_cpp/*.hpp -binary text diff # git should not convert line endings in PNG files *.png binary *.svg binary diff --git a/.gitignore b/.gitignore index 1455b1dfc2..685fa231f8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__/ .pytest_cache/ .benchmarks/ *.cpp +!*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp cache_driver diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e38f5676df..7ebb67cef0 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -84,11 +84,34 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + def get_sources(mod_name): + """Get source files for a module, including any .cpp files.""" + sources = [f"cuda/core/experimental/{mod_name}.pyx"] + + # Add module-specific .cpp file from _cpp/ directory if it exists + cpp_file = f"cuda/core/experimental/_cpp/{mod_name.lstrip('_')}.cpp" + if os.path.exists(cpp_file): + sources.append(cpp_file) + + # Modules that use resource handles need to link against _resource_handles_impl.cpp + # This includes _context, _stream, _event, etc. as they adopt handle-based management + resource_handle_users = {"_context", "_stream", "_event"} + if mod_name in resource_handle_users: + resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" + if os.path.exists(resource_handles_impl): + sources.append(resource_handles_impl) + + return sources + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + sources=get_sources(mod), + include_dirs=[ + "cuda/core/experimental/include", + "cuda/core/experimental/_cpp", + ] + + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", ) for mod in module_names diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 0e0df83831..875d95d283 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Context: """Cython declaration for Context class. 
@@ -12,7 +13,7 @@ cdef class Context:
     """
 
     cdef:
-        readonly object _handle
+        ContextHandle _resource_handle
         int _device_id
 
 # Cython-level context operations
diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 244109584d..81daa2aa09 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -8,7 +8,8 @@ import threading
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._utils.cuda_utils import driver, CUDAError
+from cuda.core.experimental._resource_handles cimport create_context_handle_ref
+from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 
@@ -28,18 +29,37 @@ cdef class Context:
     @classmethod
     def _from_ctx(cls, handle: driver.CUcontext, int device_id):
         cdef Context ctx = Context.__new__(Context)
-        ctx._handle = handle
+        # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle
+        cdef cydriver.CUcontext c_ctx = <cydriver.CUcontext><uintptr_t>int(handle)
+        ctx._resource_handle = create_context_handle_ref(c_ctx)
         ctx._device_id = device_id
         return ctx
 
+    @property
+    def handle(self):
+        """Return the underlying CUcontext handle."""
+        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        if ptr != NULL:
+            return driver.CUcontext(<uintptr_t>(ptr[0]))
+        return None
+
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
-        return int(self._handle) == int(_other._handle)
+        # Compare the actual CUcontext values, not the shared_ptr objects
+        # (aliasing constructor creates different addresses even for same CUcontext)
+        cdef const cydriver.CUcontext* ptr1 = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr2 = _other._resource_handle.get()
+        if ptr1 == NULL or ptr2 == NULL:
+            return ptr1 == ptr2
+        return ptr1[0] == ptr2[0]
 
     def __hash__(self) -> int:
-        return hash(int(self._handle))
+        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        if ptr == NULL:
+            return hash((type(self), 0))
+        return hash((type(self), <uintptr_t>(ptr[0])))
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
new file mode 100644
index 0000000000..93a31551e0
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "resource_handles.hpp"
+#include <memory>
+
+namespace cuda_core {
+
+// ============================================================================
+// Context Handles
+// ============================================================================
+
+ContextHandle create_context_handle_ref(CUcontext ctx) {
+    // Creates a non-owning handle that references an existing context
+    // (e.g., primary context managed by CUDA driver)
+
+    // Allocate the box containing the context resource
+    ContextBox* box = new ContextBox();
+    box->resource = ctx;
+
+    // Use default deleter - it will delete the box, but not touch the CUcontext
+    // CUcontext lifetime is managed externally (e.g., by CUDA driver)
+    std::shared_ptr<ContextBox> box_ptr(box);
+
+    // Use aliasing constructor to create handle that exposes only CUcontext
+    // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
+    return ContextHandle(box_ptr, &box_ptr->resource);
+}
+
+// TODO: Future owning handle for cuCtxCreate/cuCtxDestroy
+// ContextHandle create_context_handle(CUdevice dev, unsigned int flags) { ... }
+
+// ============================================================================
+// Stream Handles
+// ============================================================================
+
+// TODO: Implement StreamHandle create_stream_handle(...) when Stream gets handle support
+
+// ============================================================================
+// Event Handles
+// ============================================================================
+
+// TODO: Implement EventHandle create_event_handle(...) when Event gets handle support
+
+// ============================================================================
+// Device Pointer Handles
+// ============================================================================
+
+// TODO: Implement DevicePtrHandle create_deviceptr_handle(...) when DevicePtr gets handle support
+
+// ============================================================================
+// Memory Pool Handles
+// ============================================================================
+
+// TODO: Implement MemPoolHandle create_mempool_handle(...) when MemPool gets handle support
+
+} // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
new file mode 100644
index 0000000000..949d9f1289
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cuda.h>
+#include <memory>
+
+namespace cuda_core {
+
+// Forward declarations
+struct ContextBox;
+
+// Handle type aliases - expose only the raw CUDA resource
+using ContextHandle = std::shared_ptr<const CUcontext>;
+
+// Internal box structure for Context
+// This holds the resource and any dependencies needed for lifetime management
+struct ContextBox {
+    CUcontext resource;
+    // Context doesn't depend on other CUDA resources, but we keep the structure
+    // extensible for future needs
+};
+
+// Function to create a non-owning context handle (references existing context)
+// This will be implemented in the .cpp file
+ContextHandle create_context_handle_ref(CUcontext ctx);
+
+} // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index c6efa21ac7..ceea0c6a5f 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -1226,7 +1226,7 @@ class Device:
                     f" id={ctx._id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = <cydriver.CUcontext><uintptr_t>(ctx._handle)
+            curr_ctx = ctx._resource_handle.get()[0]
             prev_ctx = NULL
diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/experimental/_graph.py
index a82bd70f55..5dbcc80c04 100644
--- a/cuda_core/cuda/core/experimental/_graph.py
+++ b/cuda_core/cuda/core/experimental/_graph.py
@@ -453,7 +453,7 @@ def __cuda_stream__(self) -> tuple[int, int]:
         return self.stream.__cuda_stream__()
 
     def _get_conditional_context(self) -> driver.CUcontext:
-        return self._mnff.stream.context._handle
+        return self._mnff.stream.context.handle
 
     def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditionalHandle:
         """Creates a conditional handle for the graph builder.
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
new file mode 100644
index 0000000000..8ada7d8cd5
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libcpp.memory cimport shared_ptr
+
+from cuda.bindings cimport cydriver
+
+# Declare the C++ namespace and types
+cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
+    # Handle type - shared_ptr to const CUcontext
+    ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle
+
+    # Function to create a non-owning context handle (references existing context)
+    ContextHandle create_context_handle_ref(cydriver.CUcontext ctx)
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
new file mode 100644
index 0000000000..564f2abac3
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module exists to compile _cpp/resource_handles.cpp into a shared library.
+# The C++ code provides handle management for CUDA contexts and other resources. 
diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..72b3caa2ba 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -136,7 +136,7 @@ class MyContext(Context): context = stream.context # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) + my_context = MyContext._from_ctx(context.handle, device.device_id) assert type(my_context) is Context, "_from_ctx returns Context, not subclass" assert type(my_context) is not MyContext diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..751a88250c 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -174,7 +174,7 @@ class MyContext(Context): context = stream.context # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) + my_context = MyContext._from_ctx(context.handle, device.device_id) assert type(my_context) is Context, "_from_ctx returns Context type" # Same handle -> same hash @@ -221,7 +221,7 @@ class MyContext(Context): # Test Context: always returns base type from _from_ctx ctx = device.context - my_ctx = MyContext._from_ctx(ctx._handle, device.device_id) + my_ctx = MyContext._from_ctx(ctx.handle, device.device_id) assert ctx == my_ctx, "Equal contexts with same handle" assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..ef83c09d05 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -74,7 +74,7 @@ def test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None - assert context._handle is not None + assert context.handle is not None def test_stream_from_foreign_stream(init_cuda): From 31880d4e973b0a7a254b1289eda178f6e7b85f0e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 11:41:18 -0800 Subject: [PATCH 03/38] Refactor context helpers to use ContextHandle and TLS cache - switch context helper APIs to return ContextHandle instead of raw CUcontext - add TLS wrapper for primary context caching using handles - update device/stream code to consume ContextHandle-based helpers - expose create_context_handle_ref as nogil-safe in the pxd --- cuda_core/cuda/core/experimental/_context.pxd | 12 +-- cuda_core/cuda/core/experimental/_context.pyx | 93 ++++++++++++------- cuda_core/cuda/core/experimental/_device.pyx | 41 ++++---- .../core/experimental/_resource_handles.pxd | 3 +- cuda_core/cuda/core/experimental/_stream.pyx | 13 ++- 5 files changed, 100 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 875d95d283..d4dfa08085 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -13,11 +13,11 @@ cdef class Context: """ cdef: - ContextHandle _resource_handle + ContextHandle _h_context int _device_id -# Cython-level context operations -cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL -cdef cydriver.CUcontext get_current_context() except?NULL nogil -cdef void set_current_context(cydriver.CUcontext ctx) except * -cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil +# Cython-level context operations 
(handle-centric API)
-cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL
-cdef cydriver.CUcontext get_current_context() except?NULL nogil
-cdef void set_current_context(cydriver.CUcontext ctx) except *
-cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil
+cdef ContextHandle get_primary_context(int dev_id) except *
+cdef ContextHandle get_current_context() except * nogil
+cdef void set_current_context(ContextHandle h_context) except * nogil
+cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil
diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 81daa2aa09..70870d3b1c 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -16,6 +16,21 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 __all__ = ['Context', 'ContextOptions']
 
+# Lightweight Python wrapper for ContextHandle (for caching in TLS)
+cdef class _ContextHandleWrapper:
+    """Internal wrapper to store ContextHandle in Python containers."""
+    cdef ContextHandle h_context
+
+    def __cinit__(self):
+        pass
+
+    @staticmethod
+    cdef _ContextHandleWrapper create(ContextHandle h_context):
+        cdef _ContextHandleWrapper wrapper = _ContextHandleWrapper.__new__(_ContextHandleWrapper)
+        wrapper.h_context = h_context
+        return wrapper
+
+
 cdef class Context:
     """CUDA context wrapper.
 
@@ -31,14 +46,14 @@ cdef class Context:
         cdef Context ctx = Context.__new__(Context)
         # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle
         cdef cydriver.CUcontext c_ctx = <cydriver.CUcontext><uintptr_t>int(handle)
-        ctx._resource_handle = create_context_handle_ref(c_ctx)
+        ctx._h_context = create_context_handle_ref(c_ctx)
         ctx._device_id = device_id
         return ctx
 
     @property
     def handle(self):
         """Return the underlying CUcontext handle."""
-        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr = self._h_context.get()
         if ptr != NULL:
             return driver.CUcontext(<uintptr_t>(ptr[0]))
         return None
@@ -49,14 +64,14 @@ cdef class Context:
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
         # Compare the actual CUcontext values, not the shared_ptr objects
         # (aliasing constructor creates different addresses even for same CUcontext)
-        cdef const cydriver.CUcontext* ptr1 = self._resource_handle.get()
-        cdef const cydriver.CUcontext* ptr2 = _other._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr1 = self._h_context.get()
+        cdef const cydriver.CUcontext* ptr2 = _other._h_context.get()
         if ptr1 == NULL or ptr2 == NULL:
             return ptr1 == ptr2
         return ptr1[0] == ptr2[0]
 
     def __hash__(self) -> int:
-        cdef const cydriver.CUcontext* ptr = self._resource_handle.get()
+        cdef const cydriver.CUcontext* ptr = self._h_context.get()
         if ptr == NULL:
             return hash((type(self), 0))
         return hash((type(self), <uintptr_t>(ptr[0])))
@@ -71,23 +86,25 @@ class ContextOptions:
     pass  # TODO
 
 
-cdef cydriver.CUcontext get_current_context() except?NULL nogil:
-    """Get the current CUDA context.
+cdef ContextHandle get_current_context() except * nogil:
+    """Get handle to the current CUDA context.
 
     Returns
     -------
-    CUcontext
-        Current context handle, or NULL if no context is bound
+    ContextHandle
+        Handle to current context, or empty handle if no context is bound
     """
     cdef cydriver.CUcontext ctx = NULL
     HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    return ctx
+    if ctx == NULL:
+        return ContextHandle()
+    return create_context_handle_ref(ctx)
 
 
-cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
-    """Get the primary context for a device.
+cdef ContextHandle get_primary_context(int dev_id) except *:
+    """Get handle to the primary context for a device.
 
-    Uses thread-local storage to cache primary contexts per device.
+    Uses thread-local storage to cache primary context handles per device.
     The primary context is lazily initialized on first access.
 
     Parameters
@@ -97,30 +114,40 @@ cdef cydriver.CUcontext get_primary_context(int dev_id) except?NULL:
     dev_id : int
         Device ID
 
     Returns
     -------
-    CUcontext
-        Primary context handle for the device, or NULL on error
+    ContextHandle
+        Handle to primary context for the device
     """
     cdef int total = 0
    cdef cydriver.CUcontext ctx
+    cdef ContextHandle h_context
+    cdef _ContextHandleWrapper wrapper
 
+    # Check TLS cache
     try:
         primary_ctxs = _tls.primary_ctxs
     except AttributeError:
         # Initialize primary context cache
         with nogil:
             HANDLE_RETURN(cydriver.cuDeviceGetCount(&total))
-        primary_ctxs = _tls.primary_ctxs = [0] * total
+        primary_ctxs = _tls.primary_ctxs = [None] * total
 
-    ctx = <cydriver.CUcontext><uintptr_t>(primary_ctxs[dev_id])
-    if ctx == NULL:
-        with nogil:
-            HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
-        primary_ctxs[dev_id] = <uintptr_t>(ctx)
-    return ctx
+    wrapper = primary_ctxs[dev_id]
+    if wrapper is not None:
+        return wrapper.h_context
+
+    # Acquire primary context (release GIL for driver call)
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id))
+    h_context = create_context_handle_ref(ctx)
 
+    # Cache the handle (wrapped in Python object)
+    _tls.primary_ctxs[dev_id] = _ContextHandleWrapper.create(h_context)
 
-cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL nogil:
-    """Get the context associated with a stream.
+    return h_context
+
+
+cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
+    """Get handle to the context associated with a stream.
 
     Parameters
     ----------
@@ -129,24 +156,26 @@ cdef cydriver.CUcontext get_stream_context(cydriver.CUstream stream) except?NULL
 
     Returns
     -------
-    CUcontext
-        Context handle associated with the stream, or NULL on error
+    ContextHandle
+        Handle to context associated with the stream
     """
     cdef cydriver.CUcontext ctx = NULL
     HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx))
-    return ctx
+    return create_context_handle_ref(ctx)
 
 
-cdef void set_current_context(cydriver.CUcontext ctx) except *:
-    """Set the current CUDA context.
+cdef void set_current_context(ContextHandle h_context) except * nogil:
+    """Set the current CUDA context from a handle.
 
     Parameters
     ----------
-    ctx : CUcontext
+    h_context : ContextHandle
         Context handle to set as current
     """
-    with nogil:
-        HANDLE_RETURN(cydriver.cuCtxSetCurrent(ctx))
+    if h_context.get() == NULL:
+        with gil:
+            raise ValueError("Cannot set NULL context as current")
+    HANDLE_RETURN(cydriver.cuCtxSetCurrent(h_context.get()[0]))
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index ceea0c6a5f..bf38a5515e 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -19,6 +19,7 @@ from cuda.core.experimental._context cimport (
 )
 from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
+from cuda.core.experimental._resource_handles cimport ContextHandle
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
 from cuda.core.experimental._utils.clear_error_support import assert_type
@@ -959,14 +960,15 @@ class Device:
     # important: creating a Device instance does not initialize the GPU!
         cdef cydriver.CUdevice dev
         cdef cydriver.CUcontext ctx
+        cdef ContextHandle h_context
         if device_id is None:
            with nogil:
                 err = cydriver.cuCtxGetDevice(&dev)
             if err == cydriver.CUresult.CUDA_SUCCESS:
                 device_id = int(dev)
             elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT:
-                ctx = get_current_context()
-                assert <void*>(ctx) == NULL
+                h_context = get_current_context()
+                assert h_context.get() == NULL
                 device_id = 0  # cudart behavior
             else:
                 HANDLE_RETURN(err)
@@ -1114,16 +1116,16 @@ class Device:
         """
         self._check_context_initialized()
-        cdef cydriver.CUcontext ctx
+        cdef ContextHandle h_context
         cdef cydriver.CUdevice dev
+        h_context = get_current_context()
+        if h_context.get() == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
         with nogil:
-            ctx = get_current_context()
             HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
         if dev != self._id:
             raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return Context._from_ctx(<uintptr_t>(ctx), self._id)
+        return Context._from_ctx(<uintptr_t>(h_context.get()[0]), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1215,18 +1217,19 @@ class Device:
         >>> # ... do work on device 0 ...
         """
-        cdef cydriver.CUcontext prev_ctx
-        cdef cydriver.CUcontext curr_ctx
+        cdef ContextHandle h_context
+        cdef cydriver.CUcontext prev_ctx, curr_ctx
+
         if ctx is not None:
             # TODO: revisit once Context is cythonized
             assert_type(ctx, Context)
-            if ctx._id != self._id:
+            if ctx._device_id != self._id:
                 raise RuntimeError(
                     "the provided context was created on the device with"
-                    f" id={ctx._id}, which is different from the target id={self._id}"
+                    f" id={ctx._device_id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = ctx._resource_handle.get()[0]
+            curr_ctx = ctx._h_context.get()[0]
             prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
             return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
         else:
             # use primary ctx
-            curr_ctx = get_primary_context(self._id)
-            set_current_context(curr_ctx)
+            h_context = get_primary_context(self._id)
+            with nogil:
+                set_current_context(h_context)
         self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
@@ -1310,12 +1314,11 @@ class Device:
         """
         self._check_context_initialized()
-        cdef cydriver.CUcontext ctx
-        with nogil:
-            ctx = get_current_context()
-        if ctx == NULL:
-            raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, <uintptr_t>(ctx), options, True)
+        cdef ContextHandle h_context
+        h_context = get_current_context()
+        if h_context.get() == NULL:
+            raise CUDAError("No context is bound to the calling CPU thread.")
+        return Event._init(self._id, <uintptr_t>(h_context.get()[0]), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
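[Note] The consumer-side pattern this commit establishes: an empty handle
(h.get() == NULL) means "no context", and the handle is dereferenced only after that
check. A hypothetical helper distilling the checks repeated in _device.pyx above
(_require_current_context is illustrative only; cimports as in that module):

    cdef cydriver.CUcontext _require_current_context() except?NULL:
        cdef ContextHandle h = get_current_context()
        if h.get() == NULL:
            raise CUDAError("No context is bound to the calling CPU thread.")
        return h.get()[0]  # safe to dereference: checked for emptiness above
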
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 8ada7d8cd5..08a4cc01c6 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -12,4 +12,5 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle # Function to create a non-owning context handle (references existing context) - ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) + # This is nogil-safe (pure C++, no Python dependencies) + ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 440130f679..de87f378df 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -31,6 +31,7 @@ from cuda.core.experimental._context cimport ( get_current_context, ) from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._resource_handles cimport ContextHandle from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._utils.cuda_utils import ( driver, @@ -321,16 +322,20 @@ cdef class Stream: cdef int _get_context(self) except?-1 nogil: if self._ctx_handle == CU_CONTEXT_INVALID: - self._ctx_handle = get_stream_context(self._handle) + h_context = get_stream_context(self._handle) + self._ctx_handle = h_context.get()[0] return 0 cdef int _get_device_and_context(self) except?-1: + cdef ContextHandle h_curr_context cdef cydriver.CUcontext curr_ctx if self._device_id == cydriver.CU_DEVICE_INVALID: # Get the current context - curr_ctx = get_current_context() - # Get the stream's context (self.ctx_handle is populated) - self._get_context() + with nogil: + h_curr_context = get_current_context() + curr_ctx = h_curr_context.get()[0] if h_curr_context.get() != NULL else 0 + # Get the stream's context (self._ctx_handle is populated) + self._get_context() # Get the stream's device (may require a context-switching dance) self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) return 0 From c173e3d1f4ed764a4495042fecbdc1649b0d914a Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 12:18:06 -0800 Subject: [PATCH 04/38] Add helper functions to extract raw resources from ContextHandle Introduce three helper functions for ContextHandle resource extraction: - native(h): Returns cydriver.CUcontext for use with cydriver API calls - py(h): Returns driver.CUcontext for use with Python driver API - intptr(h): Returns uintptr_t for internal APIs expecting integer addresses These helpers replace direct h_context.get()[0] calls, providing: - Cleaner, more semantic code - Consistent extraction pattern across all handle types - Type-safe conversions with clear intent Implementation details: - native() and intptr() are inline nogil functions in .pxd - py() requires Python module access, implemented in new _resource_handles.pyx - Updated all call sites in _context, _device, and _stream modules --- cuda_core/cuda/core/experimental/_context.pyx | 4 +-- cuda_core/cuda/core/experimental/_device.pyx | 8 +++--- .../core/experimental/_resource_handles.pxd | 25 +++++++++++++++++++ .../core/experimental/_resource_handles.pyx | 14 +++++++++-- cuda_core/cuda/core/experimental/_stream.pyx | 6 ++--- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 70870d3b1c..2b6ecad8e0 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -8,7 +8,7 @@ import threading
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._resource_handles cimport create_context_handle_ref
+from cuda.core.experimental._resource_handles cimport create_context_handle_ref, native
 from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
@@ -175,7 +175,7 @@ cdef void set_current_context(ContextHandle h_context) except * nogil:
     if h_context.get() == NULL:
         with gil:
             raise ValueError("Cannot set NULL context as current")
-    HANDLE_RETURN(cydriver.cuCtxSetCurrent(h_context.get()[0]))
+    HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context)))
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index bf38a5515e..2e3e5a1e43 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -19,7 +19,7 @@ from cuda.core.experimental._context cimport (
 )
 from cuda.core.experimental._context import ContextOptions
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, intptr, native
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
 from cuda.core.experimental._utils.clear_error_support import assert_type
@@ -1125,7 +1125,7 @@ class Device:
             HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))
         if dev != self._id:
             raise CUDAError("Internal error (current device is not equal to Device.device_id)")
-        return Context._from_ctx(<uintptr_t>(h_context.get()[0]), self._id)
+        return Context._from_ctx(intptr(h_context), self._id)
 
     @property
     def memory_resource(self) -> MemoryResource:
@@ -1229,7 +1229,7 @@ class Device:
                     f" id={ctx._device_id}, which is different from the target id={self._id}"
                 )
             # prev_ctx is the previous context
-            curr_ctx = ctx._h_context.get()[0]
+            curr_ctx = native(ctx._h_context)
             prev_ctx = NULL
             with nogil:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
@@ -1318,7 +1318,7 @@ class Device:
         h_context = get_current_context()
         if h_context.get() == NULL:
             raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, <uintptr_t>(h_context.get()[0]), options, True)
+        return Event._init(self._id, intptr(h_context), options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream. 
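[Note] Rule of thumb for the three extraction helpers introduced here: native() feeds
cydriver calls, py() feeds the Python-level driver API, and intptr() feeds internal
APIs keyed on integer addresses. A hypothetical sketch (the name _show_extractions is
illustrative only; cimports as in the modules above):

    cdef void _show_extractions(ContextHandle h):
        cdef cydriver.CUcontext raw = native(h)  # raw C handle for cydriver calls
        obj = py(h)                              # driver.CUcontext Python object
        cdef uintptr_t addr = intptr(h)          # integer address for internal APIs
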
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 08a4cc01c6..4ec0e6b62c 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from libc.stdint cimport uintptr_t
 from libcpp.memory cimport shared_ptr
 
 from cuda.bindings cimport cydriver
@@ -14,3 +15,27 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     # Function to create a non-owning context handle (references existing context)
     # This is nogil-safe (pure C++, no Python dependencies)
     ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil
+
+
+# ============================================================================
+# Helper functions to extract raw resources from handles
+# ============================================================================
+
+cdef inline cydriver.CUcontext native(ContextHandle h) nogil:
+    """Extract the native C type (cydriver.CUcontext) from the handle.
+
+    This is for use with cydriver API calls that expect the raw C type.
+    """
+    return h.get()[0]
+
+
+# Python conversion function (implemented in .pyx due to Python module dependency)
+cdef object py(ContextHandle h)
+
+
+cdef inline uintptr_t intptr(ContextHandle h) nogil:
+    """Extract the handle as a uintptr_t integer address.
+
+    This is for use with internal APIs that expect integer addresses.
+    """
+    return <uintptr_t>(h.get()[0])
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
index 564f2abac3..b150228762 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pyx
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -2,5 +2,15 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# This module exists to compile _cpp/resource_handles.cpp into a shared library.
-# The C++ code provides handle management for CUDA contexts and other resources.
+from libc.stdint cimport uintptr_t
+
+from cuda.bindings import driver
+from cuda.core.experimental._resource_handles cimport ContextHandle
+
+
+cdef object py(ContextHandle h):
+    """Convert the handle to a Python driver.CUcontext object.
+
+    This is for use with driver (Python) API calls or returning to Python code. 
+    """
+    return driver.CUcontext(<uintptr_t>(h.get()[0]))
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index de87f378df..ec6436af53 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -31,7 +31,7 @@ from cuda.core.experimental._context cimport (
     Context,
     get_stream_context,
     get_current_context,
 )
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, native
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
     driver,
@@ -323,7 +323,7 @@ cdef class Stream:
     cdef int _get_context(self) except?-1 nogil:
         if self._ctx_handle == CU_CONTEXT_INVALID:
             h_context = get_stream_context(self._handle)
-            self._ctx_handle = h_context.get()[0]
+            self._ctx_handle = native(h_context)
         return 0
 
     cdef int _get_device_and_context(self) except?-1:
@@ -333,7 +333,7 @@ cdef class Stream:
             # Get the current context
             with nogil:
                 h_curr_context = get_current_context()
-                curr_ctx = h_curr_context.get()[0] if h_curr_context.get() != NULL else 0
+                curr_ctx = native(h_curr_context) if h_curr_context.get() != NULL else 0
             # Get the stream's context (self._ctx_handle is populated)
             self._get_context()
             # Get the stream's device (may require a context-switching dance)
             self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx)
From 4357f580a553053f0c77c4d40c35bd06b5a07407 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Mon, 8 Dec 2025 16:31:56 -0800
Subject: [PATCH 05/38] Refactor context acquisition to C++ handle helpers

Move get_primary_context/get_current_context into C++ with thread-local
caching and conditional GIL release; move the ContextBox definition into
the C++ implementation file; update Cython modules and build hooks to link
handle users (including _device) against resource_handles and libcuda.
---
 cuda_core/build_hooks.py                      |  29 ++++-
 cuda_core/cuda/core/experimental/_context.pxd |   3 +-
 cuda_core/cuda/core/experimental/_context.pyx |  87 ++-------------
 .../experimental/_cpp/resource_handles.cpp    | 107 ++++++++++++++++--
 .../experimental/_cpp/resource_handles.hpp    |  26 ++---
 cuda_core/cuda/core/experimental/_device.pyx  |  10 +-
 .../core/experimental/_resource_handles.pxd   |   4 +
 cuda_core/cuda/core/experimental/_stream.pyx  |   7 +-
 8 files changed, 163 insertions(+), 110 deletions(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 7ebb67cef0..9b85973e17 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -84,6 +84,18 @@ def get_cuda_paths():
         print("CUDA paths:", CUDA_PATH)
         return CUDA_PATH
 
+    @functools.cache
+    def get_cuda_library_dirs():
+        """Return library search paths for CUDA driver runtime."""
+
+        libdirs = []
+        for root in get_cuda_paths():
+            for subdir in ("lib64", "lib"):
+                candidate = os.path.join(root, subdir)
+                if os.path.isdir(candidate):
+                    libdirs.append(candidate)
+        return libdirs
+
     def get_sources(mod_name):
         """Get source files for a module, including any .cpp files."""
         sources = [f"cuda/core/experimental/{mod_name}.pyx"]
@@ -95,7 +107,11 @@ def get_sources(mod_name):
 
         # Modules that use resource handles need to link against _resource_handles_impl.cpp
-        # This includes _context, _stream, _event, etc. as they adopt handle-based management
-        resource_handle_users = {"_context", "_stream", "_event"}
+        # Modules that call into the handle helpers implemented in
+        # `_resource_handles_impl.cpp` must link against that translation unit. 
+ # Keep this in sync with any module that cimports `get_primary_context` + # or other helpers defined there. + resource_handle_users = {"_context", "_stream", "_event", "_device"} if mod_name in resource_handle_users: resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" if os.path.exists(resource_handles_impl): @@ -103,6 +119,16 @@ def get_sources(mod_name): return sources + def get_extension_kwargs(mod_name): + """Return Extension kwargs (libraries, library_dirs) per module.""" + + resource_handle_users = {"_context", "_stream", "_event", "_device"} + kwargs = {} + if mod_name in resource_handle_users: + kwargs["libraries"] = ["cuda"] + kwargs["library_dirs"] = get_cuda_library_dirs() + return kwargs + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", @@ -113,6 +139,7 @@ def get_sources(mod_name): ] + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", + **get_extension_kwargs(mod), ) for mod in module_names ) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index d4dfa08085..01552c055e 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -17,7 +17,6 @@ cdef class Context: int _device_id # Cython-level context operations (handle-centric API) -cdef ContextHandle get_primary_context(int dev_id) except * -cdef ContextHandle get_current_context() except * nogil +# Note: get_primary_context and get_current_context are now pure C++ (imported from _resource_handles) cdef void set_current_context(ContextHandle h_context) except * nogil cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 2b6ecad8e0..6532eecadf 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,11 +4,15 @@ from dataclasses import dataclass -import threading from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport create_context_handle_ref, native +from cuda.core.experimental._resource_handles cimport ( + create_context_handle_ref, + get_primary_context, + get_current_context, + native, +) from cuda.core.experimental._utils.cuda_utils import driver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -16,21 +20,6 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] -# Lightweight Python wrapper for ContextHandle (for caching in TLS) -cdef class _ContextHandleWrapper: - """Internal wrapper to store ContextHandle in Python containers.""" - cdef ContextHandle h_context - - def __cinit__(self): - pass - - @staticmethod - cdef _ContextHandleWrapper create(ContextHandle h_context): - cdef _ContextHandleWrapper wrapper = _ContextHandleWrapper.__new__(_ContextHandleWrapper) - wrapper.h_context = h_context - return wrapper - - cdef class Context: """CUDA context wrapper. @@ -86,64 +75,8 @@ class ContextOptions: pass # TODO -cdef ContextHandle get_current_context() except * nogil: - """Get handle to the current CUDA context. 
-
-    Returns
-    -------
-    ContextHandle
-        Handle to current context, or empty handle if no context is bound
-    """
-    cdef cydriver.CUcontext ctx = NULL
-    HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
-    if ctx == NULL:
-        return ContextHandle()
-    return create_context_handle_ref(ctx)
+# get_current_context() and get_primary_context() are now pure C++ functions
+# imported from _resource_handles (with thread-local caching in C++)
 
 
 cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 93a31551e0..f584304496 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -2,34 +2,121 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include <Python.h>
+
 #include "resource_handles.hpp"
 #include <memory>
+#include <vector>
 
 namespace cuda_core {
 
+// Helper to release the GIL while calling into the CUDA driver.
+// This guard is *conditional*: if the caller already dropped the GIL,
+// we avoid calling PyEval_SaveThread (which requires holding the GIL).
+// It also handles the case where Python is finalizing and GIL operations
+// are no longer safe.
+class GILReleaseGuard {
+public:
+    GILReleaseGuard() : tstate_(nullptr), released_(false) {
+        // Don't try to manipulate GIL if Python is finalizing
+        if (!Py_IsInitialized() || _Py_IsFinalizing()) {
+            return;
+        }
+        // PyGILState_Check() returns 1 if the GIL is held by this thread.
+        if (PyGILState_Check()) {
+            tstate_ = PyEval_SaveThread();
+            released_ = true;
+        }
+    }
+
+    ~GILReleaseGuard() {
+        if (released_) {
+            PyEval_RestoreThread(tstate_);
+        }
+    }
+
+private:
+    PyThreadState* tstate_;
+    bool released_;
+};
+
+// Internal box structure for Context (kept private to this TU)
+struct ContextBox {
+    CUcontext resource;
+};
 
 ContextHandle create_context_handle_ref(CUcontext ctx) {
     // Creates a non-owning handle that references an existing context
     // (e.g., primary context managed by CUDA driver)
 
-    // Allocate the box containing the context resource
-    ContextBox* box = new ContextBox();
-    box->resource = ctx;
-
     // Use default deleter - it will delete the box, but not touch the CUcontext
     // CUcontext lifetime is managed externally (e.g., by CUDA driver)
-    std::shared_ptr<ContextBox> box_ptr(box);
+    auto box = new ContextBox{ctx};
+    auto box_ptr = std::shared_ptr<ContextBox>(box);
 
     // Use aliasing constructor to create handle that exposes only CUcontext
     // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
     return ContextHandle(box_ptr, &box_ptr->resource);
 }
 
-// TODO: Future owning handle for cuCtxCreate/cuCtxDestroy
-// ContextHandle create_context_handle(CUdevice dev, unsigned int flags) { ... }
+// Thread-local storage for primary context cache
+// Each thread maintains its own cache of primary contexts indexed by device ID
+thread_local std::vector<ContextHandle> primary_context_cache;
+
+ContextHandle get_primary_context(int dev_id) noexcept {
+    // Check thread-local cache
+    if (static_cast<size_t>(dev_id) < primary_context_cache.size()) {
+        auto cached = primary_context_cache[dev_id];
+        if (cached.get() != nullptr) {
+            return cached;  // Cache hit
+        }
+    }
+
+    // Cache miss - acquire primary context from driver
+    CUcontext ctx;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuDevicePrimaryCtxRetain(&ctx, dev_id);
+    }
+    if (err != CUDA_SUCCESS) {
+        // Return empty handle on error (caller must check)
+        return ContextHandle();
+    }
+
+    // Create owning handle with custom deleter that releases the primary context
+    auto box = new ContextBox{ctx};
+    auto box_ptr = std::shared_ptr<ContextBox>(box, [dev_id](const ContextBox* b) {
+        GILReleaseGuard gil;
+        cuDevicePrimaryCtxRelease(dev_id);
+        delete b;
+    });
+
+    // Use aliasing constructor to expose only CUcontext
+    auto h_context = ContextHandle(box_ptr, &box_ptr->resource);
+
+    // Resize cache if needed
+    if (static_cast<size_t>(dev_id) >= primary_context_cache.size()) {
+        primary_context_cache.resize(dev_id + 1);
+    }
+    primary_context_cache[dev_id] = h_context;
+
+    return h_context;
+}
+
+ContextHandle get_current_context() noexcept {
+    CUcontext ctx = nullptr;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuCtxGetCurrent(&ctx);
+    }
+    if (err != CUDA_SUCCESS || ctx == nullptr) {
+        // Return empty handle if no current context or error
+        return ContextHandle();
+    }
+    return create_context_handle_ref(ctx);
+}
 
 // ============================================================================
 // Stream Handles
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 949d9f1289..7d6892ccef 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -9,22 +9,22 @@
 
 namespace cuda_core {
 
-// Forward declarations
-struct ContextBox;
-
 // Handle type aliases - expose only the raw CUDA resource
 using ContextHandle = std::shared_ptr<const CUcontext>;
 
-// Internal box structure for Context
-// This holds the resource and 
any dependencies needed for lifetime management -struct ContextBox { - CUcontext resource; - // Context doesn't depend on other CUDA resources, but we keep the structure - // extensible for future needs -}; - -// Function to create a non-owning context handle (references existing context) -// This will be implemented in the .cpp file +// Function to create a non-owning context handle (references existing context). ContextHandle create_context_handle_ref(CUcontext ctx); +// ============================================================================ +// Context acquisition functions (pure C++, nogil-safe) +// ============================================================================ + +// Get handle to the primary context for a device (with thread-local caching) +// Returns empty handle on error (caller must check) +ContextHandle get_primary_context(int dev_id) noexcept; + +// Get handle to the current CUDA context +// Returns empty handle if no context is current (caller must check) +ContextHandle get_current_context() noexcept; + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 2e3e5a1e43..8d62b1de51 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -13,13 +13,17 @@ from typing import Optional, TYPE_CHECKING, Union from cuda.core.experimental._context cimport ( Context, - get_primary_context, - get_current_context, set_current_context, ) from cuda.core.experimental._context import ContextOptions from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._resource_handles cimport ContextHandle, intptr, native +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + get_primary_context, + get_current_context, + intptr, + native, +) from cuda.core.experimental._graph import GraphBuilder from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions from cuda.core.experimental._utils.clear_error_support import assert_type diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 4ec0e6b62c..99e97f977e 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -16,6 +16,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # This is nogil-safe (pure C++, no Python dependencies) ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil + # Context acquisition functions (pure C++, nogil-safe with thread-local caching) + ContextHandle get_primary_context(int dev_id) nogil + ContextHandle get_current_context() nogil + # ============================================================================ # Helper functions to extract raw resources from handles diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index ec6436af53..8e775c56be 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -28,10 +28,13 @@ if TYPE_CHECKING: from cuda.core.experimental._context cimport ( Context, get_stream_context, - get_current_context, ) from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._resource_handles cimport ContextHandle, native +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + get_current_context, + native, +) from cuda.core.experimental._graph import 
GraphBuilder from cuda.core.experimental._utils.cuda_utils import ( driver, From 625a86f077f0d95a737b920a943654dec4caabef Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 8 Dec 2025 17:47:11 -0800 Subject: [PATCH 06/38] Fix link error by loading _resource_handles with RTLD_GLOBAL The C++ implementation in _resource_handles_impl.cpp is now compiled only into _resource_handles.so. Other modules that depend on these symbols (_context, _device, etc.) resolve them at runtime via the global symbol table. This ensures a single shared instance of thread-local caches and avoids setuptools issues with shared source files across extensions. --- cuda_core/build_hooks.py | 19 ++++--------- cuda_core/cuda/core/experimental/__init__.py | 17 +++++++++++ .../experimental/_cpp/resource_handles.cpp | 28 +++---------------- .../experimental/_cpp/resource_handles.hpp | 2 +- 4 files changed, 27 insertions(+), 39 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 9b85973e17..a20407488e 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -105,26 +105,17 @@ def get_sources(mod_name): if os.path.exists(cpp_file): sources.append(cpp_file) - # Modules that use resource handles need to link against _resource_handles_impl.cpp - # This includes _context, _stream, _event, etc. as they adopt handle-based management - # Modules that call into the handle helpers implemented in - # `_resource_handles_impl.cpp` must link against that translation unit. - # Keep this in sync with any module that cimports `get_primary_context` - # or other helpers defined there. - resource_handle_users = {"_context", "_stream", "_event", "_device"} - if mod_name in resource_handle_users: - resource_handles_impl = "cuda/core/experimental/_resource_handles_impl.cpp" - if os.path.exists(resource_handles_impl): - sources.append(resource_handles_impl) - return sources def get_extension_kwargs(mod_name): """Return Extension kwargs (libraries, library_dirs) per module.""" - resource_handle_users = {"_context", "_stream", "_event", "_device"} + # Modules that use CUDA driver APIs need to link against libcuda + # _resource_handles: contains the C++ implementation that calls CUDA driver + # _context, _stream, _event, _device: use resource handles and may call CUDA driver directly + cuda_users = {"_resource_handles", "_context", "_stream", "_event", "_device"} kwargs = {} - if mod_name in resource_handle_users: + if mod_name in cuda_users: kwargs["libraries"] = ["cuda"] kwargs["library_dirs"] = get_cuda_library_dirs() return kwargs diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 826ea70b97..ac0627222b 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -12,6 +12,23 @@ raise ImportError("cuda.bindings 12.x or 13.x must be installed") import importlib +import sys + +# Load _resource_handles with RTLD_GLOBAL so its C++ symbols are available +# to other extension modules that depend on them (_context, _device, etc.) +# This must happen before importing any dependent modules. 
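+# On Windows there is no dlopen, so sys.setdlopenflags/os.RTLD_GLOBAL are
+# unavailable; a plain import is used there instead.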
+if sys.platform != "win32":
+    import os
+
+    _old_dlopen_flags = sys.getdlopenflags()
+    sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_NOW)
+    try:
+        from cuda.core.experimental import _resource_handles  # noqa: F401
+    finally:
+        sys.setdlopenflags(_old_dlopen_flags)
+    del _old_dlopen_flags, os
+else:
+    from cuda.core.experimental import _resource_handles  # noqa: F401
 
 subdir = f"cu{cuda_major}"
 try:
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index f584304496..6ee1088937 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -35,6 +35,10 @@ class GILReleaseGuard {
         }
     }
 
+    // Non-copyable, non-movable
+    GILReleaseGuard(const GILReleaseGuard&) = delete;
+    GILReleaseGuard& operator=(const GILReleaseGuard&) = delete;
+
 private:
     PyThreadState* tstate_;
     bool released_;
@@ -118,28 +122,4 @@ ContextHandle get_current_context() noexcept {
     return create_context_handle_ref(ctx);
 }
 
-// ============================================================================
-// Stream Handles
-// ============================================================================
-
-// TODO: Implement StreamH create_stream_handle(...) when Stream gets handle support
-
-// ============================================================================
-// Event Handles
-// ============================================================================
-
-// TODO: Implement EventH create_event_handle(...) when Event gets handle support
-
-// ============================================================================
-// Device Pointer Handles
-// ============================================================================
-
-// TODO: Implement DevicePtrH create_deviceptr_handle(...) when DevicePtr gets handle support
-
-// ============================================================================
-// Memory Pool Handles
-// ============================================================================
-
-// TODO: Implement MemPoolH create_mempool_handle(...) when MemPool gets handle support
-
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 7d6892ccef..54e7c3ba39 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -4,8 +4,8 @@
 
 #pragma once
 
-#include <memory>
 #include <cuda.h>
+#include <memory>
 
 namespace cuda_core {
 
From c0cbacd21c89b3ea08b68500f6802f0c5c6b5eb4 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Dec 2025 10:55:57 -0800
Subject: [PATCH 07/38] Move helper functions to C++ for overloading support

Move native(), intptr(), and py() from Cython inline functions to inline
C++ functions in resource_handles.hpp. This enables function overloading
when additional handle types (e.g., StreamHandle) are added.
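For illustration, a minimal sketch of the overload set this enables once a
second handle type exists (StreamHandle arrives in a later patch and is shown
here only to motivate the design):

    inline CUcontext native(const ContextHandle& h) noexcept { return h ? *h : nullptr; }
    inline CUstream native(const StreamHandle& h) noexcept { return h ? *h : nullptr; }
    // Cython callers just write native(h); C++ overload resolution selects
    // the right implementation at compile time.

The helpers being moved: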
- native(): extract raw CUDA handle from ContextHandle
- intptr(): extract handle as uintptr_t for Python interop
- py(): convert handle to Python driver wrapper object
---
 .../experimental/_cpp/resource_handles.hpp    | 33 +++++++++++++++++++
 .../core/experimental/_resource_handles.pxd   | 30 ++++++-----------
 .../core/experimental/_resource_handles.pyx   | 15 ++-------
 3 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 54e7c3ba39..7c0bf2ec63 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -4,7 +4,9 @@
 
 #pragma once
 
+#include <Python.h>
 #include <cuda.h>
+#include <cstdint>
 #include <memory>
 
 namespace cuda_core {
@@ -27,4 +29,35 @@ ContextHandle get_primary_context(int dev_id) noexcept;
 // Returns empty handle if no context is current (caller must check)
 ContextHandle get_current_context() noexcept;
 
+// ============================================================================
+// Helper functions to extract raw resources from handles
+// These are defined as inline C++ functions to support overloading when
+// additional handle types (e.g., StreamHandle) are added.
+// ============================================================================
+
+// native() - extract the raw CUDA handle
+inline CUcontext native(const ContextHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
+// intptr() - extract handle as uintptr_t for Python interop
+inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
+    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+}
+
+// py() - convert handle to Python driver wrapper object
+// Returns new reference. Caller must hold GIL.
+inline PyObject* py(const ContextHandle& h) {
+    static PyObject* cls = nullptr;
+    if (!cls) {
+        PyObject* mod = PyImport_ImportModule("cuda.bindings.driver");
+        if (!mod) return nullptr;
+        cls = PyObject_GetAttrString(mod, "CUcontext");
+        Py_DECREF(mod);
+        if (!cls) return nullptr;
+    }
+    std::uintptr_t val = h ? reinterpret_cast<std::uintptr_t>(*h) : 0;
+    return PyObject_CallFunction(cls, "K", val);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 99e97f977e..b65ee676a8 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -20,26 +20,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ContextHandle get_primary_context(int dev_id) nogil
     ContextHandle get_current_context() nogil
 
+    # ========================================================================
+    # Helper functions to extract raw resources from handles
+    # Defined in C++ to support overloading when additional handle types are added
+    # ========================================================================
 
-# ============================================================================
-# Helper functions to extract raw resources from handles
-# ============================================================================
+    # native() - extract the raw CUDA handle (nogil-safe)
+    cydriver.CUcontext native(ContextHandle h) nogil
 
-cdef inline cydriver.CUcontext native(ContextHandle h) nogil:
-    """Extract the native C type (cydriver.CUcontext) from the handle.
-
-    This is for use with cydriver API calls that expect the raw C type.
-    """
-    return h.get()[0]
-
-
-# Python conversion function (implemented in .pyx due to Python module dependency)
-cdef object py(ContextHandle h)
-
-
-cdef inline uintptr_t intptr(ContextHandle h) nogil:
-    """Extract the handle as a uintptr_t integer address.
-
-    This is for use with internal APIs that expect integer addresses.
-    """
-    return <uintptr_t>(h.get()[0])
+    # intptr() - extract handle as uintptr_t (nogil-safe)
+    uintptr_t intptr(ContextHandle h) nogil
+
+    # py() - convert handle to Python driver wrapper object (requires GIL)
+    object py(ContextHandle h)
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx
index b150228762..6395f21e2a 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pyx
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx
@@ -2,15 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport uintptr_t
-
-from cuda.bindings import driver
-from cuda.core.experimental._resource_handles cimport ContextHandle
-
-
-cdef object py(ContextHandle h):
-    """Convert the handle to a Python driver.CUcontext object.
-
-    This is for use with driver (Python) API calls or returning to Python code.
-    """
-    return driver.CUcontext(<uintptr_t>(h.get()[0]))
+# This module exists to compile _cpp/resource_handles.cpp into a shared library.
+# The helper functions (native, intptr, py) are implemented as inline C++ functions
+# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd.

From 4046023188b493813d49df24f1a56ffca4699d2c Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Dec 2025 11:19:29 -0800
Subject: [PATCH 08/38] Extend resource handle paradigm to Stream

Add StreamHandle for automatic stream lifetime management using the same
shared_ptr-based pattern established for ContextHandle.
Changes:
- Add StreamHandle type and create_stream_handle/create_stream_handle_ref
  functions in C++ with implementations in _resource_handles_impl.cpp
- Add overloaded native(), intptr(), py() helpers for StreamHandle
- Update Stream class to use _h_stream (StreamHandle) instead of raw _handle
- Owned streams are automatically destroyed when the last reference is released
- Borrowed streams (from __cuda_stream__ protocol) hold _owner reference
- Update memory resource files to use native(stream._h_stream)
- Simplify Context using intptr() and py() helpers
---
 cuda_core/cuda/core/experimental/_context.pyx | 28 ++------
 .../experimental/_cpp/resource_handles.cpp    | 41 +++++++++--
 .../experimental/_cpp/resource_handles.hpp    | 50 +++++++++++--
 .../_memory/_device_memory_resource.pyx       |  5 +-
 .../_memory/_graph_memory_resource.pyx        |  5 +-
 .../core/experimental/_resource_handles.pxd   | 26 +++++--
 cuda_core/cuda/core/experimental/_stream.pxd  |  3 +-
 cuda_core/cuda/core/experimental/_stream.pyx  | 70 ++++++++---------
 8 files changed, 148 insertions(+), 80 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx
index 6532eecadf..94c8379875 100644
--- a/cuda_core/cuda/core/experimental/_context.pyx
+++ b/cuda_core/cuda/core/experimental/_context.pyx
@@ -9,9 +9,9 @@ from libc.stdint cimport uintptr_t
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._resource_handles cimport (
     create_context_handle_ref,
-    get_primary_context,
-    get_current_context,
+    intptr,
     native,
+    py,
 )
 from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
@@ -42,28 +42,18 @@ cdef class Context:
     @property
     def handle(self):
         """Return the underlying CUcontext handle."""
-        cdef const cydriver.CUcontext* ptr = self._h_context.get()
-        if ptr != NULL:
-            return driver.CUcontext(<uintptr_t>(ptr[0]))
-        return None
+        if self._h_context.get() == NULL:
+            return None
+        return py(self._h_context)
 
     def __eq__(self, other):
         if not isinstance(other, Context):
             return NotImplemented
         cdef Context _other = other
-        # Compare the actual CUcontext values, not the shared_ptr objects
-        # (aliasing constructor creates different addresses even for same CUcontext)
-        cdef const cydriver.CUcontext* ptr1 = self._h_context.get()
-        cdef const cydriver.CUcontext* ptr2 = _other._h_context.get()
-        if ptr1 == NULL or ptr2 == NULL:
-            return ptr1 == ptr2
-        return ptr1[0] == ptr2[0]
+        return intptr(self._h_context) == intptr(_other._h_context)
 
     def __hash__(self) -> int:
-        cdef const cydriver.CUcontext* ptr = self._h_context.get()
-        if ptr == NULL:
-            return hash((type(self), 0))
-        return hash((type(self), <uintptr_t>(ptr[0])))
+        return hash((type(self), intptr(self._h_context)))
 
 
 @dataclass
@@ -75,10 +65,6 @@ class ContextOptions:
     pass  # TODO
 
 
-# get_current_context() and get_primary_context() are now pure C++ functions
-# imported from _resource_handles (with thread-local caching in C++)
-
-
 cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil:
-    """Get handle to the context associated with a stream.
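A minimal C++ sketch of why the Context equality change above compares
intptr() values rather than shared_ptr identity: two non-owning handles to
the same context live in different boxes, so pointer identity cannot be used.

    ContextHandle a = create_context_handle_ref(ctx);
    ContextHandle b = create_context_handle_ref(ctx);
    assert(a.get() != b.get());      // distinct boxes, distinct addresses
    assert(intptr(a) == intptr(b));  // same underlying CUcontext value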
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 6ee1088937..a99a0c09e3 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -55,12 +55,11 @@ ContextHandle create_context_handle_ref(CUcontext ctx) {
 
     // Use default deleter - it will delete the box, but not touch the CUcontext
     // CUcontext lifetime is managed externally (e.g., by CUDA driver)
-    auto box = new ContextBox{ctx};
-    auto box_ptr = std::shared_ptr<ContextBox>(box);
+    auto box = std::shared_ptr<ContextBox>(new ContextBox{ctx});
 
     // Use aliasing constructor to create handle that exposes only CUcontext
-    // The handle's reference count is tied to box_ptr, but it points to &box_ptr->resource
-    return ContextHandle(box_ptr, &box_ptr->resource);
+    // The handle's reference count is tied to box, but it points to &box->resource
+    return ContextHandle(box, &box->resource);
 }
 
 // Thread-local storage for primary context cache
@@ -89,15 +88,14 @@ ContextHandle get_primary_context(int dev_id) noexcept {
     }
 
     // Create owning handle with custom deleter that releases the primary context
-    auto box = new ContextBox{ctx};
-    auto box_ptr = std::shared_ptr<ContextBox>(box, [dev_id](const ContextBox* b) {
+    auto box = std::shared_ptr<ContextBox>(new ContextBox{ctx}, [dev_id](const ContextBox* b) {
         GILReleaseGuard gil;
         cuDevicePrimaryCtxRelease(dev_id);
         delete b;
     });
 
     // Use aliasing constructor to expose only CUcontext
-    auto h_context = ContextHandle(box_ptr, &box_ptr->resource);
+    auto h_context = ContextHandle(box, &box->resource);
 
     // Resize cache if needed
     if (static_cast<size_t>(dev_id) >= primary_context_cache.size()) {
@@ -122,4 +120,33 @@ ContextHandle get_current_context() noexcept {
     return create_context_handle_ref(ctx);
 }
 
+// ============================================================================
+// Stream Handles
+// ============================================================================
+
+// Internal box structure for Stream
+struct StreamBox {
+    CUstream resource;
+};
+
+StreamHandle create_stream_handle(CUstream stream) {
+    // Creates an owning handle - stream will be destroyed when handle is released
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
+        GILReleaseGuard gil;
+        cuStreamDestroy(b->resource);
+        delete b;
+    });
+
+    // Use aliasing constructor to expose only CUstream
+    return StreamHandle(box, &box->resource);
+}
+
+StreamHandle create_stream_handle_ref(CUstream stream) {
+    // Creates a non-owning handle - stream will NOT be destroyed
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream});
+
+    // Use aliasing constructor to expose only CUstream
+    return StreamHandle(box, &box->resource);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 7c0bf2ec63..f6f7d6fa79 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -11,16 +11,20 @@
 
 namespace cuda_core {
 
+// ============================================================================
 // Handle type aliases - expose only the raw CUDA resource
-using ContextHandle = std::shared_ptr<const CUcontext>;
+// ============================================================================
 
-// Function to create a non-owning context handle (references existing context).
-ContextHandle create_context_handle_ref(CUcontext ctx);
+using ContextHandle = std::shared_ptr<const CUcontext>;
+using StreamHandle = std::shared_ptr<const CUstream>;
 
 // ============================================================================
-// Context acquisition functions (pure C++, nogil-safe)
+// Context handle functions
 // ============================================================================
 
+// Function to create a non-owning context handle (references existing context).
+ContextHandle create_context_handle_ref(CUcontext ctx);
+
 // Get handle to the primary context for a device (with thread-local caching)
 // Returns empty handle on error (caller must check)
 ContextHandle get_primary_context(int dev_id) noexcept;
@@ -30,9 +34,20 @@ ContextHandle get_primary_context(int dev_id) noexcept;
 ContextHandle get_current_context() noexcept;
 
 // ============================================================================
-// Helper functions to extract raw resources from handles
-// These are defined as inline C++ functions to support overloading when
-// additional handle types (e.g., StreamHandle) are added.
+// Stream handle functions
+// ============================================================================
+
+// Create an owning stream handle. When the last reference is released,
+// cuStreamDestroy is called automatically.
+StreamHandle create_stream_handle(CUstream stream);
+
+// Create a non-owning stream handle (references existing stream).
+// Use for borrowed streams (from foreign code) or built-in streams.
+// The stream will NOT be destroyed when the handle is released.
+StreamHandle create_stream_handle_ref(CUstream stream);
+
+// ============================================================================
+// Overloaded helper functions to extract raw resources from handles
 // ============================================================================
 
 // native() - extract the raw CUDA handle
@@ -40,11 +55,19 @@ inline CUcontext native(const ContextHandle& h) noexcept {
     return h ? *h : nullptr;
 }
 
+inline CUstream native(const StreamHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
 // intptr() - extract handle as uintptr_t for Python interop
 inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
     return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
 }
 
+inline std::uintptr_t intptr(const StreamHandle& h) noexcept {
+    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+}
+
 // py() - convert handle to Python driver wrapper object
 // Returns new reference. Caller must hold GIL.
 inline PyObject* py(const ContextHandle& h) {
@@ -60,4 +83,17 @@ inline PyObject* py(const ContextHandle& h) {
     return PyObject_CallFunction(cls, "K", val);
 }
 
+inline PyObject* py(const StreamHandle& h) {
+    static PyObject* cls = nullptr;
+    if (!cls) {
+        PyObject* mod = PyImport_ImportModule("cuda.bindings.driver");
+        if (!mod) return nullptr;
+        cls = PyObject_GetAttrString(mod, "CUstream");
+        Py_DECREF(mod);
+        if (!cls) return nullptr;
+    }
+    std::uintptr_t val = h ? reinterpret_cast<std::uintptr_t>(*h) : 0;
+    return PyObject_CallFunction(cls, "K", val);
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index ac18079a62..3bfdb59c07 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -13,6 +13,7 @@ from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR
+from cuda.core.experimental._resource_handles cimport native
 from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
@@ -552,7 +553,7 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
 
 
 cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
    cdef cydriver.CUdeviceptr devptr
    with nogil:
        check_not_capturing(s)
@@ -569,7 +570,7 @@ cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream s
 cdef inline void DMR_deallocate(
     DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
 ) noexcept:
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr = ptr
     cdef cydriver.CUresult r
     with nogil:
diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
index c65354b612..9a83c9007c 100644
--- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx
@@ -8,6 +8,7 @@ from libc.stdint cimport intptr_t
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
+from cuda.core.experimental._resource_handles cimport native
 from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
@@ -186,7 +187,7 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil:
 
 
 cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream):
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr
     with nogil:
         check_capturing(s)
@@ -201,7 +202,7 @@ cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream
 
 
 cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept:
-    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUstream s = native(stream._h_stream)
     cdef cydriver.CUdeviceptr devptr = ptr
     with nogil:
         HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index b65ee676a8..1cf7a31a8d 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -9,11 +9,12 @@
 
 # Declare the C++ namespace and types
 cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
-    # Handle type - shared_ptr to const CUcontext
+    # ========================================================================
+    # Context Handle
+    # ========================================================================
     ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle
 
     # Function to create a non-owning context handle (references existing context)
-    # This is nogil-safe (pure C++, no Python dependencies)
     ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil
 
     # Context acquisition functions (pure C++, nogil-safe with thread-local caching)
@@ -21,15 +22,28 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ContextHandle get_current_context() nogil
 
     # ========================================================================
-    # Helper functions to extract raw resources from handles
-    # Defined in C++ to support overloading when additional handle types are added
+    # Stream Handle
+    # ========================================================================
+    ctypedef shared_ptr[const cydriver.CUstream] StreamHandle
+
+    # Create an owning stream handle (stream destroyed when handle released)
+    StreamHandle create_stream_handle(cydriver.CUstream stream) nogil
+
+    # Create a non-owning stream handle (stream NOT destroyed when handle released)
+    StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil
+
+    # ========================================================================
+    # Overloaded helper functions (C++ handles dispatch by type)
     # ========================================================================
 
-    # native() - extract the raw CUDA handle (nogil-safe)
+    # native() - extract the raw CUDA handle
     cydriver.CUcontext native(ContextHandle h) nogil
+    cydriver.CUstream native(StreamHandle h) nogil
 
-    # intptr() - extract handle as uintptr_t (nogil-safe)
+    # intptr() - extract handle as uintptr_t for Python interop
     uintptr_t intptr(ContextHandle h) nogil
+    uintptr_t intptr(StreamHandle h) nogil
 
     # py() - convert handle to Python driver wrapper object (requires GIL)
     object py(ContextHandle h)
+    object py(StreamHandle h)
diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd
index edc25e2ba7..f386386e98 100644
--- a/cuda_core/cuda/core/experimental/_stream.pxd
+++ b/cuda_core/cuda/core/experimental/_stream.pxd
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.bindings cimport cydriver
+from cuda.core.experimental._resource_handles cimport StreamHandle
 
 
 cdef class Stream:
 
     cdef:
-        cydriver.CUstream _handle
+        StreamHandle _h_stream
         object _owner
         bint _builtin
         int _nonblocking
diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index 8e775c56be..e8bd46f9b4 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -32,8 +32,13 @@ from cuda.core.experimental._context cimport (
 from cuda.core.experimental._event import Event, EventOptions
 from cuda.core.experimental._resource_handles cimport (
     ContextHandle,
+    StreamHandle,
+    create_stream_handle,
+    create_stream_handle_ref,
     get_current_context,
+    intptr,
     native,
+    py,
 )
 from cuda.core.experimental._graph import GraphBuilder
 from cuda.core.experimental._utils.cuda_utils import (
     driver,
@@ -87,7 +92,7 @@ cdef class Stream:
     using Stream.from_handle().
""" def __cinit__(self): - self._handle = (NULL) + # _h_stream is default-initialized to empty StreamHandle by C++ self._owner = None self._builtin = False self._nonblocking = -1 # lazy init'd @@ -104,26 +109,31 @@ cdef class Stream: @classmethod def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_LEGACY) + # Built-in streams are non-owning references + self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_LEGACY)) self._builtin = True return self @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_PER_THREAD) + # Built-in streams are non-owning references + self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_PER_THREAD)) self._builtin = True return self @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) + cdef cydriver.CUstream borrowed if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _handle_from_stream_protocol(obj) - # TODO: check if obj is created under the current context/device + # Borrowed stream from foreign object - non-owning reference + # Hold a reference to the owner to keep the underlying stream alive + borrowed = _handle_from_stream_protocol(obj) + self._h_stream = create_stream_handle_ref(borrowed) self._owner = obj return self @@ -147,46 +157,40 @@ cdef class Stream: cdef cydriver.CUstream s with nogil: HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - self._handle = s + # Owned stream - will be destroyed when handle is released + self._h_stream = create_stream_handle(s) self._nonblocking = int(nonblocking) self._priority = prio self._device_id = device_id if device_id is not None else self._device_id return self - def __dealloc__(self): - self.close() - cpdef close(self): """Destroy the stream. - Destroy the stream if we own it. Borrowed foreign stream - object will instead have their references released. - + Releases the stream handle. For owned streams, this destroys the + underlying CUDA stream. For borrowed streams, this just releases + the reference. """ - if self._owner is None: - if self._handle and not self._builtin: - with nogil: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = (NULL) + # Reset handle to empty - this decrements refcount and may trigger destruction + self._h_stream.reset() + self._owner = None def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, (self._handle)) + return (0, intptr(self._h_stream)) def __hash__(self) -> int: # Ensure context is initialized for hash consistency if self._ctx_handle == CU_CONTEXT_INVALID: self._get_context() - return hash(((self._ctx_handle), (self._handle))) + return hash(((self._ctx_handle), intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): return NotImplemented cdef Stream _other = other # Fast path: compare handles first - if (self._handle) != ((_other)._handle): + if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams if self._ctx_handle == CU_CONTEXT_INVALID: @@ -205,7 +209,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return driver.CUstream((self._handle)) + return py(self._h_stream) @property def is_nonblocking(self) -> bool: @@ -213,11 +217,8 @@ cdef class Stream: cdef unsigned int flags if self._nonblocking == -1: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) - if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: - self._nonblocking = True - else: - self._nonblocking = False + HANDLE_RETURN(cydriver.cuStreamGetFlags(native(self._h_stream), &flags)) + self._nonblocking = flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING return bool(self._nonblocking) @property @@ -226,14 +227,14 @@ cdef class Stream: cdef int prio if self._priority == INT32_MIN: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + HANDLE_RETURN(cydriver.cuStreamGetPriority(native(self._h_stream), &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" with nogil: - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuStreamSynchronize(native(self._h_stream))) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -268,7 +269,7 @@ cdef class Stream: cdef cydriver.CUevent e = ((event))._handle with nogil: - HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) + HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream))) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -288,7 +289,7 @@ cdef class Stream: event = (event_or_stream.handle) with nogil: # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0)) else: if isinstance(event_or_stream, Stream): stream = (event_or_stream.handle) @@ -305,7 +306,7 @@ cdef class Stream: HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) # TODO: support flags other than 0? 
- HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0)) HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property @@ -324,8 +325,9 @@ cdef class Stream: return Device((self._device_id)) cdef int _get_context(self) except?-1 nogil: + cdef ContextHandle h_context if self._ctx_handle == CU_CONTEXT_INVALID: - h_context = get_stream_context(self._handle) + h_context = get_stream_context(native(self._h_stream)) self._ctx_handle = native(h_context) return 0 From 39fbefcfed6c296c9d7637e3b5167331f202d424 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 12:50:31 -0800 Subject: [PATCH 09/38] Simplify Stream by moving more logic to C++ - Move stream creation to C++ (create_stream_handle now calls cuStreamCreateWithPriority internally) - Add get_legacy_stream/get_per_thread_stream for built-in streams - Add create_stream_handle_with_owner for borrowed streams that prevents Python owner from being GC'd via captured PyObject* - Add GILAcquireGuard (symmetric to GILReleaseGuard) for safely acquiring GIL in C++ destructors - Simplify Stream class: remove __cinit__, _owner, _builtin, _legacy_default, _per_thread_default - Use _from_handle as single initialization point for Stream - Remove obsolete subclassing tests for removed methods --- .../experimental/_cpp/resource_handles.cpp | 80 ++++++++++++++++++- .../experimental/_cpp/resource_handles.hpp | 17 +++- .../core/experimental/_resource_handles.pxd | 14 +++- cuda_core/cuda/core/experimental/_stream.pxd | 5 +- cuda_core/cuda/core/experimental/_stream.pyx | 72 ++++++++--------- 5 files changed, 142 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a99a0c09e3..a8b0fa60ef 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -44,6 +44,38 @@ class GILReleaseGuard { bool released_; }; +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +// Symmetric counterpart to GILReleaseGuard. +class GILAcquireGuard { +public: + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing + if (!Py_IsInitialized() || _Py_IsFinalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + + ~GILAcquireGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + + // Check if GIL was successfully acquired (for conditional operations) + bool acquired() const { return acquired_; } + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + // Internal box structure for Context (kept private to this TU) struct ContextBox { CUcontext resource; @@ -129,8 +161,19 @@ struct StreamBox { CUstream resource; }; -StreamHandle create_stream_handle(CUstream stream) { - // Creates an owning handle - stream will be destroyed when handle is released +StreamHandle create_stream_handle(unsigned int flags, int priority) { + // Creates an owning stream handle - calls cuStreamCreateWithPriority internally. + // Returns empty handle on error (caller must check). 
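+    // Errors are reported by returning an empty handle rather than by
+    // throwing, so this stays safe to call from nogil Cython code; the
+    // caller checks for an empty handle and raises on the Python side.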
+    CUstream stream;
+    CUresult err;
+    {
+        GILReleaseGuard gil;
+        err = cuStreamCreateWithPriority(&stream, flags, priority);
+    }
+    if (err != CUDA_SUCCESS) {
+        return StreamHandle();
+    }
+
     auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
         GILReleaseGuard gil;
         cuStreamDestroy(b->resource);
         delete b;
     });
@@ -149,4 +192,37 @@ StreamHandle create_stream_handle_ref(CUstream stream) {
     return StreamHandle(box, &box->resource);
 }
 
+StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
+    // Creates a non-owning handle that prevents a Python owner from being GC'd.
+    // The owner's refcount is incremented here and decremented when handle is released.
+    Py_XINCREF(owner);
+
+    auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [owner](const StreamBox* b) {
+        // Safely decrement owner refcount (GILAcquireGuard handles finalization check)
+        {
+            GILAcquireGuard gil;
+            if (gil.acquired()) {
+                Py_XDECREF(owner);
+            }
+        }
+        delete b;
+    });
+
+    return StreamHandle(box, &box->resource);
+}
+
+StreamHandle get_legacy_stream() noexcept {
+    // Return non-owning handle to the legacy default stream.
+    // Use function-local static for efficient repeated access.
+    static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY);
+    return handle;
+}
+
+StreamHandle get_per_thread_stream() noexcept {
+    // Return non-owning handle to the per-thread default stream.
+    // Use function-local static for efficient repeated access.
+    static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD);
+    return handle;
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index f6f7d6fa79..e32bc4d77c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -37,15 +37,26 @@ ContextHandle get_current_context() noexcept;
 // Stream handle functions
 // ============================================================================
 
-// Create an owning stream handle. When the last reference is released,
-// cuStreamDestroy is called automatically.
-StreamHandle create_stream_handle(CUstream stream);
+// Create an owning stream handle by calling cuStreamCreateWithPriority.
+// When the last reference is released, cuStreamDestroy is called automatically.
+// Returns empty handle on error (caller must check).
+StreamHandle create_stream_handle(unsigned int flags, int priority);
 
 // Create a non-owning stream handle (references existing stream).
 // Use for borrowed streams (from foreign code) or built-in streams.
 // The stream will NOT be destroyed when the handle is released.
 StreamHandle create_stream_handle_ref(CUstream stream);
 
+// Create a non-owning stream handle that prevents a Python owner from being GC'd.
+// The owner's refcount is incremented; decremented when handle is released.
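+// Typical use: adopting a borrowed stream from an object implementing the
+// __cuda_stream__ protocol, where the Python owner must be kept alive for
+// as long as the handle (and the CUstream it references) is in use.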
+StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); + +// Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +StreamHandle get_legacy_stream() noexcept; + +// Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +StreamHandle get_per_thread_stream() noexcept; + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 1cf7a31a8d..da152f4473 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -26,12 +26,22 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # ======================================================================== ctypedef shared_ptr[const cydriver.CUstream] StreamHandle - # Create an owning stream handle (stream destroyed when handle released) - StreamHandle create_stream_handle(cydriver.CUstream stream) nogil + # Create an owning stream handle via cuStreamCreateWithPriority + # Returns empty handle on error (caller must check) + StreamHandle create_stream_handle(unsigned int flags, int priority) nogil # Create a non-owning stream handle (stream NOT destroyed when handle released) StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil + # Create non-owning handle that prevents Python owner from being GC'd + StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) + + # Get non-owning handle to the legacy default stream + StreamHandle get_legacy_stream() nogil + + # Get non-owning handle to the per-thread default stream + StreamHandle get_per_thread_stream() nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index f386386e98..e727a29226 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -10,13 +10,14 @@ cdef class Stream: cdef: StreamHandle _h_stream - object _owner - bint _builtin int _nonblocking int _priority cydriver.CUdevice _device_id cydriver.CUcontext _ctx_handle + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream) + cpdef close(self) cdef int _get_context(self) except?-1 nogil cdef int _get_device_and_context(self) except?-1 diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index e8bd46f9b4..10742e2730 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -34,8 +34,10 @@ from cuda.core.experimental._resource_handles cimport ( ContextHandle, StreamHandle, create_stream_handle, - create_stream_handle_ref, + create_stream_handle_with_owner, get_current_context, + get_legacy_stream, + get_per_thread_stream, intptr, native, py, @@ -91,57 +93,54 @@ cdef class Stream: object, or created directly through using an existing handle using Stream.from_handle(). 
""" - def __cinit__(self): - # _h_stream is default-initialized to empty StreamHandle by C++ - self._owner = None - self._builtin = False - self._nonblocking = -1 # lazy init'd - self._priority = INT32_MIN # lazy init'd - self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd - def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. " "Please use Device APIs (create_stream) or other Stream APIs (from_handle)." ) + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream): + """Create a Stream from an existing StreamHandle (cdef-only factory).""" + cdef Stream s = cls.__new__(cls) + s._h_stream = h_stream + s._nonblocking = -1 # lazy init'd + s._priority = INT32_MIN # lazy init'd + s._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd + s._ctx_handle = CU_CONTEXT_INVALID # lazy init'd + return s + @classmethod def _legacy_default(cls): - cdef Stream self = Stream.__new__(cls) - # Built-in streams are non-owning references - self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_LEGACY)) - self._builtin = True - return self + """Return the legacy default stream (supports subclassing).""" + return Stream._from_handle(cls, get_legacy_stream()) @classmethod def _per_thread_default(cls): - cdef Stream self = Stream.__new__(cls) - # Built-in streams are non-owning references - self._h_stream = create_stream_handle_ref((cydriver.CU_STREAM_PER_THREAD)) - self._builtin = True - return self + """Return the per-thread default stream (supports subclassing).""" + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): - cdef Stream self = Stream.__new__(cls) + cdef StreamHandle h_stream cdef cydriver.CUstream borrowed + cdef Stream self if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - # Borrowed stream from foreign object - non-owning reference - # Hold a reference to the owner to keep the underlying stream alive + # Borrowed stream from foreign object + # C++ handle prevents owner from being GC'd until handle is released borrowed = _handle_from_stream_protocol(obj) - self._h_stream = create_stream_handle_ref(borrowed) - self._owner = obj - return self + h_stream = create_stream_handle_with_owner(borrowed, obj) + return Stream._from_handle(cls, h_stream) cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = opts.nonblocking priority = opts.priority - flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + cdef unsigned int flags = (cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking + else cydriver.CUstream_flags.CU_STREAM_DEFAULT) # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low with nogil: @@ -154,26 +153,25 @@ cdef class Stream: else: prio = high - cdef cydriver.CUstream s - with nogil: - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - # Owned stream - will be destroyed when handle is released - self._h_stream = create_stream_handle(s) + # C++ creates the stream and returns owning handle + h_stream = create_stream_handle(flags, prio) + if not h_stream: + raise RuntimeError("Failed to create CUDA stream") + self = Stream._from_handle(cls, h_stream) self._nonblocking = int(nonblocking) 
self._priority = prio - self._device_id = device_id if device_id is not None else self._device_id + if device_id is not None: + self._device_id = device_id return self cpdef close(self): """Destroy the stream. Releases the stream handle. For owned streams, this destroys the - underlying CUDA stream. For borrowed streams, this just releases - the reference. + underlying CUDA stream. For borrowed streams, this releases the + reference and allows the Python owner to be GC'd. """ - # Reset handle to empty - this decrements refcount and may trigger destruction self._h_stream.reset() - self._owner = None def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" From f90e625f3b69c45131fdbd33b87f5c7763b76a1d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 14:02:06 -0800 Subject: [PATCH 10/38] Refactor Stream to use ContextHandle and simplify initialization - Replace raw CUcontext _ctx_handle with ContextHandle _h_context for consistent handle paradigm and cleaner code - Replace CUdevice _device_id with int using -1 sentinel - Use intptr() helper instead of () casts throughout - Add _from_handle(type cls, ...) factory with subclass support - Add _legacy_default and _per_thread_default classmethods - Eliminate duplicated initialization code in _init --- cuda_core/cuda/core/experimental/_stream.pxd | 7 ++-- cuda_core/cuda/core/experimental/_stream.pyx | 35 +++++++++----------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index e727a29226..5b7603d23b 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -2,18 +2,17 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport StreamHandle +from cuda.core.experimental._resource_handles cimport ContextHandle, StreamHandle cdef class Stream: cdef: StreamHandle _h_stream + ContextHandle _h_context + int _device_id int _nonblocking int _priority - cydriver.CUdevice _device_id - cydriver.CUcontext _ctx_handle @staticmethod cdef Stream _from_handle(type cls, StreamHandle h_stream) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 10742e2730..9114bcb65f 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -12,7 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, - CU_CONTEXT_INVALID, get_device_from_ctx, HANDLE_RETURN, ) @@ -104,10 +103,10 @@ cdef class Stream: """Create a Stream from an existing StreamHandle (cdef-only factory).""" cdef Stream s = cls.__new__(cls) s._h_stream = h_stream + # _h_context is default-initialized to empty ContextHandle by C++ + s._device_id = -1 # lazy init'd (invalid sentinel) s._nonblocking = -1 # lazy init'd s._priority = INT32_MIN # lazy init'd - s._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - s._ctx_handle = CU_CONTEXT_INVALID # lazy init'd return s @classmethod @@ -179,9 +178,9 @@ cdef class Stream: def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if self._ctx_handle == CU_CONTEXT_INVALID: + if not self._h_context: self._get_context() - return hash(((self._ctx_handle), intptr(self._h_stream))) + return hash((intptr(self._h_context), 
intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): @@ -191,12 +190,12 @@ cdef class Stream: if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if self._ctx_handle == CU_CONTEXT_INVALID: + if not self._h_context: self._get_context() - if _other._ctx_handle == CU_CONTEXT_INVALID: + if not _other._h_context: _other._get_context() # Compare contexts as well - return (self._ctx_handle) == ((_other)._ctx_handle) + return intptr(self._h_context) == intptr(_other._h_context) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -258,7 +257,7 @@ cdef class Stream: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: self._get_device_and_context() - event = Event._init((self._device_id), (self._ctx_handle), options, False) + event = Event._init(self._device_id, intptr(self._h_context), options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " @@ -320,27 +319,25 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import self._get_device_and_context() - return Device((self._device_id)) + return Device(self._device_id) cdef int _get_context(self) except?-1 nogil: - cdef ContextHandle h_context - if self._ctx_handle == CU_CONTEXT_INVALID: - h_context = get_stream_context(native(self._h_stream)) - self._ctx_handle = native(h_context) + if not self._h_context: + self._h_context = get_stream_context(native(self._h_stream)) return 0 cdef int _get_device_and_context(self) except?-1: cdef ContextHandle h_curr_context cdef cydriver.CUcontext curr_ctx - if self._device_id == cydriver.CU_DEVICE_INVALID: + if self._device_id < 0: # Get the current context with nogil: h_curr_context = get_current_context() - curr_ctx = native(h_curr_context) if h_curr_context.get() != NULL else 0 - # Get the stream's context (self._ctx_handle is populated) + curr_ctx = native(h_curr_context) if h_curr_context else 0 + # Get the stream's context (self._h_context is populated) self._get_context() # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) + self._device_id = get_device_from_ctx(native(self._h_context), curr_ctx) return 0 @property @@ -348,7 +345,7 @@ cdef class Stream: """Return the :obj:`~_context.Context` associated with this stream.""" self._get_context() self._get_device_and_context() - return Context._from_ctx((self._ctx_handle), (self._device_id)) + return Context._from_ctx(intptr(self._h_context), self._device_id) @staticmethod def from_handle(handle: int) -> Stream: From d7a999dcc739235866d002ccce0326e6b06acfcf Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 9 Dec 2025 16:03:55 -0800 Subject: [PATCH 11/38] Extend ContextHandle to Event and standardize naming - Event now uses ContextHandle for _h_context instead of raw object - Event._init is now a cdef staticmethod accepting ContextHandle - Context._from_ctx renamed to Context._from_handle (cdef staticmethod) - Moved get_device_from_ctx to Stream module as Stream_ensure_ctx_device - Inlined get_stream_context into Stream_ensure_ctx - Simplified context push/pop logic in Stream_ensure_ctx_device Naming standardization: - Device._id -> Device._device_id - _dev_id -> _device_id throughout codebase - dev_id -> device_id for local variables - Updated tests to use public APIs instead of internal _init methods --- 
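Note: Stream_ensure_ctx_device boils down to the usual driver-API dance for
querying the device of a context that may not be current. A rough C++ sketch
of the logic (illustrative only; the real code is Cython, uses the handle
types, and checks every driver call):

    CUdevice device_of(CUcontext target, CUcontext current) {
        CUdevice dev;
        if (target != current) {
            cuCtxPushCurrent(target);    // temporarily make it current
            cuCtxGetDevice(&dev);
            cuCtxPopCurrent(NULL);       // restore the previous context
        } else {
            cuCtxGetDevice(&dev);
        }
        return dev;
    }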
cuda_core/cuda/core/experimental/_context.pxd | 7 +- cuda_core/cuda/core/experimental/_context.pyx | 47 ++------ .../experimental/_cpp/resource_handles.cpp | 18 ++-- .../experimental/_cpp/resource_handles.hpp | 2 +- cuda_core/cuda/core/experimental/_device.pyx | 58 +++++----- cuda_core/cuda/core/experimental/_event.pxd | 6 +- cuda_core/cuda/core/experimental/_event.pyx | 22 ++-- .../_memory/_device_memory_resource.pxd | 2 +- .../_memory/_device_memory_resource.pyx | 42 ++++---- .../_memory/_graph_memory_resource.pxd | 2 +- .../_memory/_graph_memory_resource.pyx | 16 +-- .../cuda/core/experimental/_memory/_ipc.pyx | 2 +- .../cuda/core/experimental/_memory/_legacy.py | 6 +- .../core/experimental/_resource_handles.pxd | 2 +- cuda_core/cuda/core/experimental/_stream.pxd | 2 - cuda_core/cuda/core/experimental/_stream.pyx | 77 +++++++------- .../core/experimental/_utils/cuda_utils.pxd | 4 - .../core/experimental/_utils/cuda_utils.pyx | 19 ---- cuda_core/tests/test_comparable.py | 47 +++----- cuda_core/tests/test_event.py | 10 +- cuda_core/tests/test_hashable.py | 100 +++++++----------- 21 files changed, 197 insertions(+), 294 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd index 01552c055e..062e865172 100644 --- a/cuda_core/cuda/core/experimental/_context.pxd +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Context: @@ -16,7 +15,5 @@ cdef class Context: ContextHandle _h_context int _device_id -# Cython-level context operations (handle-centric API) -# Note: get_primary_context and get_current_context are now pure C++ (imported from _resource_handles) -cdef void set_current_context(ContextHandle h_context) except * nogil -cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id) diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 94c8379875..0504778207 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,16 +4,14 @@ from dataclasses import dataclass -from libc.stdint cimport uintptr_t - from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( + ContextHandle, create_context_handle_ref, intptr, native, py, ) -from cuda.core.experimental._utils.cuda_utils import driver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -30,12 +28,11 @@ cdef class Context: def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") - @classmethod - def _from_ctx(cls, handle: driver.CUcontext, int device_id): - cdef Context ctx = Context.__new__(Context) - # Convert Python CUcontext to C-level CUcontext and create non-owning ContextHandle - cdef cydriver.CUcontext c_ctx = int(handle) - ctx._h_context = create_context_handle_ref(c_ctx) + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id): + """Create Context from existing ContextHandle (cdef-only factory).""" + cdef Context ctx = cls.__new__(cls) + ctx._h_context = h_context ctx._device_id = device_id return ctx @@ -63,35 +60,3 @@ class ContextOptions: Currently unused, reserved for future use. 
""" pass # TODO - - -cdef ContextHandle get_stream_context(cydriver.CUstream stream) except * nogil: - """Get handle to the context associated with a stream. - - Parameters - ---------- - stream : CUstream - Stream handle - - Returns - ------- - ContextHandle - Handle to context associated with the stream - """ - cdef cydriver.CUcontext ctx = NULL - HANDLE_RETURN(cydriver.cuStreamGetCtx(stream, &ctx)) - return create_context_handle_ref(ctx) - - -cdef void set_current_context(ContextHandle h_context) except * nogil: - """Set the current CUDA context from a handle. - - Parameters - ---------- - h_context : ContextHandle - Context handle to set as current - """ - if h_context.get() == NULL: - with gil: - raise ValueError("Cannot set NULL context as current") - HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a8b0fa60ef..076ff10810 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -98,10 +98,10 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { // Each thread maintains its own cache of primary contexts indexed by device ID thread_local std::vector primary_context_cache; -ContextHandle get_primary_context(int dev_id) noexcept { +ContextHandle get_primary_context(int device_id) noexcept { // Check thread-local cache - if (static_cast(dev_id) < primary_context_cache.size()) { - auto cached = primary_context_cache[dev_id]; + if (static_cast(device_id) < primary_context_cache.size()) { + auto cached = primary_context_cache[device_id]; if (cached.get() != nullptr) { return cached; // Cache hit } @@ -112,7 +112,7 @@ ContextHandle get_primary_context(int dev_id) noexcept { CUresult err; { GILReleaseGuard gil; - err = cuDevicePrimaryCtxRetain(&ctx, dev_id); + err = cuDevicePrimaryCtxRetain(&ctx, device_id); } if (err != CUDA_SUCCESS) { // Return empty handle on error (caller must check) @@ -120,9 +120,9 @@ ContextHandle get_primary_context(int dev_id) noexcept { } // Create owning handle with custom deleter that releases the primary context - auto box = std::shared_ptr(new ContextBox{ctx}, [dev_id](const ContextBox* b) { + auto box = std::shared_ptr(new ContextBox{ctx}, [device_id](const ContextBox* b) { GILReleaseGuard gil; - cuDevicePrimaryCtxRelease(dev_id); + cuDevicePrimaryCtxRelease(device_id); delete b; }); @@ -130,10 +130,10 @@ ContextHandle get_primary_context(int dev_id) noexcept { auto h_context = ContextHandle(box, &box->resource); // Resize cache if needed - if (static_cast(dev_id) >= primary_context_cache.size()) { - primary_context_cache.resize(dev_id + 1); + if (static_cast(device_id) >= primary_context_cache.size()) { + primary_context_cache.resize(device_id + 1); } - primary_context_cache[dev_id] = h_context; + primary_context_cache[device_id] = h_context; return h_context; } diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index e32bc4d77c..945ac0b2a8 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -27,7 +27,7 @@ ContextHandle create_context_handle_ref(CUcontext ctx); // Get handle to the primary context for a device (with thread-local caching) // Returns empty handle on error (caller must check) -ContextHandle get_primary_context(int dev_id) noexcept; +ContextHandle 
get_primary_context(int device_id) noexcept; // Get handle to the current CUDA context // Returns empty handle if no context is current (caller must check) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 8d62b1de51..f2f2f72a72 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -11,17 +11,15 @@ from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core.experimental._context cimport ( - Context, - set_current_context, -) +from cuda.core.experimental._context cimport Context from cuda.core.experimental._context import ContextOptions +from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, + create_context_handle_ref, get_primary_context, get_current_context, - intptr, native, ) from cuda.core.experimental._graph import GraphBuilder @@ -945,7 +943,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -987,9 +985,9 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] - for dev_id in range(total): + for i in range(total): device = super().__new__(cls) - device._id = dev_id + device._device_id = i device._memory_resource = None device._has_inited = False device._properties = None @@ -1004,19 +1002,19 @@ class Device: def _check_context_initialized(self): if not self._has_inited: raise CUDAError( - f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" + f"Device {self._device_id} is not yet initialized, perhaps you forgot to call .set_current() first?" 
) @property def device_id(self) -> int: """Return device ordinal.""" - return self._id + return self._device_id @property def pci_bus_id(self) -> str: """Return a PCI Bus Id string for this device.""" - bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._id)) + bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._device_id)) return bus_id[:12].decode() def can_access_peer(self, peer: Device | int) -> bool: @@ -1062,7 +1060,7 @@ class Device: cdef str uuid_hex if self._uuid is None: - dev = self._id + dev = self._device_id with nogil: IF CUDA_CORE_BUILD_MAJOR == "12": HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, dev)) @@ -1081,7 +1079,7 @@ class Device: cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) cdef char* name_ptr = name - cdef cydriver.CUdevice this_dev = self._id + cdef cydriver.CUdevice this_dev = self._device_id with nogil: HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] @@ -1091,7 +1089,7 @@ class Device: def properties(self) -> DeviceProperties: """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" if self._properties is None: - self._properties = DeviceProperties._init(self._id) + self._properties = DeviceProperties._init(self._device_id) return self._properties @@ -1127,9 +1125,9 @@ class Device: raise CUDAError("No context is bound to the calling CPU thread.") with nogil: HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != self._id: + if dev != self._device_id: raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return Context._from_ctx(intptr(h_context), self._id) + return Context._from_handle(Context, h_context, self._device_id) @property def memory_resource(self) -> MemoryResource: @@ -1138,7 +1136,7 @@ class Device: if self._memory_resource is None: # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
-            device_id = self._id
+            device_id = self._device_id
             with nogil:
                 HANDLE_RETURN(
                     cydriver.cuDeviceGetAttribute(
                         &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device_id
                     )
                 )
             if attr == 1:
                 from cuda.core.experimental._memory import DeviceMemoryResource
-                self._memory_resource = DeviceMemoryResource(self._id)
+                self._memory_resource = DeviceMemoryResource(self._device_id)
             else:
                 from cuda.core.experimental._memory import _SynchronousMemoryResource
-                self._memory_resource = _SynchronousMemoryResource(self._id)
+                self._memory_resource = _SynchronousMemoryResource(self._device_id)
         return self._memory_resource
 
@@ -1175,10 +1173,10 @@ class Device:
 
     def __int__(self):
         """Return device_id."""
-        return self._id
+        return self._device_id
 
     def __repr__(self):
-        return f"<Device {self._id} ({self.name})>"
+        return f"<Device {self._device_id} ({self.name})>"
 
     def __hash__(self) -> int:
         return hash(self.uuid)
@@ -1186,7 +1184,7 @@ class Device:
     def __eq__(self, other) -> bool:
         if not isinstance(other, Device):
             return NotImplemented
-        return self._id == other._id
+        return self._device_id == other._device_id
 
     def __reduce__(self):
         return Device, (self.device_id,)
@@ -1227,10 +1225,10 @@ class Device:
         if ctx is not None:
             # TODO: revisit once Context is cythonized
             assert_type(ctx, Context)
-            if ctx._device_id != self._id:
+            if ctx._device_id != self._device_id:
                 raise RuntimeError(
                     "the provided context was created on the device with"
-                    f" id={ctx._device_id}, which is different from the target id={self._id}"
+                    f" id={ctx._device_id}, which is different from the target id={self._device_id}"
                 )
             # prev_ctx is the previous context
             curr_ctx = native(ctx._h_context)
@@ -1240,12 +1238,14 @@ class Device:
                 HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx))
                 HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx))
             self._has_inited = True
             if prev_ctx != NULL:
-                return Context._from_ctx(<uintptr_t>(prev_ctx), self._id)
+                return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id)
         else:
             # use primary ctx
-            h_context = get_primary_context(self._id)
+            h_context = get_primary_context(self._device_id)
+            if h_context.get() == NULL:
+                raise ValueError("Cannot set NULL context as current")
             with nogil:
-                set_current_context(h_context)
+                HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context)))
             self._has_inited = True
 
     def create_context(self, options: ContextOptions = None) -> Context:
         """Create a new :obj:`~_context.Context` object.
@@ -1297,7 +1297,7 @@ class Device:
         """
         self._check_context_initialized()
-        return Stream._init(obj=obj, options=options, device_id=self._id)
+        return Stream._init(obj=obj, options=options, device_id=self._device_id)
 
     def create_event(self, options: EventOptions | None = None) -> Event:
         """Create an Event object without recording it to a Stream.
@@ -1322,7 +1322,7 @@ class Device:
         """
         self._check_context_initialized()
         cdef ContextHandle h_context
         h_context = get_current_context()
         if h_context.get() == NULL:
             raise CUDAError("No context is bound to the calling CPU thread.")
-        return Event._init(self._id, intptr(h_context), options, True)
+        return cyEvent._init(cyEvent, self._device_id, h_context, options, True)
 
     def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
         """Allocate device memory from a specified stream.
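
Note on the primary-context path used by set_current() above: the stand-alone
sketch below illustrates the thread-local caching pattern that
get_primary_context() implements in resource_handles.cpp. All names here
(Ctx, retain_primary, release_primary) are illustrative stand-ins for the
driver types and calls, not the real API:

    // Minimal sketch, with stand-in types, of the thread-local
    // primary-context cache pattern.
    #include <cstdio>
    #include <memory>
    #include <vector>

    struct Ctx { int device_id; };                             // stands in for CUcontext
    static Ctx* retain_primary(int id) { return new Ctx{id}; } // ~cuDevicePrimaryCtxRetain
    static void release_primary(Ctx* c) { std::printf("release %d\n", c->device_id); delete c; }

    using CtxHandle = std::shared_ptr<const Ctx>;

    CtxHandle primary_context(int device_id) {
        thread_local std::vector<CtxHandle> cache;             // one slot per device, per thread
        if (static_cast<size_t>(device_id) < cache.size() && cache[device_id])
            return cache[device_id];                           // cache hit: no driver call
        CtxHandle h(retain_primary(device_id), release_primary);
        if (static_cast<size_t>(device_id) >= cache.size())
            cache.resize(device_id + 1);
        cache[device_id] = h;
        return h;
    }

    int main() {
        auto a = primary_context(0);
        auto b = primary_context(0);                           // same handle, no second retain
        std::printf("shared: %d\n", a.get() == b.get());       // prints "shared: 1"
    }

As in the real code, the cached handle keeps the primary context retained
until the owning thread goes away.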
diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd index 1f586f18df..d92c9627c3 100644 --- a/cuda_core/cuda/core/experimental/_event.pxd +++ b/cuda_core/cuda/core/experimental/_event.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ContextHandle cdef class Event: @@ -14,6 +15,9 @@ cdef class Event: bint _ipc_enabled object _ipc_descriptor int _device_id - object _ctx_handle + ContextHandle _h_context + + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) cpdef close(self) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 149c92b8e1..2ac284d8c9 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -8,6 +8,8 @@ cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._context cimport Context +from cuda.core.experimental._resource_handles cimport ContextHandle, intptr from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN @@ -17,8 +19,6 @@ import cython from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional - -from cuda.core.experimental._context import Context from cuda.core.experimental._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, @@ -87,9 +87,9 @@ cdef class Event: def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).") - @classmethod - def _init(cls, device_id: int, ctx_handle: Context, options=None, is_free=False): - cdef Event self = Event.__new__(cls) + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free): + cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 self._timing_disabled = False @@ -114,7 +114,7 @@ cdef class Event: with nogil: HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id - self._ctx_handle = ctx_handle + self._h_context = h_context if opts.ipc_enabled: self.get_ipc_descriptor() return self @@ -165,7 +165,7 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((self._ctx_handle, (self._handle))) + return hash((type(self), intptr(self._h_context), (self._handle))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. @@ -199,8 +199,8 @@ cdef class Event: self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 # ?? - self._ctx_handle = None # ?? 
+ self._device_id = -1 + self._h_context = ContextHandle() return self @property @@ -271,8 +271,8 @@ cdef class Event: @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._ctx_handle is not None and self._device_id >= 0: - return Context._from_ctx(self._ctx_handle, self._device_id) + if self._h_context and self._device_id >= 0: + return Context._from_handle(Context, self._h_context, self._device_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd index 823a270b27..d31ff7b2e1 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd @@ -9,7 +9,7 @@ from cuda.core.experimental._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id cydriver.CUmemoryPool _handle bint _mempool_owned IPCDataForMR _ipc_data diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 3bfdb59c07..d06f0b8297 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -219,7 +219,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ def __cinit__(self): - self._dev_id = cydriver.CU_DEVICE_INVALID + self._device_id = cydriver.CU_DEVICE_INVALID self._handle = NULL self._mempool_owned = False self._ipc_data = None @@ -228,16 +228,16 @@ cdef class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: Device | int, options=None): from .._device import Device - cdef int dev_id = Device(device_id).device_id + cdef int c_device_id = Device(device_id).device_id opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) if opts is None: - DMR_init_current(self, dev_id) + DMR_init_current(self, c_device_id) else: - DMR_init_create(self, dev_id, opts) + DMR_init_create(self, c_device_id, opts) def __dealloc__(self): DMR_close(self) @@ -366,7 +366,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def handle(self) -> driver.CUmemoryPool: @@ -438,11 +438,11 @@ cdef class DeviceMemoryResource(MemoryResource): # Convert all devices to device IDs cdef set[int] target_ids = {Device(dev).device_id for dev in devices} - target_ids.discard(self._dev_id) # exclude this device from peer access list - this_dev = Device(self._dev_id) + target_ids.discard(self._device_id) # exclude this device from peer access list + this_dev = Device(self._device_id) cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)] if bad: - raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}") + raise ValueError(f"Device {self._device_id} cannot access peer(s): {', '.join(map(str, bad))}") cdef set[int] cur_ids = set(self._peer_accessible_by) cdef set[int] to_add = target_ids - cur_ids cdef set[int] to_rm = cur_ids - target_ids @@ -456,16 +456,16 @@ cdef class DeviceMemoryResource(MemoryResource): raise MemoryError("Failed to allocate memory for access descriptors") try: - for dev_id in to_add: + for device_id in to_add: 
access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id + access_desc[i].location.id = device_id i += 1 - for dev_id in to_rm: + for device_id in to_rm: access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - access_desc[i].location.id = dev_id + access_desc[i].location.id = device_id i += 1 with nogil: @@ -480,16 +480,16 @@ cdef class DeviceMemoryResource(MemoryResource): # DeviceMemoryResource Implementation # ----------------------------------- -cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): +cdef void DMR_init_current(DeviceMemoryResource self, int device_id): # Get the current memory pool. cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX - self._dev_id = dev_id + self._device_id = device_id self._mempool_owned = False with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), device_id)) # Set a higher release threshold to improve performance when there are # no active allocations. By default, the release threshold is 0, which @@ -513,7 +513,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): cdef void DMR_init_create( - DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts + DeviceMemoryResource self, int device_id, DeviceMemoryResourceOptions opts ): # Create a new memory pool. cdef cydriver.CUmemPoolProps properties @@ -524,13 +524,13 @@ cdef void DMR_init_create( memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED properties.handleTypes = _ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - properties.location.id = dev_id + properties.location.id = device_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size properties.win32SecurityAttributes = NULL properties.usage = 0 - self._dev_id = dev_id + self._device_id = device_id self._mempool_owned = True with nogil: @@ -593,7 +593,7 @@ cdef inline DMR_close(DeviceMemoryResource self): with nogil: HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) finally: - self._dev_id = cydriver.CU_DEVICE_INVALID + self._device_id = cydriver.CU_DEVICE_INVALID self._handle = NULL self._attributes = None self._mempool_owned = False @@ -618,12 +618,12 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): """ from .._device import Device - cdef int dev_id = Device(device_id).device_id + cdef int c_device_id = Device(device_id).device_id cdef cydriver.CUmemAccess_flags flags cdef cydriver.CUmemLocation location location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - location.id = dev_id + location.id = c_device_id with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd index f9c7798e76..00af6e407b 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd @@ -7,4 +7,4 @@ from 
cuda.core.experimental._memory._buffer cimport MemoryResource cdef class cyGraphMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 9a83c9007c..5ad9d86c53 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -23,7 +23,7 @@ __all__ = ['GraphMemoryResource'] cdef class GraphMemoryResourceAttributes: cdef: - int _dev_id + int _device_id def __init__(self, *args, **kwargs): raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @@ -31,7 +31,7 @@ cdef class GraphMemoryResourceAttributes: @classmethod def _init(cls, device_id: int): cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) - self._dev_id = device_id + self._device_id = device_id return self def __repr__(self): @@ -42,12 +42,12 @@ cdef class GraphMemoryResourceAttributes: cdef int _getattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 cdef int _setattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 @property @@ -101,7 +101,7 @@ cdef class GraphMemoryResourceAttributes: cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): - self._dev_id = device_id + self._device_id = device_id def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """ @@ -124,17 +124,17 @@ cdef class cyGraphMemoryResource(MemoryResource): def trim(self): """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" with nogil: - HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._device_id)) @property def attributes(self) -> GraphMemoryResourceAttributes: """Asynchronous allocation attributes related to graphs.""" - return GraphMemoryResourceAttributes._init(self._dev_id) + return GraphMemoryResourceAttributes._init(self._device_id) @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def is_device_accessible(self) -> bool: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index c9931855cf..7c5a9b0409 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -212,7 +212,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl # Construct a new DMR. 
cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id self._mempool_owned = True self._ipc_data = IPCDataForMR(alloc_handle, True) diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py index 09ea0e15d2..bff7638734 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -84,12 +84,12 @@ def device_id(self) -> int: class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) + __slots__ = ("_device_id",) def __init__(self, device_id): from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -116,4 +116,4 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - return self._dev_id + return self._device_id diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index da152f4473..0423ef0ec8 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -18,7 +18,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil # Context acquisition functions (pure C++, nogil-safe with thread-local caching) - ContextHandle get_primary_context(int dev_id) nogil + ContextHandle get_primary_context(int device_id) nogil ContextHandle get_current_context() nogil # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index 5b7603d23b..0877d37be1 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -18,8 +18,6 @@ cdef class Stream: cdef Stream _from_handle(type cls, StreamHandle h_stream) cpdef close(self) - cdef int _get_context(self) except?-1 nogil - cdef int _get_device_and_context(self) except?-1 cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 9114bcb65f..3ba38095e4 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -12,7 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._event cimport Event as cyEvent from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, - get_device_from_ctx, HANDLE_RETURN, ) @@ -24,14 +23,12 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings from cuda.core.experimental._device import Device -from cuda.core.experimental._context cimport ( - Context, - get_stream_context, -) +from cuda.core.experimental._context cimport Context from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, StreamHandle, + create_context_handle_ref, create_stream_handle, create_stream_handle_with_owner, get_current_context, @@ -178,8 +175,7 @@ cdef class Stream: def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if not self._h_context: - self._get_context() + Stream_ensure_ctx(self) return hash((intptr(self._h_context), 
intptr(self._h_stream))) def __eq__(self, other) -> bool: @@ -190,10 +186,8 @@ cdef class Stream: if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if not self._h_context: - self._get_context() - if not _other._h_context: - _other._get_context() + Stream_ensure_ctx(self) + Stream_ensure_ctx(_other) # Compare contexts as well return intptr(self._h_context) == intptr(_other._h_context) @@ -256,8 +250,8 @@ cdef class Stream: # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: - self._get_device_and_context() - event = Event._init(self._device_id, intptr(self._h_context), options, False) + Stream_ensure_ctx_device(self) + event = cyEvent._init(cyEvent, self._device_id, self._h_context, options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " @@ -318,34 +312,15 @@ cdef class Stream: """ from cuda.core.experimental._device import Device # avoid circular import - self._get_device_and_context() + Stream_ensure_ctx_device(self) return Device(self._device_id) - cdef int _get_context(self) except?-1 nogil: - if not self._h_context: - self._h_context = get_stream_context(native(self._h_stream)) - return 0 - - cdef int _get_device_and_context(self) except?-1: - cdef ContextHandle h_curr_context - cdef cydriver.CUcontext curr_ctx - if self._device_id < 0: - # Get the current context - with nogil: - h_curr_context = get_current_context() - curr_ctx = native(h_curr_context) if h_curr_context else 0 - # Get the stream's context (self._h_context is populated) - self._get_context() - # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(native(self._h_context), curr_ctx) - return 0 - @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this stream.""" - self._get_context() - self._get_device_and_context() - return Context._from_ctx(intptr(self._h_context), self._device_id) + Stream_ensure_ctx(self) + Stream_ensure_ctx_device(self) + return Context._from_handle(Context, self._h_context, self._device_id) @staticmethod def from_handle(handle: int) -> Stream: @@ -425,6 +400,36 @@ cpdef Stream default_stream(): return C_LEGACY_DEFAULT_STREAM +cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: + """Ensure the stream's context handle is populated.""" + cdef cydriver.CUcontext ctx + if not self._h_context: + HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) + self._h_context = create_context_handle_ref(ctx) + return 0 + + +cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: + """Ensure the stream's context and device_id are populated.""" + cdef ContextHandle h_curr_context + cdef cydriver.CUcontext target_ctx, curr_ctx, ctx + cdef cydriver.CUdevice target_dev + cdef bint switch_context + + if self._device_id < 0: + with nogil: + # Get device ID from context, switching context temporarily if needed + Stream_ensure_ctx(self) + switch_context = (get_current_context() != self._h_context) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPushCurrent(native(self._h_context))) + HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + self._device_id = target_dev + return 0 + + cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: if isinstance(obj, Stream): return (obj.handle) 
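
The Stream_ensure_ctx_device helper above performs the classic
"context-switching dance". For reference, the same pattern written directly
against the driver API (a sketch with simplified error handling, not the
project's HANDLE_RETURN-based code):

    // Sketch: find the device behind an arbitrary CUcontext by making it
    // current just long enough to call cuCtxGetDevice.
    #include <cuda.h>

    static CUresult device_of_context(CUcontext target, CUdevice* out) {
        CUcontext current = nullptr;
        CUresult err = cuCtxGetCurrent(&current);
        if (err != CUDA_SUCCESS) return err;
        bool need_switch = (current != target);
        if (need_switch) {
            err = cuCtxPushCurrent(target);            // temporarily bind target
            if (err != CUDA_SUCCESS) return err;
        }
        err = cuCtxGetDevice(out);                     // reads the *current* context
        if (need_switch) {
            CUcontext popped = nullptr;
            CUresult perr = cuCtxPopCurrent(&popped);  // restore previous binding
            if (err == CUDA_SUCCESS) err = perr;
        }
        return err;
    }

Unlike the removed get_device_from_ctx, the push/pop happens only when the
target context is not already current.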
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index ce30285aa5..9b5044beda 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -22,10 +22,6 @@ ctypedef fused integer_t: cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil - - cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 4489871747..22b6fb6c4b 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -197,25 +197,6 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: - """Get device ID from the given ctx.""" - cdef bint switch_context = (curr_ctx != target_ctx) - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice target_dev - with nogil: - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert curr_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) - HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert target_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) - return target_dev - - def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index 72b3caa2ba..2c05932dcc 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,8 +9,7 @@ """ from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._event import Event from cuda.core.experimental._stream import StreamOptions # ============================================================================ @@ -105,50 +104,34 @@ def test_event_subclass_equality(init_cuda): Event uses isinstance() for equality checking, similar to Stream. 
""" - - class MyEvent(Event): - pass - device = Device(0) device.set_current() - # Create two different events - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() + event3 = device.create_event() # Different events should not be equal (different handles) - assert event != my_event, "Different Event instances are not equal" + assert event1 != event2, "Different Event instances are not equal" + assert event2 != event3, "Different Event instances are not equal" - # Same subclass type with different handles - my_event2 = MyEvent._init(device.device_id, device.context, options=EventOptions()) - assert my_event != my_event2, "Different MyEvent instances are not equal" - - -def test_context_subclass_equality(init_cuda): - """Test Context subclass equality behavior.""" - - class MyContext(Context): - pass +def test_context_equality(init_cuda): + """Test Context equality behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - - # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context.handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context, not subclass" - assert type(my_context) is not MyContext - - # Since both are Context instances with same handle, they're equal - assert context == my_context, "Context instances with same handle are equal" - # Create another context from different stream + # Get context from different sources + stream1 = device.create_stream() stream2 = device.create_stream() + context1 = stream1.context context2 = stream2.context + device_context = device.context # Same device, same primary context, should be equal - assert context == context2, "Contexts from same device are equal" + assert context1 == context2, "Contexts from same device are equal" + assert context1 == device_context, "Stream context equals device context" def test_subclass_type_safety(init_cuda): diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..f5bf19f8e3 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -148,14 +148,12 @@ def test_event_context(init_cuda): assert context is not None -def test_event_subclassing(): - class MyEvent(Event): - pass - +def test_event_creation(): + """Test Event creation via public API.""" dev = Device() dev.set_current() - event = MyEvent._init(dev.device_id, dev.context) - assert isinstance(event, MyEvent) + event = dev.create_event() + assert isinstance(event, Event) # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 751a88250c..1ecf8cdedd 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -13,8 +13,7 @@ """ from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._event import Event from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ @@ -128,65 +127,51 @@ class MyStream(Stream): assert hash(my_stream) != hash(my_stream2), "Different streams have different hashes" -def 
test_event_subclass_hash(init_cuda): - """Test Event subclass hash behavior.""" - - class MyEvent(Event): - pass - +def test_event_hash(init_cuda): + """Test Event hash behavior.""" device = Device(0) device.set_current() - # Create events with different handles - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() # Different events (different handles) -> different hashes - assert hash(event) != hash(my_event), "Different events have different hashes" - assert event != my_event, "Different handles means not equal" + assert hash(event1) != hash(event2), "Different events have different hashes" + assert event1 != event2, "Different handles means not equal" # Verify hash consistency - hash1 = hash(event) - hash2 = hash(event) + hash1 = hash(event1) + hash2 = hash(event1) assert hash1 == hash2, "Hash is consistent across multiple calls" # Both should be usable as dict keys - cache = {event: "base", my_event: "subclass"} + cache = {event1: "first", event2: "second"} assert len(cache) == 2, "Different events are distinct dict keys" - assert cache[event] == "base" - assert cache[my_event] == "subclass" - - -def test_context_subclass_hash(init_cuda): - """Test Context subclass hash behavior. + assert cache[event1] == "first" + assert cache[event2] == "second" - Context._from_ctx() always returns Context instances, even when called - as MyContext._from_ctx(). This means we can't create actual MyContext - instances in practice. - """ - - class MyContext(Context): - pass +def test_context_hash(init_cuda): + """Test Context hash behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context.handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context type" + # Get context from different sources + stream1 = device.create_stream() + stream2 = device.create_stream() + context1 = stream1.context + context2 = stream2.context - # Same handle -> same hash - assert hash(context) == hash(my_context), "Contexts with same handle have same hash" + # Same underlying context -> same hash + assert hash(context1) == hash(context2), "Contexts with same handle have same hash" # Verify equality matches hash - assert context == my_context, "Contexts with same handle are equal" - assert hash(context) == hash(my_context), "Equal contexts have equal hashes" + assert context1 == context2, "Contexts with same handle are equal" # Verify hash consistency - hash1 = hash(context) - hash2 = hash(context) + hash1 = hash(context1) + hash2 = hash(context1) assert hash1 == hash2, "Hash is consistent across multiple calls" @@ -200,33 +185,24 @@ def test_hash_equality_contract_maintained(init_cuda): allowing cross-type equality with consistent hashing. 
""" - class MyStream(Stream): - pass - - class MyEvent(Event): - pass - - class MyContext(Context): - pass - device = Device(0) device.set_current() - # Test Stream: base and subclass with same handle - my_stream = MyStream._init(options=StreamOptions(), device_id=device.device_id) - stream = Stream.from_handle(int(my_stream.handle)) + # Test Stream: two references to same handle + stream1 = device.create_stream() + stream2 = Stream.from_handle(int(stream1.handle)) - assert my_stream == stream, "Equal due to isinstance() check and same handle" - assert hash(my_stream) == hash(stream), "Equal objects have equal hashes" + assert stream1 == stream2, "Equal due to same handle" + assert hash(stream1) == hash(stream2), "Equal objects have equal hashes" - # Test Context: always returns base type from _from_ctx - ctx = device.context - my_ctx = MyContext._from_ctx(ctx.handle, device.device_id) + # Test Context: contexts from same device share same underlying context + ctx1 = device.context + ctx2 = device.create_stream().context - assert ctx == my_ctx, "Equal contexts with same handle" - assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" + assert ctx1 == ctx2, "Equal contexts with same handle" + assert hash(ctx1) == hash(ctx2), "Equal objects have equal hashes" # Test that different handles still produce different hashes - my_stream2 = MyStream._init(options=StreamOptions(), device_id=device.device_id) - assert my_stream != my_stream2, "Different handles means not equal" - assert hash(my_stream) != hash(my_stream2), "Different objects have different hashes" + stream3 = device.create_stream() + assert stream1 != stream3, "Different handles means not equal" + assert hash(stream1) != hash(stream3), "Different objects have different hashes" From 1e1398471d793a0921b704380d9ba6d8189fdec0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 10 Dec 2025 21:06:44 -0800 Subject: [PATCH 12/38] Store owning context handle in Device Device now stores its Context in _context slot, set during set_current(). This ensures Device holds an owning reference to its context, enabling proper lifetime management when passed to Stream and Event creation. Changes: - Add _context to Device.__slots__ - Store Context in set_current() for both primary and explicit context paths - Simplify context property to return stored _context - Update create_event() to use self._context._h_context - Remove get_current_context import (no longer needed in _device.pyx) Add structural context dependency to owned streams StreamBox now holds ContextHandle to ensure context outlives the stream. This structural dependency is only for owned streams - borrowed streams delegate context lifetime management to their owners. 
C++ changes:
- StreamBox gains h_context member
- create_stream_handle(h_ctx, flags, priority) takes owning context
- create_stream_handle_ref(stream) - caller manages context
- create_stream_handle_with_owner(stream, owner) - Python owner manages context

Cython/Python changes:
- Stream._init() accepts optional ctx parameter
- Device.create_stream() passes self._context to Stream._init()
- Owned streams get context handle embedded in C++ handle
---
 .../experimental/_cpp/resource_handles.cpp   | 11 ++++--
 .../experimental/_cpp/resource_handles.hpp   |  7 +++-
 cuda_core/cuda/core/experimental/_device.pyx | 35 +++++++------------
 .../core/experimental/_resource_handles.pxd  |  9 +++--
 cuda_core/cuda/core/experimental/_stream.pyx | 13 +++++--
 5 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 076ff10810..f39fc10816 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -161,8 +161,9 @@ struct StreamBox {
   CUstream resource;
 };
 
-StreamHandle create_stream_handle(unsigned int flags, int priority) {
+StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
   // Creates an owning stream handle - calls cuStreamCreateWithPriority internally.
+  // The context handle is captured in the deleter to ensure context outlives the stream.
   // Returns empty handle on error (caller must check).
   CUstream stream;
   CUresult err;
@@ -174,10 +175,12 @@
     return StreamHandle();
   }
 
-  auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [](const StreamBox* b) {
+  // Capture h_ctx in lambda - shared_ptr control block keeps it alive
+  auto box = std::shared_ptr<StreamBox>(new StreamBox{stream}, [h_ctx](const StreamBox* b) {
     GILReleaseGuard gil;
     cuStreamDestroy(b->resource);
     delete b;
+    // h_ctx destructor runs here when last stream reference is released
   });
 
   // Use aliasing constructor to expose only CUstream
@@ -185,7 +188,8 @@
 }
 
 StreamHandle create_stream_handle_ref(CUstream stream) {
-  // Creates a non-owning handle - stream will NOT be destroyed
+  // Creates a non-owning handle - stream will NOT be destroyed.
+  // Caller is responsible for keeping the stream's context alive.
   auto box = std::shared_ptr<StreamBox>(new StreamBox{stream});
 
   // Use aliasing constructor to expose only CUstream
@@ -195,6 +199,7 @@
 
 StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
   // Creates a non-owning handle that prevents a Python owner from being GC'd.
   // The owner's refcount is incremented here and decremented when handle is released.
+  // The owner is responsible for keeping the stream's context alive.
Py_XINCREF(owner); auto box = std::shared_ptr(new StreamBox{stream}, [owner](const StreamBox* b) { diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 945ac0b2a8..06b04ba974 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -38,23 +38,28 @@ ContextHandle get_current_context() noexcept; // ============================================================================ // Create an owning stream handle by calling cuStreamCreateWithPriority. +// The stream structurally depends on the provided context handle. // When the last reference is released, cuStreamDestroy is called automatically. // Returns empty handle on error (caller must check). -StreamHandle create_stream_handle(unsigned int flags, int priority); +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority); // Create a non-owning stream handle (references existing stream). // Use for borrowed streams (from foreign code) or built-in streams. // The stream will NOT be destroyed when the handle is released. +// Caller is responsible for keeping the stream's context alive. StreamHandle create_stream_handle_ref(CUstream stream); // Create a non-owning stream handle that prevents a Python owner from being GC'd. // The owner's refcount is incremented; decremented when handle is released. +// The owner is responsible for keeping the stream's context alive. StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); // Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +// Note: Legacy stream has no specific context dependency. StreamHandle get_legacy_stream() noexcept; // Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +// Note: Per-thread stream has no specific context dependency. StreamHandle get_per_thread_stream() noexcept; // ============================================================================ diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index f2f2f72a72..7cf8e8dbbd 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -19,7 +19,6 @@ from cuda.core.experimental._resource_handles cimport ( ContextHandle, create_context_handle_ref, get_primary_context, - get_current_context, native, ) from cuda.core.experimental._graph import GraphBuilder @@ -943,7 +942,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid", "_context") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -962,16 +961,15 @@ class Device: # important: creating a Device instance does not initialize the GPU! 
cdef cydriver.CUdevice dev cdef cydriver.CUcontext ctx - cdef ContextHandle h_context if device_id is None: with nogil: err = cydriver.cuCtxGetDevice(&dev) if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - h_context = get_current_context() - assert h_context.get() == NULL - device_id = 0 # cudart behavior + # No context is current - verify and default to device 0 (cudart behavior) + assert cydriver.cuCtxGetCurrent(&ctx) == cydriver.CUresult.CUDA_SUCCESS and ctx == NULL + device_id = 0 else: HANDLE_RETURN(err) elif device_id < 0: @@ -992,6 +990,7 @@ class Device: device._has_inited = False device._properties = None device._uuid = None + device._context = None devices.append(device) try: @@ -1110,7 +1109,7 @@ class Device: @property def context(self) -> Context: - """Return the current :obj:`~_context.Context` associated with this device. + """Return the :obj:`~_context.Context` associated with this device. Note ---- @@ -1118,16 +1117,7 @@ class Device: """ self._check_context_initialized() - cdef ContextHandle h_context - cdef cydriver.CUdevice dev - h_context = get_current_context() - if h_context.get() == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != self._device_id: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return Context._from_handle(Context, h_context, self._device_id) + return self._context @property def memory_resource(self) -> MemoryResource: @@ -1237,6 +1227,7 @@ class Device: HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True + self._context = ctx # Store owning context reference if prev_ctx != NULL: return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) else: @@ -1247,6 +1238,7 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) self._has_inited = True + self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context def create_context(self, options: ContextOptions = None) -> Context: """Create a new :obj:`~_context.Context` object. @@ -1297,7 +1289,7 @@ class Device: """ self._check_context_initialized() - return Stream._init(obj=obj, options=options, device_id=self._device_id) + return Stream._init(obj=obj, options=options, device_id=self._device_id, ctx=self._context) def create_event(self, options: EventOptions | None = None) -> Event: """Create an Event object without recording it to a Stream. @@ -1318,11 +1310,8 @@ class Device: """ self._check_context_initialized() - cdef ContextHandle h_context - h_context = get_current_context() - if h_context.get() == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - return cyEvent._init(cyEvent, self._device_id, h_context, options, True) + cdef Context ctx = self._context + return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate device memory from a specified stream. 
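
Aside on the handle layout used above: StreamHandle(box, &box->resource) and
the analogous Event and Context handles rely on shared_ptr's aliasing
constructor, so callers see only the raw resource while the box (and anything
its deleter captured) stays alive. A minimal sketch with a stand-in Box type
and int payload, not the real StreamBox:

    #include <cassert>
    #include <memory>

    struct Box { int resource; };  // stands in for StreamBox/EventBox

    int main() {
        auto box = std::make_shared<Box>(Box{42});
        // Aliasing constructor: shares box's control block, points at the member.
        std::shared_ptr<const int> h(box, &box->resource);
        box.reset();               // the Box stays alive: h still owns it
        assert(*h == 42);          // the exposed "resource" is all callers see
    }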
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 0423ef0ec8..711b28ffcb 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -27,19 +27,22 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUstream] StreamHandle # Create an owning stream handle via cuStreamCreateWithPriority + # Context handle establishes structural dependency (context outlives stream) # Returns empty handle on error (caller must check) - StreamHandle create_stream_handle(unsigned int flags, int priority) nogil + StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) nogil # Create a non-owning stream handle (stream NOT destroyed when handle released) + # Caller is responsible for keeping the stream's context alive StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil # Create non-owning handle that prevents Python owner from being GC'd + # Owner is responsible for keeping the stream's context alive StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) - # Get non-owning handle to the legacy default stream + # Get non-owning handle to the legacy default stream (no context dependency) StreamHandle get_legacy_stream() nogil - # Get non-owning handle to the per-thread default stream + # Get non-owning handle to the per-thread default stream (no context dependency) StreamHandle get_per_thread_stream() nogil # ======================================================================== diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 3ba38095e4..d75e2bef0e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -117,16 +117,23 @@ cdef class Stream: return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): + def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, + ctx: Context = None): cdef StreamHandle h_stream cdef cydriver.CUstream borrowed + cdef ContextHandle h_context cdef Stream self + # Extract context handle if provided + if ctx is not None: + h_context = (ctx)._h_context + if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: # Borrowed stream from foreign object # C++ handle prevents owner from being GC'd until handle is released + # Owner is responsible for keeping the stream's context alive borrowed = _handle_from_stream_protocol(obj) h_stream = create_stream_handle_with_owner(borrowed, obj) return Stream._from_handle(cls, h_stream) @@ -149,8 +156,8 @@ cdef class Stream: else: prio = high - # C++ creates the stream and returns owning handle - h_stream = create_stream_handle(flags, prio) + # C++ creates the stream and returns owning handle with context dependency + h_stream = create_stream_handle(h_context, flags, prio) if not h_stream: raise RuntimeError("Failed to create CUDA stream") self = Stream._from_handle(cls, h_stream) From 6268b6e7ab7b0341f60e1c65e2dad5911719e8b0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 10 Dec 2025 22:08:44 -0800 Subject: [PATCH 13/38] Convert Event to use resource handles Event now uses EventHandle (shared_ptr) for RAII-based lifetime management, following the same pattern as Stream. 
C++ changes: - Add EventHandle type alias and EventBox struct - Add create_event_handle(h_ctx, flags) with context captured in deleter - Add create_event_handle_ipc(ipc_handle) for IPC events (no context dep) - Add native(), intptr(), py() overloads for EventHandle Cython changes: - Event._h_event replaces raw CUevent _handle - _init() uses create_event_handle() - from_ipc_descriptor() uses create_event_handle_ipc() - close() uses _h_event.reset() - Keep _h_context for cached fast access --- .../experimental/_cpp/resource_handles.cpp | 59 +++++++++++++++++++ .../experimental/_cpp/resource_handles.hpp | 38 ++++++++++++ cuda_core/cuda/core/experimental/_event.pxd | 6 +- cuda_core/cuda/core/experimental/_event.pyx | 56 ++++++++++-------- .../core/experimental/_resource_handles.pxd | 18 ++++++ cuda_core/cuda/core/experimental/_stream.pyx | 3 +- 6 files changed, 153 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index f39fc10816..a236176b95 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -230,4 +230,63 @@ StreamHandle get_per_thread_stream() noexcept { return handle; } +// ============================================================================ +// Event Handles +// ============================================================================ + +// Internal box structure for Event +struct EventBox { + CUevent resource; +}; + +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { + // Creates an owning event handle - calls cuEventCreate internally. + // The context handle is captured in the deleter to ensure context outlives the event. + // Returns empty handle on error (caller must check). + CUevent event; + CUresult err; + { + GILReleaseGuard gil; + err = cuEventCreate(&event, flags); + } + if (err != CUDA_SUCCESS) { + return EventHandle(); + } + + // Capture h_ctx in lambda - shared_ptr control block keeps it alive + auto box = std::shared_ptr(new EventBox{event}, [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + // h_ctx destructor runs here when last event reference is released + }); + + // Use aliasing constructor to expose only CUevent + return EventHandle(box, &box->resource); +} + +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + // Creates an owning event handle from an IPC handle. + // The originating process owns the event and its context. + // Returns empty handle on error (caller must check). 
+ CUevent event; + CUresult err; + { + GILReleaseGuard gil; + err = cuIpcOpenEventHandle(&event, ipc_handle); + } + if (err != CUDA_SUCCESS) { + return EventHandle(); + } + + auto box = std::shared_ptr(new EventBox{event}, [](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + }); + + // Use aliasing constructor to expose only CUevent + return EventHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 06b04ba974..44a8dd823a 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -17,6 +17,7 @@ namespace cuda_core { using ContextHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -62,6 +63,22 @@ StreamHandle get_legacy_stream() noexcept; // Note: Per-thread stream has no specific context dependency. StreamHandle get_per_thread_stream() noexcept; +// ============================================================================ +// Event handle functions +// ============================================================================ + +// Create an owning event handle by calling cuEventCreate. +// The event structurally depends on the provided context handle. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); + +// Create an owning event handle from an IPC handle. +// The originating process owns the event and its context. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -75,6 +92,10 @@ inline CUstream native(const StreamHandle& h) noexcept { return h ? *h : nullptr; } +inline CUevent native(const EventHandle& h) noexcept { + return h ? *h : nullptr; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -84,6 +105,10 @@ inline std::uintptr_t intptr(const StreamHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const EventHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -112,4 +137,17 @@ inline PyObject* py(const StreamHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const EventHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUevent"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast<std::uintptr_t>(*h) : 0;
+  return PyObject_CallFunction(cls, "K", val);
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd
index d92c9627c3..29317dde66 100644
--- a/cuda_core/cuda/core/experimental/_event.pxd
+++ b/cuda_core/cuda/core/experimental/_event.pxd
@@ -3,19 +3,19 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._resource_handles cimport ContextHandle
+from cuda.core.experimental._resource_handles cimport ContextHandle, EventHandle
 
 
 cdef class Event:
 
     cdef:
-        cydriver.CUevent _handle
+        EventHandle _h_event
+        ContextHandle _h_context  # Cached for fast access
         bint _timing_disabled
        bint _busy_waited
         bint _ipc_enabled
         object _ipc_descriptor
         int _device_id
-        ContextHandle _h_context
 
     @staticmethod
     cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free)
diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx
index 2ac284d8c9..763df94fe3 100644
--- a/cuda_core/cuda/core/experimental/_event.pyx
+++ b/cuda_core/cuda/core/experimental/_event.pyx
@@ -9,7 +9,15 @@ from libc.stdint cimport uintptr_t
 from libc.string cimport memcpy
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._context cimport Context
-from cuda.core.experimental._resource_handles cimport ContextHandle, intptr
+from cuda.core.experimental._resource_handles cimport (
+    ContextHandle,
+    EventHandle,
+    create_event_handle,
+    create_event_handle_ipc,
+    intptr,
+    native,
+    py,
+)
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN
@@ -81,8 +89,6 @@ cdef class Event:
     and they should instead be created through a :obj:`~_stream.Stream` object.
 
     """
-    def __cinit__(self):
-        self._handle = <cydriver.CUevent>(NULL)
 
     def __init__(self, *args, **kwargs):
         raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).")
@@ -111,23 +117,24 @@
             self._ipc_enabled = True
             if not self._timing_disabled:
                 raise TypeError("IPC-enabled events cannot use timing.")
-        with nogil:
-            HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags))
-        self._device_id = device_id
+        # C++ creates the event and returns owning handle with context dependency
+        cdef EventHandle h_event = create_event_handle(h_context, flags)
+        if not h_event:
+            raise RuntimeError("Failed to create CUDA event")
+        self._h_event = h_event
         self._h_context = h_context
+        self._device_id = device_id
         if opts.ipc_enabled:
             self.get_ipc_descriptor()
         return self
 
     cpdef close(self):
-        """Destroy the event."""
-        if self._handle != NULL:
-            with nogil:
-                HANDLE_RETURN(cydriver.cuEventDestroy(self._handle))
-            self._handle = <cydriver.CUevent>(NULL)
+        """Destroy the event.
 
-    def __dealloc__(self):
-        self.close()
+        Releases the event handle. The underlying CUDA event is destroyed
+        when the last reference is released.
+ """ + self._h_event.reset() def __isub__(self, other): return NotImplemented @@ -139,7 +146,7 @@ cdef class Event: # return self - other (in milliseconds) cdef float timing with nogil: - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + err = cydriver.cuEventElapsedTime(&timing, native((other)._h_event), native(self._h_event)) if err == 0: return timing else: @@ -165,14 +172,14 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((type(self), intptr(self._h_context), (self._handle))) + return hash((type(self), intptr(self._h_context), intptr(self._h_event))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. if not isinstance(other, Event): return NotImplemented cdef Event _other = other - return (self._handle) == (_other._handle) + return intptr(self._h_event) == intptr(_other._h_event) def get_ipc_descriptor(self) -> IPCEventDescriptor: """Export an event allocated for sharing between processes.""" @@ -182,7 +189,7 @@ cdef class Event: raise RuntimeError("Event is not IPC-enabled") cdef cydriver.CUipcEventHandle data with nogil: - HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, (self._handle))) + HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, native(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) return self._ipc_descriptor @@ -193,14 +200,17 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - with nogil: - HANDLE_RETURN(cydriver.cuIpcOpenEventHandle(&self._handle, data)) + # IPC events: the originating process owns the event and its context + cdef EventHandle h_event = create_event_handle_ipc(data) + if not h_event: + raise RuntimeError("Failed to open IPC event handle") + self._h_event = h_event + self._h_context = ContextHandle() self._timing_disabled = True self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor self._device_id = -1 - self._h_context = ContextHandle() return self @property @@ -229,13 +239,13 @@ cdef class Event: """ with nogil: - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuEventSynchronize(native(self._h_event))) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" with nogil: - result = cydriver.cuEventQuery(self._handle) + result = cydriver.cuEventQuery(native(self._h_event)) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: @@ -251,7 +261,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. 
""" - return driver.CUevent((self._handle)) + return py(self._h_event) @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 711b28ffcb..4c33a9f358 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -45,6 +45,21 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Get non-owning handle to the per-thread default stream (no context dependency) StreamHandle get_per_thread_stream() nogil + # ======================================================================== + # Event Handle + # ======================================================================== + ctypedef shared_ptr[const cydriver.CUevent] EventHandle + + # Create an owning event handle via cuEventCreate + # Context handle establishes structural dependency (context outlives event) + # Returns empty handle on error (caller must check) + EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil + + # Create an owning event handle from IPC handle + # The originating process owns the event and its context + # Returns empty handle on error (caller must check) + EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -52,11 +67,14 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # native() - extract the raw CUDA handle cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil + cydriver.CUevent native(EventHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil + uintptr_t intptr(EventHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) + object py(EventHandle h) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index d75e2bef0e..a3f9149d3e 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -27,6 +27,7 @@ from cuda.core.experimental._context cimport Context from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._resource_handles cimport ( ContextHandle, + EventHandle, StreamHandle, create_context_handle_ref, create_stream_handle, @@ -265,7 +266,7 @@ cdef class Stream: "new event by supplying options." 
 )
-        cdef cydriver.CUevent e = (<Event>(event))._handle
+        cdef cydriver.CUevent e = native((<Event>(event))._h_event)
         with nogil:
             HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream)))
         return event
From 1082f5a7302479d0a379222264f003e0479d43dd Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 11 Dec 2025 08:19:28 -0800
Subject: [PATCH 14/38] Clean up Stream.wait() to use EventHandle for
 temporary events

- Simplified branch structure: early return for Event, single path for Stream
- Use native() helper for handle access instead of casting via handle property
- Temporary events now use EventHandle with RAII cleanup (no explicit
  cuEventDestroy)
- Added create_event_handle import
---
 cuda_core/cuda/core/experimental/_stream.pyx | 47 +++++++++++---------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx
index a3f9149d3e..078497c066 100644
--- a/cuda_core/cuda/core/experimental/_stream.pyx
+++ b/cuda_core/cuda/core/experimental/_stream.pyx
@@ -30,6 +30,7 @@ from cuda.core.experimental._resource_handles cimport (
     EventHandle,
     StreamHandle,
     create_context_handle_ref,
+    create_event_handle,
     create_stream_handle,
     create_stream_handle_with_owner,
     get_current_context,
@@ -281,32 +282,36 @@ cdef class Stream:
         on the stream and then waiting on it.
 
         """
-        cdef cydriver.CUevent event
-        cdef cydriver.CUstream stream
+        cdef Stream stream
+        cdef EventHandle h_event
 
+        # Handle Event directly
         if isinstance(event_or_stream, Event):
-            event = <cydriver.CUevent><uintptr_t>(event_or_stream.handle)
             with nogil:
                 # TODO: support flags other than 0?
-                HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0))
+                HANDLE_RETURN(cydriver.cuStreamWaitEvent(
+                    native(self._h_stream), native((<Event>event_or_stream)._h_event), 0))
+            return
+
+        # Convert to Stream if needed
+        if isinstance(event_or_stream, Stream):
+            stream = event_or_stream
         else:
-            if isinstance(event_or_stream, Stream):
-                stream = <cydriver.CUstream><uintptr_t>(event_or_stream.handle)
-            else:
-                try:
-                    s = Stream._init(obj=event_or_stream)
-                except Exception as e:
-                    raise ValueError(
-                        "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
-                        f" got {type(event_or_stream)}"
-                    ) from e
-                stream = <cydriver.CUstream><uintptr_t>(s.handle)
-            with nogil:
-                HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
-                HANDLE_RETURN(cydriver.cuEventRecord(event, stream))
-                # TODO: support flags other than 0?
-                HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), event, 0))
-                HANDLE_RETURN(cydriver.cuEventDestroy(event))
+            try:
+                stream = Stream._init(obj=event_or_stream)
+            except Exception as e:
+                raise ValueError(
+                    "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
+                    f" got {type(event_or_stream)}"
+                ) from e
+
+        # Wait on stream via temporary event
+        Stream_ensure_ctx(self)
+        h_event = create_event_handle(self._h_context, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)
+        with nogil:
+            HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream)))
+            # TODO: support flags other than 0?
+ HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) @property def device(self) -> Device: From cd81f485681dc85a0c88af08f77de0606917f656 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 08:34:07 -0800 Subject: [PATCH 15/38] Add create_event_handle overload for temporary events - New overload takes only flags (no ContextHandle) for temporary events - Delegates to existing overload with empty ContextHandle - Updated _stream.pyx and _memoryview.pyx to use simpler overload - Removed unnecessary get_current_context import from _memoryview.pyx - Removed unnecessary Stream_ensure_ctx call from Stream.wait() --- .../experimental/_cpp/resource_handles.cpp | 7 +++++++ .../experimental/_cpp/resource_handles.hpp | 6 ++++++ .../cuda/core/experimental/_memoryview.pyx | 20 ++++++++++++++----- .../core/experimental/_resource_handles.pxd | 5 +++++ cuda_core/cuda/core/experimental/_stream.pyx | 3 +-- 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a236176b95..860aae4857 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -265,6 +265,13 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { return EventHandle(box, &box->resource); } +EventHandle create_event_handle(unsigned int flags) { + // Creates an owning event handle without context dependency. + // Use for temporary events that are created and destroyed in the same scope. + // Returns empty handle on error (caller must check). + return create_event_handle(ContextHandle{}, flags); +} + EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { // Creates an owning event handle from an IPC handle. // The originating process owns the event and its context. diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 44a8dd823a..eb6475f758 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -73,6 +73,12 @@ StreamHandle get_per_thread_stream() noexcept; // Returns empty handle on error (caller must check). EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); +// Create an owning event handle without context dependency. +// Use for temporary events that are created and destroyed in the same scope. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(unsigned int flags); + // Create an owning event handle from an IPC handle. // The originating process owns the event and its context. // When the last reference is released, cuEventDestroy is called automatically. 
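+// Returns empty handle on error (caller must check).
+EventHandle create_event_handle(unsigned int flags);
+
 // Create an owning event handle from an IPC handle.
 // The originating process owns the event and its context.
 // When the last reference is released, cuEventDestroy is called automatically.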
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9e13ebea45..718736e5cf 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -13,7 +13,15 @@ from typing import Optional import numpy +from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ( + EventHandle, + create_event_handle, + native, +) from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core.experimental._utils cimport cuda_utils +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from cuda.core.experimental._memory import Buffer @@ -579,6 +587,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.ptr)) cdef intptr_t producer_s, consumer_s + cdef EventHandle h_event stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") @@ -588,11 +597,12 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - e = handle_return(driver.cuEventCreate( - driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(e, producer_s)) - handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) - handle_return(driver.cuEventDestroy(e)) + h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + with nogil: + HANDLE_RETURN(cydriver.cuEventRecord( + native(h_event), producer_s)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + consumer_s, native(h_event), 0)) return buf diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 4c33a9f358..7c10599f8d 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -55,6 +55,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil + # Create an owning event handle without context dependency + # Use for temporary events that are created and destroyed in the same scope + # Returns empty handle on error (caller must check) + EventHandle create_event_handle(unsigned int flags) nogil + # Create an owning event handle from IPC handle # The originating process owns the event and its context # Returns empty handle on error (caller must check) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 078497c066..4a16399323 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -306,8 +306,7 @@ cdef class Stream: ) from e # Wait on stream via temporary event - Stream_ensure_ctx(self) - h_event = create_event_handle(self._h_context, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? 
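Taken together with the previous commit, a cross-stream wait now reduces to a
small RAII pattern. A rough C++ equivalent of the temporary-event path
(assuming `producer` and `consumer` are existing CUstreams; error handling
elided):

    // Order `consumer` behind work already submitted to `producer`.
    EventHandle h_event = create_event_handle(CU_EVENT_DISABLE_TIMING);
    cuEventRecord(native(h_event), producer);
    cuStreamWaitEvent(consumer, native(h_event), 0);
    // cuEventDestroy runs when h_event drops its last reference.
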
From 2b798f27a2918b5d1898ecf392ccd78d8bdf3c7c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 09:58:09 -0800 Subject: [PATCH 16/38] Convert DeviceMemoryResource to use MemoryPoolHandle C++ layer (resource_handles.hpp/cpp): - Add MemoryPoolHandle = std::shared_ptr - Add create_mempool_handle(props) - owning, calls cuMemPoolDestroy on release - Add create_mempool_handle_ref(pool) - non-owning reference - Add create_mempool_handle_ipc(fd, handle_type) - owning from IPC import - Add get_device_mempool(device_id) - get current pool for device (non-owning) - Add native(), intptr(), py() overloads for MemoryPoolHandle Cython layer: - Update _resource_handles.pxd with new types and functions - Update _device_memory_resource.pxd: replace raw handle with MemoryPoolHandle - Reorder members: _h_pool first (matches Stream/Event pattern) - Update _device_memory_resource.pyx to use new handle functions - Update _ipc.pyx to use create_mempool_handle_ipc for IPC imports - DMR_close now uses RAII (_h_pool.reset()) instead of explicit cuMemPoolDestroy - Consistent member initialization order across __cinit__, init functions, and close --- .../experimental/_cpp/resource_handles.cpp | 90 +++++++++++++++++++ .../experimental/_cpp/resource_handles.hpp | 47 ++++++++++ .../_memory/_device_memory_resource.pxd | 15 ++-- .../_memory/_device_memory_resource.pyx | 77 ++++++++-------- .../cuda/core/experimental/_memory/_ipc.pyx | 24 ++--- .../core/experimental/_resource_handles.pxd | 26 ++++++ 6 files changed, 221 insertions(+), 58 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 860aae4857..8935a358e1 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -296,4 +296,94 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { return EventHandle(box, &box->resource); } +// ============================================================================ +// Memory Pool Handles +// ============================================================================ + +// Internal box structure for MemoryPool +struct MemoryPoolBox { + CUmemoryPool resource; +}; + +// Helper to clear peer access before destroying a memory pool. +// Works around nvbug 5698116: recycled pool handles inherit peer access state. +static void clear_mempool_peer_access(CUmemoryPool pool) { + int device_count = 0; + if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + return; + } + + std::vector clear_access(device_count); + for (int i = 0; i < device_count; ++i) { + clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + clear_access[i].location.id = i; + clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + } + + // Ignore errors - best effort cleanup + cuMemPoolSetAccess(pool, clear_access.data(), device_count); +} + +// Helper to wrap a raw pool in an owning handle. +// The deleter clears peer access (nvbug 5698116 workaround) and destroys the pool. 
+static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { + auto box = std::shared_ptr(new MemoryPoolBox{pool}, [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + cuMemPoolDestroy(b->resource); + delete b; + }); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + // Creates an owning memory pool handle - calls cuMemPoolCreate internally. + // Memory pools are device-scoped (not context-scoped). + // Returns empty handle on error (caller must check). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolCreate(&pool, &props); + } + return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); +} + +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { + // Creates a non-owning handle - pool will NOT be destroyed. + // Use for device default/current pools managed by the driver. + auto box = std::shared_ptr(new MemoryPoolBox{pool}); + + // Use aliasing constructor to expose only CUmemoryPool + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle get_device_mempool(int device_id) noexcept { + // Get the current memory pool for a device. + // Returns a non-owning handle (pool managed by driver). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuDeviceGetMemPool(&pool, device_id); + } + if (err != CUDA_SUCCESS) { + return MemoryPoolHandle(); + } + return create_mempool_handle_ref(pool); +} + +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + // Creates an owning memory pool handle from an IPC import. + // The file descriptor is NOT owned by this handle. + // Returns empty handle on error (caller must check). + CUmemoryPool pool; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolImportFromShareableHandle(&pool, reinterpret_cast(static_cast(fd)), handle_type, 0); + } + return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index eb6475f758..83a68c8b40 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -18,6 +18,7 @@ namespace cuda_core { using ContextHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -85,6 +86,31 @@ EventHandle create_event_handle(unsigned int flags); // Returns empty handle on error (caller must check). EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); +// ============================================================================ +// Memory pool handle functions +// ============================================================================ + +// Create an owning memory pool handle by calling cuMemPoolCreate. +// Memory pools are device-scoped (not context-scoped). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props); + +// Create a non-owning memory pool handle (references existing pool). +// Use for device default/current pools that are managed by the driver. 
+// The pool will NOT be destroyed when the handle is released. +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool); + +// Get non-owning handle to the current memory pool for a device. +// Returns empty handle on error (caller must check). +MemoryPoolHandle get_device_mempool(int device_id) noexcept; + +// Create an owning memory pool handle from an IPC import. +// The file descriptor is NOT owned by this handle (caller manages FD separately). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -102,6 +128,10 @@ inline CUevent native(const EventHandle& h) noexcept { return h ? *h : nullptr; } +inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { + return h ? *h : nullptr; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -115,6 +145,10 @@ inline std::uintptr_t intptr(const EventHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -156,4 +190,17 @@ inline PyObject* py(const EventHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const MemoryPoolHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUmemoryPool"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd index d31ff7b2e1..9b5c384d39 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd @@ -5,17 +5,18 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core.experimental._resource_handles cimport MemoryPoolHandle cdef class DeviceMemoryResource(MemoryResource): cdef: - int _device_id - cydriver.CUmemoryPool _handle - bint _mempool_owned - IPCDataForMR _ipc_data - object _attributes - object _peer_accessible_by - object __weakref__ + MemoryPoolHandle _h_pool + int _device_id + bint _pool_owned + IPCDataForMR _ipc_data + object _attributes + object _peer_accessible_by + object __weakref__ cpdef DMR_mempool_get_access(DeviceMemoryResource, int) diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index d06f0b8297..b009408a43 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -13,7 +13,13 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR -from cuda.core.experimental._resource_handles cimport native +from cuda.core.experimental._resource_handles cimport ( + MemoryPoolHandle, + create_mempool_handle, + get_device_mempool, + native, + py, +) from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, @@ -77,7 +83,7 @@ cdef class DeviceMemoryResourceAttributes: cdef DeviceMemoryResource mr = (self._mr_weakref()) if mr is None: raise RuntimeError("DeviceMemoryResource is expired") - cdef cydriver.CUmemoryPool pool_handle = mr._handle + cdef cydriver.CUmemoryPool pool_handle = native(mr._h_pool) with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) return 0 @@ -219,9 +225,9 @@ cdef class DeviceMemoryResource(MemoryResource): """ def __cinit__(self): + # _h_pool is default-initialized (empty shared_ptr) by C++ self._device_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._mempool_owned = False + self._pool_owned = False self._ipc_data = None self._attributes = None self._peer_accessible_by = () @@ -239,9 +245,6 @@ cdef class DeviceMemoryResource(MemoryResource): else: DMR_init_create(self, c_device_id, opts) - def __dealloc__(self): - DMR_close(self) - def close(self): """ Close the device memory resource and destroy the associated memory pool @@ -371,7 +374,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._handle)) + return py(self._h_pool) @property def is_device_accessible(self) -> bool: @@ -381,7 +384,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def is_handle_owned(self) -> bool: """Whether the memory resource handle is owned. 
If False, ``close`` has no effect.""" - return self._mempool_owned + return self._pool_owned @property def is_host_accessible(self) -> bool: @@ -469,7 +472,7 @@ cdef class DeviceMemoryResource(MemoryResource): i += 1 with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count)) + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(native(self._h_pool), access_desc, count)) finally: if access_desc != NULL: free(access_desc) @@ -485,19 +488,20 @@ cdef void DMR_init_current(DeviceMemoryResource self, int device_id): cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + self._h_pool = get_device_mempool(device_id) + if not self._h_pool: + raise RuntimeError("Failed to get device memory pool") self._device_id = device_id - self._mempool_owned = False + self._pool_owned = False with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), device_id)) - # Set a higher release threshold to improve performance when there are # no active allocations. By default, the release threshold is 0, which # means memory is immediately released back to the OS when there are no # active suballocations, causing performance issues. HANDLE_RETURN( cydriver.cuMemPoolGetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold ) @@ -506,7 +510,7 @@ cdef void DMR_init_current(DeviceMemoryResource self, int device_id): # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) @@ -530,16 +534,15 @@ cdef void DMR_init_create( properties.win32SecurityAttributes = NULL properties.usage = 0 + self._h_pool = create_mempool_handle(properties) + if not self._h_pool: + raise RuntimeError("Failed to create memory pool") self._device_id = device_id - self._mempool_owned = True - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) - # TODO: should we also set the threshold here? - + self._pool_owned = True if opts.ipc_enabled: alloc_handle = _ipc.DMR_export_mempool(self) self._ipc_data = IPCDataForMR(alloc_handle, False) + # TODO: should we also set the threshold here? # Raise an exception if the given stream is capturing. @@ -554,10 +557,11 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef cydriver.CUmemoryPool pool = native(self._h_pool) cdef cydriver.CUdeviceptr devptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, pool, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None @@ -580,25 +584,18 @@ cdef inline void DMR_deallocate( cdef inline DMR_close(DeviceMemoryResource self): - if self._handle == NULL: + if not self._h_pool: return - # This works around nvbug 5698116. When a memory pool handle is recycled - # the new handle inherits the peer access state of the previous handle. 
- if self._peer_accessible_by: - self.peer_accessible_by = [] - - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) - finally: - self._device_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._attributes = None - self._mempool_owned = False - self._ipc_data = None - self._peer_accessible_by = () + # Reset members in declaration order. + # The RAII deleter handles nvbug 5698116 workaround (clears peer access) + # and calls cuMemPoolDestroy if this is an owning handle. + self._h_pool.reset() + self._device_id = cydriver.CU_DEVICE_INVALID + self._pool_owned = False + self._ipc_data = None + self._attributes = None + self._peer_accessible_by = () # Note: this is referenced in instructions to debug nvbug 5698116. @@ -626,7 +623,7 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): location.id = c_device_id with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, native(dmr._h_pool), &location)) if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: return "rw" diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 7c5a9b0409..4eb062dda0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -8,6 +8,11 @@ from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._resource_handles cimport ( + MemoryPoolHandle, + create_mempool_handle_ipc, + native, +) from cuda.core.experimental._stream cimport default_stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method @@ -185,7 +190,7 @@ cdef Buffer Buffer_from_ipc_descriptor( ) cdef cydriver.CUdeviceptr ptr with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, native(mr._h_pool), &data)) return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) @@ -209,20 +214,17 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl os.close(fd) raise - # Construct a new DMR. + # Construct a new DMR (set members in declaration order). cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) + cdef int ipc_fd = int(alloc_handle) + self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE) + if not self._h_pool: + raise RuntimeError("Failed to import memory pool from IPC handle") from .._device import Device self._device_id = Device(device_id).device_id - self._mempool_owned = True + self._pool_owned = True self._ipc_data = IPCDataForMR(alloc_handle, True) - # Map the mempool into this process. - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) - ) - # Register it. 
if uuid is not None: registered = self.register(uuid) @@ -253,7 +255,7 @@ cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): cdef int fd with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &fd, self._handle, IPC_HANDLE_TYPE, 0) + &fd, native(self._h_pool), IPC_HANDLE_TYPE, 0) ) try: return IPCAllocationHandle._init(fd, uuid.uuid4()) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 7c10599f8d..6bb172e64d 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -65,6 +65,29 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil + # ======================================================================== + # Memory Pool Handle + # ======================================================================== + ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle + + # Create an owning memory pool handle via cuMemPoolCreate + # Memory pools are device-scoped (not context-scoped) + # Returns empty handle on error (caller must check) + MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) nogil + + # Create a non-owning memory pool handle (pool NOT destroyed when released) + # Use for device default/current pools managed by the driver + MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) nogil + + # Get non-owning handle to the current memory pool for a device + # Returns empty handle on error (caller must check) + MemoryPoolHandle get_device_mempool(int device_id) nogil + + # Create an owning memory pool handle from IPC import + # File descriptor NOT owned by this handle (caller manages FD separately) + # Returns empty handle on error (caller must check) + MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -73,13 +96,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil + cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil uintptr_t intptr(EventHandle h) nogil + uintptr_t intptr(MemoryPoolHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) + object py(MemoryPoolHandle h) From 63d263dc17f41c0eb605a26bee140502a29d5190 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 12:52:20 -0800 Subject: [PATCH 17/38] Add DevicePtrHandle for RAII device pointer management Introduce DevicePtrHandle (std::shared_ptr) to manage device pointer lifetimes with automatic deallocation. 
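In outline, each pointer is boxed together with a mutable deallocation stream
that the deleter consults at release time; a minimal sketch using the names
from this patch (error handling and the GIL guard elided):

    struct DevicePtrBox {
        CUdeviceptr resource;
        mutable StreamHandle h_stream;  // consulted by the deleter at release
    };

    // Deleter used for pool allocations: capturing h_pool keeps the pool
    // alive; a NULL stream makes cuMemFreeAsync use the legacy default stream.
    auto deleter = [h_pool](DevicePtrBox* b) {
        cuMemFreeAsync(b->resource, native(b->h_stream));
        delete b;
    };
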
Key features: - Allocation functions: deviceptr_alloc_from_pool, deviceptr_alloc_async, deviceptr_alloc, deviceptr_alloc_host, deviceptr_create_ref - IPC import via deviceptr_import_ipc with error output parameter - Deallocation stream stored in mutable DevicePtrBox, accessible via deallocation_stream() and set_deallocation_stream() - cuMemFreeAsync used for deallocation (NULL stream = legacy default) - Buffer class updated to use DevicePtrHandle instead of raw pointers - Buffer.handle returns integer for backward compatibility with ctypes - IPCBufferDescriptor.payload_ptr() helper to simplify casting Note: IPC-imported pointers do not yet implement reference counting workaround for nvbug 5570902. --- .../experimental/_cpp/resource_handles.cpp | 170 +++++++++++ .../experimental/_cpp/resource_handles.hpp | 73 +++++ .../core/experimental/_memory/_buffer.pxd | 23 +- .../core/experimental/_memory/_buffer.pyx | 266 ++++++------------ .../_memory/_device_memory_resource.pyx | 18 +- .../_memory/_graph_memory_resource.pyx | 21 +- .../cuda/core/experimental/_memory/_ipc.pxd | 2 + .../cuda/core/experimental/_memory/_ipc.pyx | 29 +- .../core/experimental/_resource_handles.pxd | 47 ++++ 9 files changed, 426 insertions(+), 223 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 8935a358e1..b5ccfff105 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -386,4 +386,174 @@ MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType han return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); } +// ============================================================================ +// Device Pointer Handles +// ============================================================================ + +// Internal box structure for DevicePtr. +// The h_stream is mutable to allow updating the deallocation stream after creation. +struct DevicePtrBox { + CUdeviceptr resource; + mutable StreamHandle h_stream; +}; + +// Internal helper to retrieve the box from a handle (for deallocation_stream access). +static DevicePtrBox* get_box(const DevicePtrHandle& h) { + const CUdeviceptr* p = h.get(); + return reinterpret_cast( + reinterpret_cast(const_cast(p)) + - offsetof(DevicePtrBox, resource) + ); +} + +StreamHandle deallocation_stream(const DevicePtrHandle& h) { + return get_box(h)->h_stream; +} + +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { + get_box(h)->h_stream = std::move(h_stream); +} + +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) +{ + // Allocate from pool asynchronously. + // Pool handle is captured in deleter to keep pool alive. 
+ CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + // h_pool destructor runs here, releasing pool reference + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { + // Allocate asynchronously (not from a specific pool). + CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocAsync(&ptr, size, native(h_stream)); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc(size_t size) { + // Allocate synchronously. + CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAlloc(&ptr, size); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + cuMemFree(b->resource); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_host(size_t size) { + // Allocate pinned host memory. + void* ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemAllocHost(&ptr, size); + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + cuMemFreeHost(reinterpret_cast(b->resource)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { + // Non-owning reference - pointer will NOT be freed. + auto box = std::shared_ptr(new DevicePtrBox{ptr, StreamHandle{}}); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + CUresult* error_out) +{ + // Import pointer from IPC. + // Note: Does not implement reference counting workaround for nvbug 5570902 yet. 
+ CUdeviceptr ptr; + CUresult err; + { + GILReleaseGuard gil; + err = cuMemPoolImportPointer(&ptr, *h_pool, + const_cast( + reinterpret_cast(export_data))); + } + if (error_out) { + *error_out = err; + } + if (err != CUDA_SUCCESS) { + return DevicePtrHandle(); + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + // cuMemFreeAsync accepts NULL stream (uses legacy default stream) + cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + // h_pool destructor runs here + } + ); + return DevicePtrHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 83a68c8b40..5ce6671fa9 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -111,6 +111,58 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept; // Returns empty handle on error (caller must check). MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); +// ============================================================================ +// Device pointer handle functions +// ============================================================================ + +using DevicePtrHandle = std::shared_ptr; + +// Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync. +// The pointer structurally depends on the provided pool handle (captured in deleter). +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream); + +// Allocate device memory asynchronously via cuMemAllocAsync. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream); + +// Allocate device memory synchronously via cuMemAlloc. +// When the last reference is released, cuMemFree is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc(size_t size); + +// Allocate pinned host memory via cuMemAllocHost. +// When the last reference is released, cuMemFreeHost is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_host(size_t size); + +// Create a non-owning device pointer handle (references existing pointer). +// Use for foreign pointers (e.g., from external libraries). +// The pointer will NOT be freed when the handle is released. +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); + +// Import a device pointer from IPC via cuMemPoolImportPointer. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Note: Does not yet implement reference counting for nvbug 5570902. +// Error code is written to error_out (caller must check). +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + CUresult* error_out); + +// Access the deallocation stream for a device pointer handle (read-only). +// For non-owning handles, the stream is not used but can still be accessed. +StreamHandle deallocation_stream(const DevicePtrHandle& h); + +// Set the deallocation stream for a device pointer handle. 
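+// The stream set here is the one cuMemFreeAsync uses when the owning
+// handle releases its last reference.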
+void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -132,6 +184,10 @@ inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { return h ? *h : nullptr; } +inline CUdeviceptr native(const DevicePtrHandle& h) noexcept { + return h ? *h : 0; +} + // intptr() - extract handle as uintptr_t for Python interop inline std::uintptr_t intptr(const ContextHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); @@ -149,6 +205,10 @@ inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept { return reinterpret_cast(h ? *h : nullptr); } +inline std::uintptr_t intptr(const DevicePtrHandle& h) noexcept { + return h ? static_cast(*h) : 0; +} + // py() - convert handle to Python driver wrapper object // Returns new reference. Caller must hold GIL. inline PyObject* py(const ContextHandle& h) { @@ -203,4 +263,17 @@ inline PyObject* py(const MemoryPoolHandle& h) { return PyObject_CallFunction(cls, "K", val); } +inline PyObject* py(const DevicePtrHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUdeviceptr"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? static_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index b581dcd293..81653dafd5 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.core.experimental._resource_handles cimport DevicePtrHandle from cuda.core.experimental._stream cimport Stream @@ -15,16 +16,20 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - uintptr_t _ptr - size_t _size - MemoryResource _memory_resource - object _ipc_data - object _owner - object _ptr_obj - Stream _alloc_stream - _MemAttrs _mem_attrs - bint _mem_attrs_inited + DevicePtrHandle _h_ptr + size_t _size + MemoryResource _memory_resource + object _ipc_data cdef class MemoryResource: pass + + +# Helper function to create a Buffer from a DevicePtrHandle +cdef Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = * +) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index b26471ed0e..c7ab15ae95 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -11,6 +11,15 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer from cuda.core.experimental._memory cimport _ipc +from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, + StreamHandle, + deviceptr_create_ref, + intptr, + native, + py, + set_deallocation_stream, +) from cuda.core.experimental._stream cimport Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -43,39 +52,39 @@ cdef class Buffer: self._clear() def 
_clear(self): - self._ptr = 0 + # _h_ptr is default-initialized (empty shared_ptr) by C++ self._size = 0 self._memory_resource = None self._ipc_data = None - self._ptr_obj = None - self._alloc_stream = None - self._owner = None - self._mem_attrs_inited = False def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. " "Please use MemoryResource APIs.") + # Note: _init_from_handle is a cdef inline function, not a method + # See Buffer_init_from_handle below + @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, owner : object | None = None ): + """Legacy init for compatibility - creates a non-owning ref handle. + + Note: The stream parameter is accepted for API compatibility but is + ignored since non-owning refs are never freed by the handle. + """ cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr + self._h_ptr = deviceptr_create_ref((int(ptr))) self._size = size if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None - self._alloc_stream = (stream) if stream is not None else None - self._owner = owner return self - def __dealloc__(self): - self.close(self._alloc_stream) + # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor def __reduce__(self): # Must not serialize the parent's stream! @@ -96,13 +105,14 @@ cdef class Buffer: Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional Memory resource associated with the buffer - owner : object, optional - An object holding external allocation that the ``ptr`` points to. - The reference is kept as long as the buffer is alive. - The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + This creates a non-owning reference. The pointer will NOT be freed + when the Buffer is closed or garbage collected. """ - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr, owner=owner) + cdef DevicePtrHandle h_ptr = deviceptr_create_ref((int(ptr))) + return Buffer_from_deviceptr_handle(h_ptr, size, mr) @classmethod def from_ipc_descriptor( @@ -128,7 +138,7 @@ cdef class Buffer: ---------- stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. + the deallocation stream stored in the handle is used. 
""" Buffer_close(self, stream) @@ -163,14 +173,8 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle - with nogil: - HANDLE_RETURN(cydriver.cuMemcpyAsync( - dst._ptr, - self._ptr, - src_size, - s - )) + err, = driver.cuMemcpyAsync(native(dst._h_ptr), native(self._h_ptr), src_size, stream.handle) + raise_if_driver_error(err) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -194,14 +198,8 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle - with nogil: - HANDLE_RETURN(cydriver.cuMemcpyAsync( - self._ptr, - src._ptr, - dst_size, - s - )) + err, = driver.cuMemcpyAsync(native(self._h_ptr), native(src._h_ptr), dst_size, stream.handle) + raise_if_driver_error(err) def fill(self, value: int, width: int, *, stream: Stream | GraphBuilder): """Fill this buffer with a value pattern asynchronously on the given stream. @@ -222,42 +220,33 @@ cdef class Buffer: or if buffer size is not divisible by width """ - cdef Stream s_stream = Stream_accept(stream) - cdef unsigned char c_value8 - cdef unsigned short c_value16 - cdef unsigned int c_value32 - cdef size_t N + stream = Stream_accept(stream) # Validate width if width not in (1, 2, 4): raise ValueError(f"width must be 1, 2, or 4, got {width}") # Validate buffer size modulus. - cdef size_t buffer_size = self._size + buffer_size = self._size if buffer_size % width != 0: raise ValueError(f"buffer size ({buffer_size}) must be divisible by width ({width})") # Map width (bytes) to bitwidth and validate value - cdef int bitwidth = width * 8 + bitwidth = width * 8 _validate_value_against_bitwidth(bitwidth, value, is_signed=False) # Validate value fits in width and perform fill - cdef cydriver.CUstream s = s_stream._handle + ptr = native(self._h_ptr) if width == 1: - c_value8 = value N = buffer_size - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(self._ptr, c_value8, N, s)) + err, = driver.cuMemsetD8Async(ptr, value, N, stream.handle) elif width == 2: - c_value16 = value N = buffer_size // 2 - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD16Async(self._ptr, c_value16, N, s)) + err, = driver.cuMemsetD16Async(ptr, value, N, stream.handle) else: # width == 4 - c_value32 = value N = buffer_size // 4 - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD32Async(self._ptr, c_value32, N, s)) + err, = driver.cuMemsetD32Async(ptr, value, N, stream.handle) + raise_if_driver_error(err) def __dlpack__( self, @@ -310,9 +299,7 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.device_id + raise NotImplementedError("device_id requires a memory resource") @property def handle(self) -> DevicePointerT: @@ -323,31 +310,23 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. 
""" - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 + # Return raw integer for compatibility with ctypes and other tools + # that expect a raw pointer value + return intptr(self._h_ptr) @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_device_accessible + raise NotImplementedError("is_device_accessible requires a memory resource") @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_host_accessible + raise NotImplementedError("is_host_accessible requires a memory resource") @property def is_mapped(self) -> bool: @@ -365,92 +344,6 @@ cdef class Buffer: """Return the memory size of this buffer.""" return self._size - @property - def owner(self) -> object: - """Return the object holding external allocation.""" - return self._owner - - -# Buffer Implementation -# --------------------- -cdef inline void Buffer_close(Buffer self, stream): - cdef Stream s - if self._ptr: - if self._memory_resource is not None: - s = Stream_accept(stream) if stream is not None else self._alloc_stream - self._memory_resource.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._memory_resource = None - self._owner = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef Buffer_init_mem_attrs(Buffer self): - if not self._mem_attrs_inited: - query_memory_attrs(self._mem_attrs, self._ptr) - self._mem_attrs_inited = True - - -cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: - cdef unsigned int memory_type = 0 - cdef int is_managed = 0 - cdef int device_id = 0 - _query_memory_attrs(memory_type, is_managed, device_id, ptr) - - if memory_type == 0: - # unregistered host pointer - out.is_host_accessible = True - out.is_device_accessible = False - out.device_id = -1 - # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE, - # so we need to check it first not to falsely claim it is not - # host accessible. - elif ( - is_managed - or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST - ): - # For pinned memory allocated with cudaMallocHost or paged-locked - # with cudaHostRegister, the memory_type is - # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST. - # TODO(ktokarski): In some cases, the registered memory requires - # using different ptr for device and host, we could check - # cuMemHostGetDevicePointer and - # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM - # to double check the device accessibility. 
- out.is_host_accessible = True - out.is_device_accessible = True - out.device_id = device_id - elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: - out.is_host_accessible = False - out.is_device_accessible = True - out.device_id = device_id - else: - raise ValueError(f"Unsupported memory type: {memory_type}") - return 0 - - -cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil: - cdef cydriver.CUpointer_attribute attrs[3] - cdef uintptr_t vals[3] - attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED - attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - vals[0] = &memory_type - vals[1] = &is_managed - vals[2] = &device_id - - cdef cydriver.CUresult ret - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: - with cython.gil: - # Device class handles the cuInit call internally - Device() - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - HANDLE_RETURN(ret) - return 0 - - cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -502,39 +395,50 @@ cdef class MemoryResource: ... -# Helper Functions -# ---------------- -cdef void _validate_value_against_bitwidth(int bitwidth, int64_t value, bint is_signed=False) except *: - """Validate that a value fits within the representable range for a given bitwidth. - - Parameters - ---------- - bitwidth : int - Number of bits (e.g., 8, 16, 32) - value : int64_t - Value to validate - is_signed : bool, optional - Whether the value is signed (default: False) - - Raises - ------ - ValueError - If value is outside the representable range for the bitwidth - """ - cdef int max_bits = bitwidth +# Buffer Implementation Helpers +# ----------------------------- +cdef inline Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = None +): + """Create a Buffer from an existing DevicePtrHandle.""" + cdef Buffer buf = Buffer.__new__(Buffer) + buf._h_ptr = h_ptr + buf._size = size + buf._memory_resource = mr + buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + return buf + + +cdef inline void Buffer_close(Buffer self, object stream): + """Close a buffer, freeing its memory.""" + cdef Stream s + if not self._h_ptr: + return + # Update deallocation stream if provided + if stream is not None: + s = Stream_accept(stream) + set_deallocation_stream(self._h_ptr, s._h_stream) + # Reset handle - RAII deleter will free the memory + self._h_ptr.reset() + self._size = 0 + self._memory_resource = None + self._ipc_data = None + + +def _validate_value_against_bitwidth(bitwidth, value, is_signed=False): + """Validate that a value fits within the representable range for a given bitwidth.""" + max_bits = bitwidth assert max_bits < 64, f"bitwidth ({max_bits}) must be less than 64" - cdef int64_t min_value - cdef uint64_t max_value_unsigned - cdef int64_t max_value - if is_signed: - min_value = -(1 << (max_bits - 1)) - max_value = (1 << (max_bits - 1)) - 1 + min_value = -(1 << (max_bits - 1)) + max_value = (1 << (max_bits - 1)) - 1 else: min_value = 0 - max_value_unsigned = (1 << max_bits) - 1 - max_value = max_value_unsigned + max_value = (1 << max_bits) - 1 if not min_value <= value <= max_value: raise ValueError( 
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index b009408a43..2a3e5c2dfe 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -10,12 +10,14 @@ from libc.stdlib cimport malloc, free from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, MemoryPoolHandle, create_mempool_handle, + deviceptr_alloc_from_pool, get_device_mempool, native, py, @@ -557,18 +559,12 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) - cdef cydriver.CUmemoryPool pool = native(self._h_pool) - cdef cydriver.CUdeviceptr devptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, pool, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + cdef DevicePtrHandle h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory from pool") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void DMR_deallocate( diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx index 5ad9d86c53..981c2830dd 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx @@ -7,8 +7,12 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._resource_handles cimport native +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource +from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, + deviceptr_alloc_async, + native, +) from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN @@ -188,17 +192,12 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) - cdef cydriver.CUdeviceptr devptr with nogil: check_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + cdef DevicePtrHandle h_ptr = deviceptr_alloc_async(size, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory asynchronously") + return Buffer_from_deviceptr_handle(h_ptr, size, self, 
None) cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 60d96a3b33..5505e92381 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -41,6 +41,8 @@ cdef class IPCBufferDescriptor: bytes _payload size_t _size + cdef const void* payload_ptr(self) noexcept + cdef class IPCAllocationHandle: cdef: diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 4eb062dda0..f0bdc22216 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -7,10 +7,13 @@ from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle +from cuda.core.experimental._stream cimport Stream from cuda.core.experimental._resource_handles cimport ( + DevicePtrHandle, MemoryPoolHandle, create_mempool_handle_ipc, + deviceptr_import_ipc, native, ) from cuda.core.experimental._stream cimport default_stream @@ -92,6 +95,10 @@ cdef class IPCBufferDescriptor: def size(self): return self._size + cdef const void* payload_ptr(self) noexcept: + """Return the payload as a const void* for C API calls.""" + return (self._payload) + cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" @@ -166,7 +173,7 @@ cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): cdef cydriver.CUmemPoolPtrExportData data with nogil: HANDLE_RETURN( - cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + cydriver.cuMemPoolExportPointer(&data, native(self._h_ptr)) ) cdef bytes data_b = cpython.PyBytes_FromStringAndSize( (data.reserved), sizeof(data.reserved) @@ -182,16 +189,16 @@ cdef Buffer Buffer_from_ipc_descriptor( if stream is None: # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy( - data.reserved, - (ipc_descriptor._payload), - sizeof(data.reserved) + cdef Stream s = stream + cdef cydriver.CUresult err + cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( + mr._h_pool, + ipc_descriptor.payload_ptr(), + s._h_stream, + &err ) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, native(mr._h_pool), &data)) - return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) + HANDLE_RETURN(err) + return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) # DeviceMemoryResource IPC Implementation diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 6bb172e64d..416dd8bd5c 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -88,6 +88,50 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Returns empty handle on error (caller must check) MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + # ======================================================================== + # Device Pointer Handle + # ======================================================================== + ctypedef shared_ptr[const 
cydriver.CUdeviceptr] DevicePtrHandle + + # Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync + # Pool handle is captured in deleter to keep pool alive + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + + # Allocate device memory asynchronously via cuMemAllocAsync + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) nogil + + # Allocate device memory synchronously via cuMemAlloc + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc(size_t size) nogil + + # Allocate pinned host memory via cuMemAllocHost + # Returns empty handle on error (caller must check) + DevicePtrHandle deviceptr_alloc_host(size_t size) nogil + + # Create a non-owning device pointer handle (pointer NOT freed when released) + # Use for foreign pointers from external libraries + DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil + + # Import a device pointer from IPC via cuMemPoolImportPointer + # Note: Does not yet implement reference counting for nvbug 5570902 + # Error code is written to error_out (caller must check) + DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream, + cydriver.CUresult* error_out) nogil + + # Access the deallocation stream for a device pointer handle (read-only) + StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil + + # Set the deallocation stream for a device pointer handle + void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) nogil + # ======================================================================== # Overloaded helper functions (C++ handles dispatch by type) # ======================================================================== @@ -97,15 +141,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil + cydriver.CUdeviceptr native(DevicePtrHandle h) nogil # intptr() - extract handle as uintptr_t for Python interop uintptr_t intptr(ContextHandle h) nogil uintptr_t intptr(StreamHandle h) nogil uintptr_t intptr(EventHandle h) nogil uintptr_t intptr(MemoryPoolHandle h) nogil + uintptr_t intptr(DevicePtrHandle h) nogil # py() - convert handle to Python driver wrapper object (requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) object py(MemoryPoolHandle h) + object py(DevicePtrHandle h) From ea9a293a18459d0311eee549a11c6362aa19fb78 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 13:56:25 -0800 Subject: [PATCH 18/38] Use intptr_t for all handle integer conversions Change all intptr() overloads to return std::intptr_t (signed) instead of std::uintptr_t per C standard convention for pointer-to-integer conversion. This addresses issue #1342 which requires Buffer.handle to return a signed integer. 
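For illustration only (a minimal sketch, not part of the change): with a
signed intptr_t, an address in the upper half of the address space shows up
as a negative integer but still round-trips exactly, which is the behavior
issue #1342 asks Buffer.handle to expose. The address below is hypothetical.

    #include <cassert>
    #include <cstdint>

    int main() {
        void* p = reinterpret_cast<void*>(0xFFFF800000000000ULL);  // hypothetical high address
        std::intptr_t s = reinterpret_cast<std::intptr_t>(p);      // negative when read as signed
        assert(reinterpret_cast<void*>(s) == p);                   // exact round-trip
        return 0;
    }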
Fixes #1342
---
 .../experimental/_cpp/resource_handles.hpp    | 23 ++++++++++---------
 .../core/experimental/_resource_handles.pxd   | 15 ++++++------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index 5ce6671fa9..7ef7ca153c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -188,25 +188,26 @@ inline CUdeviceptr native(const DevicePtrHandle& h) noexcept {
     return h ? *h : 0;
 }
 
-// intptr() - extract handle as uintptr_t for Python interop
-inline std::uintptr_t intptr(const ContextHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+// intptr() - extract handle as intptr_t for Python interop
+// Using signed intptr_t per C standard convention and issue #1342
+inline std::intptr_t intptr(const ContextHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const StreamHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const StreamHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const EventHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const EventHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const MemoryPoolHandle& h) noexcept {
-    return reinterpret_cast<std::uintptr_t>(h ? *h : nullptr);
+inline std::intptr_t intptr(const MemoryPoolHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(h ? *h : nullptr);
 }
 
-inline std::uintptr_t intptr(const DevicePtrHandle& h) noexcept {
-    return h ? static_cast<std::uintptr_t>(*h) : 0;
+inline std::intptr_t intptr(const DevicePtrHandle& h) noexcept {
+    return h ? static_cast<std::intptr_t>(*h) : 0;
 }
 
 // py() - convert handle to Python driver wrapper object
diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd
index 416dd8bd5c..ea0841ed27 100644
--- a/cuda_core/cuda/core/experimental/_resource_handles.pxd
+++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport intptr_t
 from libcpp.memory cimport shared_ptr
 
 from cuda.bindings cimport cydriver
@@ -143,12 +143,13 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil
     cydriver.CUdeviceptr native(DevicePtrHandle h) nogil
 
-    # intptr() - extract handle as uintptr_t for Python interop
-    uintptr_t intptr(ContextHandle h) nogil
-    uintptr_t intptr(StreamHandle h) nogil
-    uintptr_t intptr(EventHandle h) nogil
-    uintptr_t intptr(MemoryPoolHandle h) nogil
-    uintptr_t intptr(DevicePtrHandle h) nogil
+    # intptr() - extract handle as intptr_t for Python interop
+    # Using signed intptr_t per C standard convention and issue #1342
+    intptr_t intptr(ContextHandle h) nogil
+    intptr_t intptr(StreamHandle h) nogil
+    intptr_t intptr(EventHandle h) nogil
+    intptr_t intptr(MemoryPoolHandle h) nogil
+    intptr_t intptr(DevicePtrHandle h) nogil
 
     # py() - convert handle to Python driver wrapper object (requires GIL)
     object py(ContextHandle h)

From 92fa76bc32c3328389bf5fcb7169e833cdd70c98 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 11 Dec 2025 14:15:25 -0800
Subject: [PATCH 19/38] Add thread-local error handling for resource handle
 functions

Implement a systematic error handling approach for C++ resource handle
functions using thread-local storage, similar to cudaGetLastError().

API:
- get_last_error(): Returns and clears the last CUDA error
- peek_last_error(): Returns without clearing
- clear_last_error(): Explicitly clears the error

All functions that can fail now set the thread-local error before
returning an empty handle. This allows callers to retrieve specific
CUDA error codes for proper exception propagation.

Updated deviceptr_import_ipc to use this pattern instead of an output
parameter.
---
 .../experimental/_cpp/resource_handles.cpp    | 372 +++++++-----------
 .../experimental/_cpp/resource_handles.hpp    |  18 +-
 .../cuda/core/experimental/_memory/_ipc.pyx   |   8 +-
 .../core/experimental/_resource_handles.pxd   |  12 +-
 4 files changed, 164 insertions(+), 246 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index b5ccfff105..bc663f8228 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -10,6 +10,31 @@
 
 namespace cuda_core {
 
+// ============================================================================
+// Thread-local error handling
+// ============================================================================
+
+// Thread-local status of the most recent CUDA API call in this module.
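+// Each call below assigns to `err` inline, e.g. `if (CUDA_SUCCESS != (err = ...))`,
+// then returns an empty handle on failure; the caller retrieves the code with
+// get_last_error(), mirroring the cudaGetLastError() idiom.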
+thread_local CUresult err = CUDA_SUCCESS; + +CUresult get_last_error() noexcept { + CUresult e = err; + err = CUDA_SUCCESS; + return e; +} + +CUresult peek_last_error() noexcept { + return err; +} + +void clear_last_error() noexcept { + err = CUDA_SUCCESS; +} + +// ============================================================================ +// GIL management helpers +// ============================================================================ + // Helper to release the GIL while calling into the CUDA driver. // This guard is *conditional*: if the caller already dropped the GIL, // we avoid calling PyEval_SaveThread (which requires holding the GIL). @@ -46,7 +71,6 @@ class GILReleaseGuard { // Helper to acquire the GIL when we might not hold it. // Use in C++ destructors that need to manipulate Python objects. -// Symmetric counterpart to GILReleaseGuard. class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { @@ -64,7 +88,6 @@ class GILAcquireGuard { } } - // Check if GIL was successfully acquired (for conditional operations) bool acquired() const { return acquired_; } // Non-copyable, non-movable @@ -76,78 +99,63 @@ class GILAcquireGuard { bool acquired_; }; -// Internal box structure for Context (kept private to this TU) +// ============================================================================ +// Context Handles +// ============================================================================ + struct ContextBox { CUcontext resource; }; ContextHandle create_context_handle_ref(CUcontext ctx) { - // Creates a non-owning handle that references an existing context - // (e.g., primary context managed by CUDA driver) - - // Use default deleter - it will delete the box, but not touch the CUcontext - // CUcontext lifetime is managed externally (e.g., by CUDA driver) - auto box = std::shared_ptr(new ContextBox{ctx}); - - // Use aliasing constructor to create handle that exposes only CUcontext - // The handle's reference count is tied to box, but it points to &box->resource + auto box = std::make_shared(ContextBox{ctx}); return ContextHandle(box, &box->resource); } -// Thread-local storage for primary context cache -// Each thread maintains its own cache of primary contexts indexed by device ID +// Thread-local cache of primary contexts indexed by device ID thread_local std::vector primary_context_cache; ContextHandle get_primary_context(int device_id) noexcept { // Check thread-local cache if (static_cast(device_id) < primary_context_cache.size()) { - auto cached = primary_context_cache[device_id]; - if (cached.get() != nullptr) { - return cached; // Cache hit + if (auto cached = primary_context_cache[device_id]) { + return cached; } } // Cache miss - acquire primary context from driver + GILReleaseGuard gil; CUcontext ctx; - CUresult err; - { - GILReleaseGuard gil; - err = cuDevicePrimaryCtxRetain(&ctx, device_id); + if (CUDA_SUCCESS != (err = cuDevicePrimaryCtxRetain(&ctx, device_id))) { + return {}; } - if (err != CUDA_SUCCESS) { - // Return empty handle on error (caller must check) - return ContextHandle(); - } - - // Create owning handle with custom deleter that releases the primary context - auto box = std::shared_ptr(new ContextBox{ctx}, [device_id](const ContextBox* b) { - GILReleaseGuard gil; - cuDevicePrimaryCtxRelease(device_id); - delete b; - }); - // Use aliasing constructor to expose only CUcontext - auto h_context = ContextHandle(box, &box->resource); + auto box = std::shared_ptr( + new ContextBox{ctx}, + [device_id](const ContextBox* b) { + 
GILReleaseGuard gil; + cuDevicePrimaryCtxRelease(device_id); + delete b; + } + ); + auto h = ContextHandle(box, &box->resource); - // Resize cache if needed + // Update cache if (static_cast(device_id) >= primary_context_cache.size()) { primary_context_cache.resize(device_id + 1); } - primary_context_cache[device_id] = h_context; - - return h_context; + primary_context_cache[device_id] = h; + return h; } ContextHandle get_current_context() noexcept { + GILReleaseGuard gil; CUcontext ctx = nullptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuCtxGetCurrent(&ctx); + if (CUDA_SUCCESS != (err = cuCtxGetCurrent(&ctx))) { + return {}; } - if (err != CUDA_SUCCESS || ctx == nullptr) { - // Return empty handle if no current context or error - return ContextHandle(); + if (!ctx) { + return {}; // No current context (not an error) } return create_context_handle_ref(ctx); } @@ -156,76 +164,54 @@ ContextHandle get_current_context() noexcept { // Stream Handles // ============================================================================ -// Internal box structure for Stream struct StreamBox { CUstream resource; }; StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { - // Creates an owning stream handle - calls cuStreamCreateWithPriority internally. - // The context handle is captured in the deleter to ensure context outlives the stream. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUstream stream; - CUresult err; - { - GILReleaseGuard gil; - err = cuStreamCreateWithPriority(&stream, flags, priority); - } - if (err != CUDA_SUCCESS) { - return StreamHandle(); + if (CUDA_SUCCESS != (err = cuStreamCreateWithPriority(&stream, flags, priority))) { + return {}; } - // Capture h_ctx in lambda - shared_ptr control block keeps it alive - auto box = std::shared_ptr(new StreamBox{stream}, [h_ctx](const StreamBox* b) { - GILReleaseGuard gil; - cuStreamDestroy(b->resource); - delete b; - // h_ctx destructor runs here when last stream reference is released - }); - - // Use aliasing constructor to expose only CUstream + auto box = std::shared_ptr( + new StreamBox{stream}, + [h_ctx](const StreamBox* b) { + GILReleaseGuard gil; + cuStreamDestroy(b->resource); + delete b; + } + ); return StreamHandle(box, &box->resource); } StreamHandle create_stream_handle_ref(CUstream stream) { - // Creates a non-owning handle - stream will NOT be destroyed. - // Caller is responsible for keeping the stream's context alive. - auto box = std::shared_ptr(new StreamBox{stream}); - - // Use aliasing constructor to expose only CUstream + auto box = std::make_shared(StreamBox{stream}); return StreamHandle(box, &box->resource); } StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { - // Creates a non-owning handle that prevents a Python owner from being GC'd. - // The owner's refcount is incremented here and decremented when handle is released. - // The owner is responsible for keeping the stream's context alive. Py_XINCREF(owner); - - auto box = std::shared_ptr(new StreamBox{stream}, [owner](const StreamBox* b) { - // Safely decrement owner refcount (GILAcquireGuard handles finalization check) - { + auto box = std::shared_ptr( + new StreamBox{stream}, + [owner](const StreamBox* b) { GILAcquireGuard gil; if (gil.acquired()) { Py_XDECREF(owner); } + delete b; } - delete b; - }); - + ); return StreamHandle(box, &box->resource); } StreamHandle get_legacy_stream() noexcept { - // Return non-owning handle to the legacy default stream. 
- // Use function-local static for efficient repeated access. static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY); return handle; } StreamHandle get_per_thread_stream() noexcept { - // Return non-owning handle to the per-thread default stream. - // Use function-local static for efficient repeated access. static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD); return handle; } @@ -234,65 +220,47 @@ StreamHandle get_per_thread_stream() noexcept { // Event Handles // ============================================================================ -// Internal box structure for Event struct EventBox { CUevent resource; }; EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { - // Creates an owning event handle - calls cuEventCreate internally. - // The context handle is captured in the deleter to ensure context outlives the event. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUevent event; - CUresult err; - { - GILReleaseGuard gil; - err = cuEventCreate(&event, flags); - } - if (err != CUDA_SUCCESS) { - return EventHandle(); + if (CUDA_SUCCESS != (err = cuEventCreate(&event, flags))) { + return {}; } - // Capture h_ctx in lambda - shared_ptr control block keeps it alive - auto box = std::shared_ptr(new EventBox{event}, [h_ctx](const EventBox* b) { - GILReleaseGuard gil; - cuEventDestroy(b->resource); - delete b; - // h_ctx destructor runs here when last event reference is released - }); - - // Use aliasing constructor to expose only CUevent + auto box = std::shared_ptr( + new EventBox{event}, + [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + } + ); return EventHandle(box, &box->resource); } EventHandle create_event_handle(unsigned int flags) { - // Creates an owning event handle without context dependency. - // Use for temporary events that are created and destroyed in the same scope. - // Returns empty handle on error (caller must check). return create_event_handle(ContextHandle{}, flags); } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { - // Creates an owning event handle from an IPC handle. - // The originating process owns the event and its context. - // Returns empty handle on error (caller must check). 
+ GILReleaseGuard gil; CUevent event; - CUresult err; - { - GILReleaseGuard gil; - err = cuIpcOpenEventHandle(&event, ipc_handle); - } - if (err != CUDA_SUCCESS) { - return EventHandle(); + if (CUDA_SUCCESS != (err = cuIpcOpenEventHandle(&event, ipc_handle))) { + return {}; } - auto box = std::shared_ptr(new EventBox{event}, [](const EventBox* b) { - GILReleaseGuard gil; - cuEventDestroy(b->resource); - delete b; - }); - - // Use aliasing constructor to expose only CUevent + auto box = std::shared_ptr( + new EventBox{event}, + [](const EventBox* b) { + GILReleaseGuard gil; + cuEventDestroy(b->resource); + delete b; + } + ); return EventHandle(box, &box->resource); } @@ -300,7 +268,6 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { // Memory Pool Handles // ============================================================================ -// Internal box structure for MemoryPool struct MemoryPoolBox { CUmemoryPool resource; }; @@ -319,90 +286,68 @@ static void clear_mempool_peer_access(CUmemoryPool pool) { clear_access[i].location.id = i; clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; } - - // Ignore errors - best effort cleanup - cuMemPoolSetAccess(pool, clear_access.data(), device_count); + cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort } -// Helper to wrap a raw pool in an owning handle. -// The deleter clears peer access (nvbug 5698116 workaround) and destroys the pool. static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { - auto box = std::shared_ptr(new MemoryPoolBox{pool}, [](const MemoryPoolBox* b) { - GILReleaseGuard gil; - clear_mempool_peer_access(b->resource); - cuMemPoolDestroy(b->resource); - delete b; - }); + auto box = std::shared_ptr( + new MemoryPoolBox{pool}, + [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + cuMemPoolDestroy(b->resource); + delete b; + } + ); return MemoryPoolHandle(box, &box->resource); } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { - // Creates an owning memory pool handle - calls cuMemPoolCreate internally. - // Memory pools are device-scoped (not context-scoped). - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolCreate(&pool, &props); + if (CUDA_SUCCESS != (err = cuMemPoolCreate(&pool, &props))) { + return {}; } - return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); + return wrap_mempool_owned(pool); } MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { - // Creates a non-owning handle - pool will NOT be destroyed. - // Use for device default/current pools managed by the driver. - auto box = std::shared_ptr(new MemoryPoolBox{pool}); - - // Use aliasing constructor to expose only CUmemoryPool + auto box = std::make_shared(MemoryPoolBox{pool}); return MemoryPoolHandle(box, &box->resource); } MemoryPoolHandle get_device_mempool(int device_id) noexcept { - // Get the current memory pool for a device. - // Returns a non-owning handle (pool managed by driver). 
+ GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuDeviceGetMemPool(&pool, device_id); - } - if (err != CUDA_SUCCESS) { - return MemoryPoolHandle(); + if (CUDA_SUCCESS != (err = cuDeviceGetMemPool(&pool, device_id))) { + return {}; } return create_mempool_handle_ref(pool); } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { - // Creates an owning memory pool handle from an IPC import. - // The file descriptor is NOT owned by this handle. - // Returns empty handle on error (caller must check). + GILReleaseGuard gil; CUmemoryPool pool; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolImportFromShareableHandle(&pool, reinterpret_cast(static_cast(fd)), handle_type, 0); + auto handle_ptr = reinterpret_cast(static_cast(fd)); + if (CUDA_SUCCESS != (err = cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) { + return {}; } - return err == CUDA_SUCCESS ? wrap_mempool_owned(pool) : MemoryPoolHandle(); + return wrap_mempool_owned(pool); } // ============================================================================ // Device Pointer Handles // ============================================================================ -// Internal box structure for DevicePtr. -// The h_stream is mutable to allow updating the deallocation stream after creation. struct DevicePtrBox { CUdeviceptr resource; mutable StreamHandle h_stream; }; -// Internal helper to retrieve the box from a handle (for deallocation_stream access). static DevicePtrBox* get_box(const DevicePtrHandle& h) { const CUdeviceptr* p = h.get(); return reinterpret_cast( - reinterpret_cast(const_cast(p)) - - offsetof(DevicePtrBox, resource) + reinterpret_cast(const_cast(p)) - offsetof(DevicePtrBox, resource) ); } @@ -414,53 +359,35 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { get_box(h)->h_stream = std::move(h_stream); } -DevicePtrHandle deviceptr_alloc_from_pool( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) -{ - // Allocate from pool asynchronously. - // Pool handle is captured in deleter to keep pool alive. +DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; - // h_pool destructor runs here, releasing pool reference } ); return DevicePtrHandle(box, &box->resource); } DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { - // Allocate asynchronously (not from a specific pool). 
+ GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocAsync(&ptr, size, native(h_stream)); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocAsync(&ptr, size, native(h_stream)))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } @@ -469,15 +396,10 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc(size_t size) { - // Allocate synchronously. + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAlloc(&ptr, size); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAlloc(&ptr, size))) { + return {}; } auto box = std::shared_ptr( @@ -492,15 +414,10 @@ DevicePtrHandle deviceptr_alloc(size_t size) { } DevicePtrHandle deviceptr_alloc_host(size_t size) { - // Allocate pinned host memory. + GILReleaseGuard gil; void* ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemAllocHost(&ptr, size); - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + if (CUDA_SUCCESS != (err = cuMemAllocHost(&ptr, size))) { + return {}; } auto box = std::shared_ptr( @@ -515,42 +432,25 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) { } DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { - // Non-owning reference - pointer will NOT be freed. - auto box = std::shared_ptr(new DevicePtrBox{ptr, StreamHandle{}}); + auto box = std::make_shared(DevicePtrBox{ptr, StreamHandle{}}); return DevicePtrHandle(box, &box->resource); } -DevicePtrHandle deviceptr_import_ipc( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream, - CUresult* error_out) -{ - // Import pointer from IPC. - // Note: Does not implement reference counting workaround for nvbug 5570902 yet. 
+DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { + GILReleaseGuard gil; CUdeviceptr ptr; - CUresult err; - { - GILReleaseGuard gil; - err = cuMemPoolImportPointer(&ptr, *h_pool, - const_cast( - reinterpret_cast(export_data))); - } - if (error_out) { - *error_out = err; - } - if (err != CUDA_SUCCESS) { - return DevicePtrHandle(); + auto data = const_cast( + reinterpret_cast(export_data)); + if (CUDA_SUCCESS != (err = cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; } auto box = std::shared_ptr( new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - // cuMemFreeAsync accepts NULL stream (uses legacy default stream) cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; - // h_pool destructor runs here } ); return DevicePtrHandle(box, &box->resource); diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp index 7ef7ca153c..fc62c9aa2c 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp @@ -11,6 +11,19 @@ namespace cuda_core { +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Get and clear the last CUDA error (like cudaGetLastError) +CUresult get_last_error() noexcept; + +// Get the last CUDA error without clearing it (like cudaPeekAtLastError) +CUresult peek_last_error() noexcept; + +// Explicitly clear the last error +void clear_last_error() noexcept; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ @@ -149,12 +162,11 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); // Import a device pointer from IPC via cuMemPoolImportPointer. // When the last reference is released, cuMemFreeAsync is called on the stored stream. // Note: Does not yet implement reference counting for nvbug 5570902. -// Error code is written to error_out (caller must check). +// On error, returns empty handle and sets thread-local error (use get_last_error()). DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream, - CUresult* error_out); + StreamHandle h_stream); // Access the deallocation stream for a device pointer handle (read-only). // For non-owning handles, the stream is not used but can still be accessed. 
diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index f0bdc22216..414c134601 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -14,6 +14,7 @@ from cuda.core.experimental._resource_handles cimport ( MemoryPoolHandle, create_mempool_handle_ipc, deviceptr_import_ipc, + get_last_error, native, ) from cuda.core.experimental._stream cimport default_stream @@ -190,14 +191,13 @@ cdef Buffer Buffer_from_ipc_descriptor( # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() cdef Stream s = stream - cdef cydriver.CUresult err cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( mr._h_pool, ipc_descriptor.payload_ptr(), - s._h_stream, - &err + s._h_stream ) - HANDLE_RETURN(err) + if not h_ptr: + HANDLE_RETURN(get_last_error()) return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index ea0841ed27..2aeff68cd8 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -9,6 +9,13 @@ from cuda.bindings cimport cydriver # Declare the C++ namespace and types cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # ======================================================================== + # Thread-local error handling + # ======================================================================== + cydriver.CUresult get_last_error() nogil + cydriver.CUresult peek_last_error() nogil + void clear_last_error() nogil + # ======================================================================== # Context Handle # ======================================================================== @@ -119,12 +126,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Import a device pointer from IPC via cuMemPoolImportPointer # Note: Does not yet implement reference counting for nvbug 5570902 - # Error code is written to error_out (caller must check) + # On error, returns empty handle and sets thread-local error (use get_last_error()) DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream, - cydriver.CUresult* error_out) nogil + StreamHandle h_stream) nogil # Access the deallocation stream for a device pointer handle (read-only) StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil From f05d45a7a81c1c55c2d7377fac2fa63d58d8ff65 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 15:40:34 -0800 Subject: [PATCH 20/38] Add IPC pointer cache to fix duplicate import issue (nvbug 5570902) IPC-imported device pointers are not correctly reference counted by the driver - the first cuMemFreeAsync incorrectly unmaps the memory even when the pointer was imported multiple times. Work around this by caching imported pointers and returning the same handle for duplicate imports. The cache uses weak_ptr so entries are automatically cleaned up when all references are released. The workaround can be easily bypassed via use_ipc_ptr_cache() when a driver fix becomes available. 
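To sketch the caching scheme in isolation (illustrative only; Key, Box, and
import_dedup below are stand-ins for CUdeviceptr, DevicePtrBox, and
deviceptr_import_ipc, and the real deleter calls cuMemFreeAsync):

    #include <memory>
    #include <mutex>
    #include <unordered_map>

    using Key = unsigned long long;  // stand-in for CUdeviceptr
    struct Box { Key resource; };    // stand-in for DevicePtrBox

    static std::mutex cache_mutex;
    static std::unordered_map<Key, std::weak_ptr<Box>> cache;

    std::shared_ptr<Box> import_dedup(Key ptr) {
        std::lock_guard<std::mutex> lock(cache_mutex);
        auto it = cache.find(ptr);
        if (it != cache.end()) {
            if (auto box = it->second.lock())
                return box;          // duplicate import: share the live box
            cache.erase(it);         // entry expired; fall through and remake it
        }
        auto box = std::shared_ptr<Box>(new Box{ptr}, [ptr](Box* b) {
            {
                // Drop the cache entry before freeing the resource.
                std::lock_guard<std::mutex> lock(cache_mutex);
                cache.erase(ptr);
            }
            delete b;                // the real deleter frees the memory here
        });
        cache[ptr] = box;            // weak_ptr, so the cache adds no ownership
        return box;
    }

Because the map holds weak_ptr, the cache never extends a pointer's lifetime;
a failed lock() on an expired entry is what distinguishes a stale slot from a
live duplicate import.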
---
 .../experimental/_cpp/resource_handles.cpp    | 65 ++++++++++++--
 .../memory_ipc/test_ipc_duplicate_import.py   | 89 +++++++++++++++++++
 2 files changed, 146 insertions(+), 8 deletions(-)
 create mode 100644 cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index bc663f8228..557b0af74c 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -6,6 +6,8 @@
 #include "resource_handles.hpp"
 
 #include <cuda.h>
+#include <mutex>
+#include <unordered_map>
 #include <vector>
 
 namespace cuda_core {
@@ -436,6 +438,22 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
     return DevicePtrHandle(box, &box->resource);
 }
 
+// ============================================================================
+// IPC Pointer Cache (workaround for nvbug 5570902)
+// ============================================================================
+// IPC-imported pointers are not correctly reference counted by the driver.
+// The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
+// was imported multiple times. We work around this by caching imported pointers
+// and returning the same handle for duplicate imports.
+
+// TODO: When driver fix is available, add version check here to bypass cache.
+static bool use_ipc_ptr_cache() {
+    return true;
+}
+
+static std::mutex ipc_ptr_cache_mutex;
+static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
+
 DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
@@ -445,15 +463,46 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         return {};
     }
 
-    auto box = std::shared_ptr<DevicePtrBox>(
-        new DevicePtrBox{ptr, h_stream},
-        [h_pool](DevicePtrBox* b) {
-            GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
-            delete b;
+    if (use_ipc_ptr_cache()) {
+        std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
+
+        // Check for existing handle
+        auto it = ipc_ptr_cache.find(ptr);
+        if (it != ipc_ptr_cache.end()) {
+            if (auto box = it->second.lock()) {
+                return DevicePtrHandle(box, &box->resource);
+            }
+            ipc_ptr_cache.erase(it);  // Expired entry
         }
-    );
-    return DevicePtrHandle(box, &box->resource);
+
+        // Create new handle with cache-clearing deleter
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [h_pool, ptr](DevicePtrBox* b) {
+                {
+                    std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
+                    ipc_ptr_cache.erase(ptr);
+                }
+                GILReleaseGuard gil;
+                cuMemFreeAsync(b->resource, native(b->h_stream));
+                delete b;
+            }
+        );
+        ipc_ptr_cache[ptr] = box;
+        return DevicePtrHandle(box, &box->resource);
+
+    } else {
+        // No caching - simple handle creation
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [h_pool](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                cuMemFreeAsync(b->resource, native(b->h_stream));
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
 }
 
 } // namespace cuda_core
diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
new file mode 100644
index 0000000000..a08c48d567
--- /dev/null
+++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Test for duplicate IPC buffer imports.
+ +Verifies that importing the same buffer descriptor multiple times returns the +same underlying handle, and that closing all imports works correctly without +crashing. This tests the workaround for nvbug 5570902 where IPC-imported +pointers are not correctly reference counted by the driver. +""" + +import multiprocessing as mp + +import pytest + +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from helpers.logging import TimestampedLogger + +CHILD_TIMEOUT_SEC = 20 +NBYTES = 64 +POOL_SIZE = 2097152 + +ENABLE_LOGGING = False # Set True for test debugging and development + + +def child_main(log, queue): + log.prefix = " child: " + log("ready") + device = Device() + device.set_current() + mr = queue.get() + buffer_desc1 = queue.get() + buffer_desc2 = queue.get() + + # Import the same buffer twice - should return same handle due to cache + buffer1 = Buffer.from_ipc_descriptor(mr, buffer_desc1) + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc2) + + log(f"buffer1.handle = {buffer1.handle}") + log(f"buffer2.handle = {buffer2.handle}") + log(f"same handle: {buffer1.handle == buffer2.handle}") + + # Close both - should not crash + buffer1.close() + log("buffer1 closed") + + buffer2.close() + log("buffer2 closed") + + device.sync() + log("done") + + +class TestIpcDuplicateImport: + """Test that duplicate IPC imports return the same handle and close safely.""" + + @pytest.fixture(autouse=True) + def _set_start_method(self): + # Ensure spawn is used for multiprocessing + try: + mp.set_start_method("spawn", force=True) + except RuntimeError: + pass # Already set + + def test_main(self, ipc_device, ipc_memory_resource): + log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) + device = ipc_device + mr = ipc_memory_resource + + log("allocating buffer") + buffer = mr.allocate(NBYTES) + + # Start the child process. + log("starting child") + queue = mp.Queue() + process = mp.Process(target=child_main, args=(log, queue)) + process.start() + + # Send the memory resource and buffer descriptor twice. 
+ log("sending mr and buffer descriptors") + queue.put(mr) + queue.put(buffer.get_ipc_descriptor()) + queue.put(buffer.get_ipc_descriptor()) + + log("waiting for child") + process.join(timeout=CHILD_TIMEOUT_SEC) + log(f"child exit code: {process.exitcode}") + assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" + log("done") From 937428b7d6fc97079c79929856f9008626a7f407 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 11 Dec 2025 16:07:55 -0800 Subject: [PATCH 21/38] Fix lint issues: remove unused imports and variables --- cuda_core/cuda/core/experimental/_context.pyx | 4 - cuda_core/cuda/core/experimental/_event.pyx | 2 - .../core/experimental/_memory/_buffer.pxd | 3 + .../core/experimental/_memory/_buffer.pyx | 99 ++++++++++++++++--- .../_memory/_device_memory_resource.pyx | 1 - .../cuda/core/experimental/_memory/_ipc.pyx | 3 - .../cuda/core/experimental/_memoryview.pyx | 1 - cuda_core/cuda/core/experimental/_stream.pyx | 6 +- .../memory_ipc/test_ipc_duplicate_import.py | 10 +- cuda_core/tests/test_comparable.py | 1 - cuda_core/tests/test_hashable.py | 1 - 11 files changed, 95 insertions(+), 36 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx index 0504778207..2a7434c62b 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/experimental/_context.pyx @@ -4,15 +4,11 @@ from dataclasses import dataclass -from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( ContextHandle, - create_context_handle_ref, intptr, - native, py, ) -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN __all__ = ['Context', 'ContextOptions'] diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 763df94fe3..72618e40d6 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,7 +5,6 @@ from __future__ import annotations cimport cpython -from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context @@ -30,7 +29,6 @@ from typing import TYPE_CHECKING, Optional from cuda.core.experimental._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, - driver, ) if TYPE_CHECKING: import cuda.bindings diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index 81653dafd5..dda12622f4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -20,6 +20,9 @@ cdef class Buffer: size_t _size MemoryResource _memory_resource object _ipc_data + object _owner + _MemAttrs _mem_attrs + bint _mem_attrs_inited cdef class MemoryResource: diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index c7ab15ae95..6568d6271f 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -5,7 +5,7 @@ from __future__ import annotations cimport cython -from libc.stdint cimport uintptr_t, int64_t, uint64_t +from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource @@ -13,15 +13,16 @@ from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataFor from 
cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - StreamHandle, deviceptr_create_ref, intptr, native, - py, set_deallocation_stream, ) from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, + HANDLE_RETURN, +) import abc from typing import TypeVar, Union @@ -56,6 +57,8 @@ cdef class Buffer: self._size = 0 self._memory_resource = None self._ipc_data = None + self._owner = None + self._mem_attrs_inited = False def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. " @@ -71,7 +74,7 @@ cdef class Buffer: owner : object | None = None ): """Legacy init for compatibility - creates a non-owning ref handle. - + Note: The stream parameter is accepted for API compatibility but is ignored since non-owning refs are never freed by the handle. """ @@ -82,6 +85,8 @@ cdef class Buffer: raise ValueError("owner and memory resource cannot be both specified together") self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + self._owner = owner + self._mem_attrs_inited = False return self # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor @@ -105,14 +110,17 @@ cdef class Buffer: Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional Memory resource associated with the buffer + owner : object, optional + An object holding external allocation that the ``ptr`` points to. + The reference is kept as long as the buffer is alive. + The ``owner`` and ``mr`` cannot be specified together. Note ---- This creates a non-owning reference. The pointer will NOT be freed when the Buffer is closed or garbage collected. 
""" - cdef DevicePtrHandle h_ptr = deviceptr_create_ref((int(ptr))) - return Buffer_from_deviceptr_handle(h_ptr, size, mr) + return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod def from_ipc_descriptor( @@ -159,7 +167,6 @@ cdef class Buffer: """ stream = Stream_accept(stream) - cdef Stream s_stream = stream cdef size_t src_size = self._size if dst is None: @@ -190,7 +197,6 @@ cdef class Buffer: """ stream = Stream_accept(stream) - cdef Stream s_stream = stream cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -299,7 +305,8 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - raise NotImplementedError("device_id requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.device_id @property def handle(self) -> DevicePointerT: @@ -319,14 +326,16 @@ cdef class Buffer: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - raise NotImplementedError("is_device_accessible requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.is_device_accessible @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - raise NotImplementedError("is_host_accessible requires a memory resource") + _init_mem_attrs(self) + return self._mem_attrs.is_host_accessible @property def is_mapped(self) -> bool: @@ -344,6 +353,71 @@ cdef class Buffer: """Return the memory size of this buffer.""" return self._size + @property + def owner(self) -> object: + """Return the object holding external allocation.""" + return self._owner + + +# Memory Attribute Query Helpers +# ------------------------------ +cdef inline _init_mem_attrs(Buffer self): + """Initialize memory attributes by querying the pointer.""" + if not self._mem_attrs_inited: + _query_memory_attrs(self._mem_attrs, native(self._h_ptr)) + self._mem_attrs_inited = True + + +cdef inline int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr +) except -1 nogil: + """Query memory attributes for a device pointer.""" + cdef unsigned int memory_type = 0 + cdef int is_managed = 0 + cdef int device_id = 0 + cdef cydriver.CUpointer_attribute attrs[3] + cdef uintptr_t vals[3] + + attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED + attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + vals[0] = &memory_type + vals[1] = &is_managed + vals[2] = &device_id + + cdef cydriver.CUresult ret + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: + with cython.gil: + # Device class handles the cuInit call internally + Device() + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + HANDLE_RETURN(ret) + + if memory_type == 0: + # unregistered host pointer + out.is_host_accessible = True + out.is_device_accessible = False + out.device_id = -1 + elif ( + is_managed + or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST + ): + # Managed memory or pinned host memory + out.is_host_accessible = True + out.is_device_accessible = True + out.device_id = device_id + elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: + out.is_host_accessible = False + 
out.is_device_accessible = True + out.device_id = device_id + else: + with cython.gil: + raise ValueError(f"Unsupported memory type: {memory_type}") + return 0 + + cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -426,6 +500,7 @@ cdef inline void Buffer_close(Buffer self, object stream): self._size = 0 self._memory_resource = None self._ipc_data = None + self._owner = None def _validate_value_against_bitwidth(bitwidth, value, is_signed=False): diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx index 2a3e5c2dfe..e80afe45ac 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx @@ -15,7 +15,6 @@ from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - MemoryPoolHandle, create_mempool_handle, deviceptr_alloc_from_pool, get_device_mempool, diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 414c134601..5b301750c4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,15 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport uintptr_t -from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle from cuda.core.experimental._stream cimport Stream from cuda.core.experimental._resource_handles cimport ( DevicePtrHandle, - MemoryPoolHandle, create_mempool_handle_ipc, deviceptr_import_ipc, get_last_error, diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 718736e5cf..9f1119894a 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -20,7 +20,6 @@ from cuda.core.experimental._resource_handles cimport ( native, ) from cuda.core.experimental._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._utils cimport cuda_utils from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 4a16399323..87d78eba17 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -41,9 +41,6 @@ from cuda.core.experimental._resource_handles cimport ( py, ) from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._utils.cuda_utils import ( - driver, -) @dataclass @@ -423,8 +420,7 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: """Ensure the stream's context and device_id are populated.""" - cdef ContextHandle h_curr_context - cdef cydriver.CUcontext target_ctx, curr_ctx, ctx + cdef cydriver.CUcontext ctx cdef cydriver.CUdevice target_dev cdef bint switch_context diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index a08c48d567..096b3a2abd 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ 
b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -9,11 +9,11 @@ pointers are not correctly reference counted by the driver. """ +import contextlib import multiprocessing as mp import pytest - -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental import Buffer, Device from helpers.logging import TimestampedLogger CHILD_TIMEOUT_SEC = 20 @@ -57,14 +57,12 @@ class TestIpcDuplicateImport: @pytest.fixture(autouse=True) def _set_start_method(self): # Ensure spawn is used for multiprocessing - try: + with contextlib.suppress(RuntimeError): mp.set_start_method("spawn", force=True) - except RuntimeError: - pass # Already set def test_main(self, ipc_device, ipc_memory_resource): log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) - device = ipc_device + ipc_device.set_current() mr = ipc_memory_resource log("allocating buffer") diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index 2c05932dcc..8f62db8b49 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,7 +9,6 @@ """ from cuda.core.experimental import Device, Stream -from cuda.core.experimental._event import Event from cuda.core.experimental._stream import StreamOptions # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 1ecf8cdedd..bdad435c6f 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -13,7 +13,6 @@ """ from cuda.core.experimental import Device -from cuda.core.experimental._event import Event from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ From b629ec68c99e2b8265284dee55916768c65bb655 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 12 Dec 2025 09:13:19 -0800 Subject: [PATCH 22/38] Add deviceptr_create_with_owner for handle-based owner tracking Implements handle-based owner tracking for device pointers, consistent with the pattern used for streams (create_stream_handle_with_owner). 
Changes:
- Add deviceptr_create_with_owner() - creates non-owning handle that keeps
  a Python owner alive via Py_INCREF/DECREF (lambda capture)
- If owner is nullptr, delegates to deviceptr_create_ref
- Buffer._owner field tracks owner in Python for property access
- Buffer._init() simplified to always call deviceptr_create_with_owner
---
 .../experimental/_cpp/resource_handles.cpp    | 18 ++++++++++++++++++
 .../experimental/_cpp/resource_handles.hpp    |  6 ++++++
 .../cuda/core/experimental/_memory/_buffer.pyx | 13 +++++--------
 .../core/experimental/_resource_handles.pxd   |  5 +++++
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 557b0af74c..c4b574f6a7 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -438,6 +438,24 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
     return DevicePtrHandle(box, &box->resource);
 }
 
+DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
+    if (!owner) {
+        return deviceptr_create_ref(ptr);
+    }
+    Py_INCREF(owner);
+    auto box = std::shared_ptr<DevicePtrBox>(
+        new DevicePtrBox{ptr, StreamHandle{}},
+        [owner](DevicePtrBox* b) {
+            GILAcquireGuard gil;
+            if (gil.acquired()) {
+                Py_DECREF(owner);
+            }
+            delete b;
+        }
+    );
+    return DevicePtrHandle(box, &box->resource);
+}
+
 // ============================================================================
 // IPC Pointer Cache (workaround for nvbug 5570902)
 // ============================================================================
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
index fc62c9aa2c..7649788fdd 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.hpp
@@ -159,6 +159,12 @@ DevicePtrHandle deviceptr_alloc_host(size_t size);
 // The pointer will NOT be freed when the handle is released.
 DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr);
 
+// Create a non-owning device pointer handle that prevents a Python owner from being GC'd.
+// The owner's refcount is incremented; decremented when the handle is released.
+// The pointer will NOT be freed when the handle is released.
+// If owner is nullptr, equivalent to deviceptr_create_ref.
+DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner);
+
 // Import a device pointer from IPC via cuMemPoolImportPointer.
 // When the last reference is released, cuMemFreeAsync is called on the stored stream.
 // Note: Does not yet implement reference counting for nvbug 5570902.
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index 6568d6271f..bca2a21ff0 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -13,7 +13,7 @@ from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._resource_handles cimport (
     DevicePtrHandle,
-    deviceptr_create_ref,
+    deviceptr_create_with_owner,
     intptr,
     native,
     set_deallocation_stream,
 )
@@ -64,9 +64,6 @@ cdef class Buffer:
         raise RuntimeError("Buffer objects cannot be instantiated directly. 
" "Please use MemoryResource APIs.") - # Note: _init_from_handle is a cdef inline function, not a method - # See Buffer_init_from_handle below - @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, @@ -78,11 +75,11 @@ cdef class Buffer: Note: The stream parameter is accepted for API compatibility but is ignored since non-owning refs are never freed by the handle. """ - cdef Buffer self = Buffer.__new__(cls) - self._h_ptr = deviceptr_create_ref((int(ptr))) - self._size = size if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") + cdef Buffer self = Buffer.__new__(cls) + self._h_ptr = deviceptr_create_with_owner((int(ptr)), owner) + self._size = size self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None self._owner = owner @@ -495,7 +492,7 @@ cdef inline void Buffer_close(Buffer self, object stream): if stream is not None: s = Stream_accept(stream) set_deallocation_stream(self._h_ptr, s._h_stream) - # Reset handle - RAII deleter will free the memory + # Reset handle - RAII deleter will free the memory (and release owner ref in C++) self._h_ptr.reset() self._size = 0 self._memory_resource = None diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 2aeff68cd8..6aa204efc6 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -124,6 +124,11 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Use for foreign pointers from external libraries DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil + # Create non-owning handle that prevents Python owner from being GC'd + # Pointer NOT freed when released; owner's refcount decremented on release + # If owner is None, equivalent to deviceptr_create_ref + DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) + # Import a device pointer from IPC via cuMemPoolImportPointer # Note: Does not yet implement reference counting for nvbug 5570902 # On error, returns empty handle and sets thread-local error (use get_last_error()) From cce5e9fa27551a0ef34e57891ca767eb6795a945 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 12:55:42 -0800 Subject: [PATCH 23/38] Add resource handles _CXX_API capsule and lazy driver loading Expose a full C++ handles function table via PyCapsule so extensions can dispatch without RTLD_GLOBAL, and switch resource_handles.cpp to load libcuda symbols at runtime to support CPU-only imports. 
--- cuda_core/build_hooks.py | 28 +- .../experimental/_cpp/resource_handles.cpp | 320 ++++++++++++++++-- .../_cpp/resource_handles_cxx_api.hpp | 80 +++++ cuda_core/cuda/core/experimental/_event.pyx | 27 +- .../core/experimental/_resource_handles.pyx | 20 ++ .../_resource_handles_cxx_api.pxd | 69 ++++ 6 files changed, 494 insertions(+), 50 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp create mode 100644 cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index a20407488e..80a96e0bc2 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -12,6 +12,7 @@ import os import re import subprocess +import sys from Cython.Build import cythonize from setuptools import Extension @@ -84,18 +85,6 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH - @functools.cache - def get_cuda_library_dirs(): - """Return library search paths for CUDA driver runtime.""" - - libdirs = [] - for root in get_cuda_paths(): - for subdir in ("lib64", "lib"): - candidate = os.path.join(root, subdir) - if os.path.isdir(candidate): - libdirs.append(candidate) - return libdirs - def get_sources(mod_name): """Get source files for a module, including any .cpp files.""" sources = [f"cuda/core/experimental/{mod_name}.pyx"] @@ -108,16 +97,15 @@ def get_sources(mod_name): return sources def get_extension_kwargs(mod_name): - """Return Extension kwargs (libraries, library_dirs) per module.""" + """Return Extension kwargs (libraries, etc.) per module.""" - # Modules that use CUDA driver APIs need to link against libcuda - # _resource_handles: contains the C++ implementation that calls CUDA driver - # _context, _stream, _event, _device: use resource handles and may call CUDA driver directly - cuda_users = {"_resource_handles", "_context", "_stream", "_event", "_device"} kwargs = {} - if mod_name in cuda_users: - kwargs["libraries"] = ["cuda"] - kwargs["library_dirs"] = get_cuda_library_dirs() + + # _resource_handles.cpp uses dlopen/dlsym on Linux, which requires -ldl on glibc < 2.34. + # (On Windows it uses LoadLibrary/GetProcAddress; on macOS dlopen is in libSystem.) 
+    if sys.platform.startswith("linux") and mod_name == "_resource_handles":
+        kwargs["libraries"] = ["dl"]
+
         return kwargs
 
     ext_modules = tuple(
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index c4b574f6a7..3ad8ea5dc4 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -5,13 +5,148 @@
 #include
 #include "resource_handles.hpp"
+#include "resource_handles_cxx_api.hpp"
 #include
 #include
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include <atomic>
+#include <mutex>
 #include
 #include
 
 namespace cuda_core {
 
+// ============================================================================
+// CUDA driver dynamic loading (CPU-only import + MVC compatibility)
+// ============================================================================
+
+namespace {
+
+#if defined(_WIN32)
+using LibHandle = HMODULE;
+
+static LibHandle open_libcuda() noexcept {
+    // CUDA driver DLL
+    return LoadLibraryA("nvcuda.dll");
+}
+
+static void* get_symbol(LibHandle lib, const char* name) noexcept {
+    return reinterpret_cast<void*>(GetProcAddress(lib, name));
+}
+#else
+using LibHandle = void*;
+
+static LibHandle open_libcuda() noexcept {
+    // Prefer the soname; fall back to the linker name.
+    LibHandle lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
+    if (!lib) {
+        lib = dlopen("libcuda.so", RTLD_NOW | RTLD_LOCAL);
+    }
+    return lib;
+}
+
+static void* get_symbol(LibHandle lib, const char* name) noexcept {
+    return dlsym(lib, name);
+}
+#endif
+
+static std::once_flag driver_load_once;
+static std::atomic<bool> driver_loaded{false};
+static LibHandle libcuda = nullptr;
+
+#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr
+
+DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain);
+DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease);
+DECLARE_DRIVER_FN(cuCtxGetCurrent);
+
+DECLARE_DRIVER_FN(cuStreamCreateWithPriority);
+DECLARE_DRIVER_FN(cuStreamDestroy);
+
+DECLARE_DRIVER_FN(cuEventCreate);
+DECLARE_DRIVER_FN(cuEventDestroy);
+DECLARE_DRIVER_FN(cuIpcOpenEventHandle);
+
+DECLARE_DRIVER_FN(cuDeviceGetCount);
+
+DECLARE_DRIVER_FN(cuMemPoolSetAccess);
+DECLARE_DRIVER_FN(cuMemPoolDestroy);
+DECLARE_DRIVER_FN(cuMemPoolCreate);
+DECLARE_DRIVER_FN(cuDeviceGetMemPool);
+DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle);
+
+DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync);
+DECLARE_DRIVER_FN(cuMemAllocAsync);
+DECLARE_DRIVER_FN(cuMemAlloc);
+DECLARE_DRIVER_FN(cuMemAllocHost);
+
+DECLARE_DRIVER_FN(cuMemFreeAsync);
+DECLARE_DRIVER_FN(cuMemFree);
+DECLARE_DRIVER_FN(cuMemFreeHost);
+
+DECLARE_DRIVER_FN(cuMemPoolImportPointer);
+
+#undef DECLARE_DRIVER_FN
+
+template <typename T>
+static bool load_symbol(const char* sym, T& fn) noexcept {
+    fn = reinterpret_cast<T>(get_symbol(libcuda, sym));
+    return fn != nullptr;
+}
+
+static bool load_driver_api() noexcept {
+    libcuda = open_libcuda();
+    if (!libcuda) {
+        return false;
+    }
+
+    bool ok = true;
+    ok &= load_symbol("cuDevicePrimaryCtxRetain", p_cuDevicePrimaryCtxRetain);
+    ok &= load_symbol("cuDevicePrimaryCtxRelease", p_cuDevicePrimaryCtxRelease);
+    ok &= load_symbol("cuCtxGetCurrent", p_cuCtxGetCurrent);
+
+    ok &= load_symbol("cuStreamCreateWithPriority", p_cuStreamCreateWithPriority);
+    ok &= load_symbol("cuStreamDestroy", p_cuStreamDestroy);
+
+    ok &= load_symbol("cuEventCreate", p_cuEventCreate);
+    ok &= load_symbol("cuEventDestroy", p_cuEventDestroy);
+    ok &= load_symbol("cuIpcOpenEventHandle", 
p_cuIpcOpenEventHandle);
+
+    ok &= load_symbol("cuDeviceGetCount", p_cuDeviceGetCount);
+
+    ok &= load_symbol("cuMemPoolSetAccess", p_cuMemPoolSetAccess);
+    ok &= load_symbol("cuMemPoolDestroy", p_cuMemPoolDestroy);
+    ok &= load_symbol("cuMemPoolCreate", p_cuMemPoolCreate);
+    ok &= load_symbol("cuDeviceGetMemPool", p_cuDeviceGetMemPool);
+    ok &= load_symbol("cuMemPoolImportFromShareableHandle", p_cuMemPoolImportFromShareableHandle);
+
+    ok &= load_symbol("cuMemAllocFromPoolAsync", p_cuMemAllocFromPoolAsync);
+    ok &= load_symbol("cuMemAllocAsync", p_cuMemAllocAsync);
+    ok &= load_symbol("cuMemAlloc", p_cuMemAlloc);
+    ok &= load_symbol("cuMemAllocHost", p_cuMemAllocHost);
+
+    ok &= load_symbol("cuMemFreeAsync", p_cuMemFreeAsync);
+    ok &= load_symbol("cuMemFree", p_cuMemFree);
+    ok &= load_symbol("cuMemFreeHost", p_cuMemFreeHost);
+
+    ok &= load_symbol("cuMemPoolImportPointer", p_cuMemPoolImportPointer);
+
+    return ok;
+}
+
+static bool ensure_driver_loaded() noexcept {
+    std::call_once(driver_load_once, []() { driver_loaded.store(load_driver_api()); });
+    return driver_loaded.load();
+}
+
+} // namespace
+
 // ============================================================================
 // Thread-local error handling
 // ============================================================================
@@ -118,6 +253,10 @@ ContextHandle create_context_handle_ref(CUcontext ctx) {
 thread_local std::vector<ContextHandle> primary_context_cache;
 
 ContextHandle get_primary_context(int device_id) noexcept {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     // Check thread-local cache
     if (static_cast<size_t>(device_id) < primary_context_cache.size()) {
         if (auto cached = primary_context_cache[device_id]) {
@@ -128,7 +267,7 @@ ContextHandle get_primary_context(int device_id) noexcept {
     // Cache miss - acquire primary context from driver
     GILReleaseGuard gil;
     CUcontext ctx;
-    if (CUDA_SUCCESS != (err = cuDevicePrimaryCtxRetain(&ctx, device_id))) {
+    if (CUDA_SUCCESS != (err = p_cuDevicePrimaryCtxRetain(&ctx, device_id))) {
         return {};
     }
 
@@ -136,7 +275,9 @@
         new ContextBox{ctx},
         [device_id](const ContextBox* b) {
             GILReleaseGuard gil;
-            cuDevicePrimaryCtxRelease(device_id);
+            if (ensure_driver_loaded()) {
+                p_cuDevicePrimaryCtxRelease(device_id);
+            }
             delete b;
         }
     );
@@ -151,9 +292,13 @@
 }
 
 ContextHandle get_current_context() noexcept {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUcontext ctx = nullptr;
-    if (CUDA_SUCCESS != (err = cuCtxGetCurrent(&ctx))) {
+    if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) {
         return {};
     }
     if (!ctx) {
@@ -171,9 +316,13 @@
 struct StreamBox {
 };
 
 StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUstream stream;
-    if (CUDA_SUCCESS != (err = cuStreamCreateWithPriority(&stream, flags, priority))) {
+    if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) {
         return {};
     }
 
@@ -181,7 +330,9 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int p
         new StreamBox{stream},
         [h_ctx](const StreamBox* b) {
             GILReleaseGuard gil;
-            cuStreamDestroy(b->resource);
+            if (ensure_driver_loaded()) {
+                p_cuStreamDestroy(b->resource);
+            }
             delete b;
         }
     );
@@ -227,9 +378,13 @@
 struct EventBox {
 };
 
 EventHandle 
create_event_handle(ContextHandle h_ctx, unsigned int flags) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUevent event; - if (CUDA_SUCCESS != (err = cuEventCreate(&event, flags))) { + if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { return {}; } @@ -237,7 +392,9 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { new EventBox{event}, [h_ctx](const EventBox* b) { GILReleaseGuard gil; - cuEventDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuEventDestroy(b->resource); + } delete b; } ); @@ -249,9 +406,13 @@ EventHandle create_event_handle(unsigned int flags) { } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUevent event; - if (CUDA_SUCCESS != (err = cuIpcOpenEventHandle(&event, ipc_handle))) { + if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { return {}; } @@ -259,7 +420,9 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { new EventBox{event}, [](const EventBox* b) { GILReleaseGuard gil; - cuEventDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuEventDestroy(b->resource); + } delete b; } ); @@ -277,8 +440,11 @@ struct MemoryPoolBox { // Helper to clear peer access before destroying a memory pool. // Works around nvbug 5698116: recycled pool handles inherit peer access state. static void clear_mempool_peer_access(CUmemoryPool pool) { + if (!ensure_driver_loaded()) { + return; + } int device_count = 0; - if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { return; } @@ -288,7 +454,7 @@ static void clear_mempool_peer_access(CUmemoryPool pool) { clear_access[i].location.id = i; clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; } - cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort + p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort } static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { @@ -297,7 +463,9 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { [](const MemoryPoolBox* b) { GILReleaseGuard gil; clear_mempool_peer_access(b->resource); - cuMemPoolDestroy(b->resource); + if (ensure_driver_loaded()) { + p_cuMemPoolDestroy(b->resource); + } delete b; } ); @@ -305,9 +473,13 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUmemoryPool pool; - if (CUDA_SUCCESS != (err = cuMemPoolCreate(&pool, &props))) { + if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { return {}; } return wrap_mempool_owned(pool); @@ -319,19 +491,27 @@ MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { } MemoryPoolHandle get_device_mempool(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } GILReleaseGuard gil; CUmemoryPool pool; - if (CUDA_SUCCESS != (err = cuDeviceGetMemPool(&pool, device_id))) { + if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { return {}; } return create_mempool_handle_ref(pool); } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + if (!ensure_driver_loaded()) { + err = 
CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUmemoryPool pool;
     auto handle_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(fd));
-    if (CUDA_SUCCESS != (err = cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) {
+    if (CUDA_SUCCESS != (err = p_cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) {
         return {};
     }
     return wrap_mempool_owned(pool);
@@ -362,9 +542,13 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) {
 }
 
 DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) {
         return {};
     }
 
@@ -372,7 +556,9 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
         new DevicePtrBox{ptr, h_stream},
         [h_pool](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -380,9 +566,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocAsync(&ptr, size, native(h_stream)))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, native(h_stream)))) {
         return {};
     }
 
@@ -390,7 +580,9 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
         new DevicePtrBox{ptr, h_stream},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -398,9 +590,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc(size_t size) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
-    if (CUDA_SUCCESS != (err = cuMemAlloc(&ptr, size))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
         return {};
     }
 
@@ -408,7 +604,9 @@ DevicePtrHandle deviceptr_alloc(size_t size) {
         new DevicePtrBox{ptr, StreamHandle{}},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFree(b->resource);
+            if (ensure_driver_loaded()) {
+                p_cuMemFree(b->resource);
+            }
             delete b;
         }
     );
@@ -416,9 +614,13 @@
 }
 
 DevicePtrHandle deviceptr_alloc_host(size_t size) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     void* ptr;
-    if (CUDA_SUCCESS != (err = cuMemAllocHost(&ptr, size))) {
+    if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
         return {};
     }
 
@@ -426,7 +628,9 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) {
         new DevicePtrBox{reinterpret_cast<CUdeviceptr>(ptr), StreamHandle{}},
         [](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeHost(reinterpret_cast<void*>(b->resource));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeHost(reinterpret_cast<void*>(b->resource));
+            }
             delete b;
         }
     );
@@ -473,11 +677,15 @@
 static std::mutex ipc_ptr_cache_mutex;
 static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
 
 DevicePtrHandle 
deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
+    if (!ensure_driver_loaded()) {
+        err = CUDA_ERROR_NOT_INITIALIZED;
+        return {};
+    }
     GILReleaseGuard gil;
     CUdeviceptr ptr;
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
-    if (CUDA_SUCCESS != (err = cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+    if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
        return {};
     }
 
@@ -502,7 +710,9 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
                 ipc_ptr_cache.erase(ptr);
             }
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -515,7 +725,9 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         new DevicePtrBox{ptr, h_stream},
         [h_pool](DevicePtrBox* b) {
             GILReleaseGuard gil;
-            cuMemFreeAsync(b->resource, native(b->h_stream));
+            if (ensure_driver_loaded()) {
+                p_cuMemFreeAsync(b->resource, native(b->h_stream));
+            }
             delete b;
         }
     );
@@ -523,4 +735,60 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
     }
 }
 
+// ============================================================================
+// Capsule C++ API table
+// ============================================================================
+
+const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept {
+    static const ResourceHandlesCxxApiV1 table = []() {
+        ResourceHandlesCxxApiV1 t{};
+        t.abi_version = RESOURCE_HANDLES_CXX_API_VERSION;
+        t.struct_size = static_cast<std::uint32_t>(sizeof(ResourceHandlesCxxApiV1));
+
+        // Error handling
+        t.get_last_error = &get_last_error;
+        t.peek_last_error = &peek_last_error;
+        t.clear_last_error = &clear_last_error;
+
+        // Context
+        t.create_context_handle_ref = &create_context_handle_ref;
+        t.get_primary_context = &get_primary_context;
+        t.get_current_context = &get_current_context;
+
+        // Stream
+        t.create_stream_handle = &create_stream_handle;
+        t.create_stream_handle_ref = &create_stream_handle_ref;
+        t.create_stream_handle_with_owner = &create_stream_handle_with_owner;
+        t.get_legacy_stream = &get_legacy_stream;
+        t.get_per_thread_stream = &get_per_thread_stream;
+
+        // Event (resolve overloads explicitly)
+        t.create_event_handle =
+            static_cast<EventHandle (*)(ContextHandle, unsigned int)>(&create_event_handle);
+        t.create_event_handle_noctx =
+            static_cast<EventHandle (*)(unsigned int)>(&create_event_handle);
+        t.create_event_handle_ipc = &create_event_handle_ipc;
+
+        // Memory pool
+        t.create_mempool_handle = &create_mempool_handle;
+        t.create_mempool_handle_ref = &create_mempool_handle_ref;
+        t.get_device_mempool = &get_device_mempool;
+        t.create_mempool_handle_ipc = &create_mempool_handle_ipc;
+
+        // Device pointer
+        t.deviceptr_alloc_from_pool = &deviceptr_alloc_from_pool;
+        t.deviceptr_alloc_async = &deviceptr_alloc_async;
+        t.deviceptr_alloc = &deviceptr_alloc;
+        t.deviceptr_alloc_host = &deviceptr_alloc_host;
+        t.deviceptr_create_ref = &deviceptr_create_ref;
+        t.deviceptr_create_with_owner = &deviceptr_create_with_owner;
+        t.deviceptr_import_ipc = &deviceptr_import_ipc;
+        t.deallocation_stream = &deallocation_stream;
+        t.set_deallocation_stream = &set_deallocation_stream;
+
+        return t;
+    }();
+    return &table;
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp
new file mode 100644
index 0000000000..5436b761f5
--- /dev/null
+++ 
b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp
@@ -0,0 +1,80 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+
+#include "resource_handles.hpp"
+
+namespace cuda_core {
+
+// C++ capsule API for cross-extension-module calls.
+//
+// The function-pointer table is exported from the Python extension module
+// `cuda.core.experimental._resource_handles` as a PyCapsule named:
+//
+//     "cuda.core.experimental._resource_handles._CXX_API"
+//
+// Other extension modules import the capsule and dispatch through the table to
+// ensure there is a single owner of all correctness-critical static/thread_local
+// state in resource_handles.cpp (caches, last-error state, etc.).
+
+static constexpr std::uint32_t RESOURCE_HANDLES_CXX_API_VERSION = 1;
+
+struct ResourceHandlesCxxApiV1 {
+    std::uint32_t abi_version;
+    std::uint32_t struct_size;
+
+    // Thread-local error handling
+    CUresult (*get_last_error)() noexcept;
+    CUresult (*peek_last_error)() noexcept;
+    void (*clear_last_error)() noexcept;
+
+    // Context handles
+    ContextHandle (*create_context_handle_ref)(CUcontext ctx);
+    ContextHandle (*get_primary_context)(int device_id) noexcept;
+    ContextHandle (*get_current_context)() noexcept;
+
+    // Stream handles
+    StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority);
+    StreamHandle (*create_stream_handle_ref)(CUstream stream);
+    StreamHandle (*create_stream_handle_with_owner)(CUstream stream, PyObject* owner);
+    StreamHandle (*get_legacy_stream)() noexcept;
+    StreamHandle (*get_per_thread_stream)() noexcept;
+
+    // Event handles
+    EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags);
+    EventHandle (*create_event_handle_noctx)(unsigned int flags);
+    EventHandle (*create_event_handle_ipc)(const CUipcEventHandle& ipc_handle);
+
+    // Memory pool handles
+    MemoryPoolHandle (*create_mempool_handle)(const CUmemPoolProps& props);
+    MemoryPoolHandle (*create_mempool_handle_ref)(CUmemoryPool pool);
+    MemoryPoolHandle (*get_device_mempool)(int device_id) noexcept;
+    MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, CUmemAllocationHandleType handle_type);
+
+    // Device pointer handles
+    DevicePtrHandle (*deviceptr_alloc_from_pool)(
+        size_t size,
+        MemoryPoolHandle h_pool,
+        StreamHandle h_stream);
+    DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream);
+    DevicePtrHandle (*deviceptr_alloc)(size_t size);
+    DevicePtrHandle (*deviceptr_alloc_host)(size_t size);
+    DevicePtrHandle (*deviceptr_create_ref)(CUdeviceptr ptr);
+    DevicePtrHandle (*deviceptr_create_with_owner)(CUdeviceptr ptr, PyObject* owner);
+    DevicePtrHandle (*deviceptr_import_ipc)(
+        MemoryPoolHandle h_pool,
+        const void* export_data,
+        StreamHandle h_stream);
+    StreamHandle (*deallocation_stream)(const DevicePtrHandle& h);
+    void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream);
+};
+
+// Return pointer to a process-wide singleton table. 
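+// The table is a function-local static in resource_handles.cpp, so its
+// initialization on first use is thread-safe (C++11 magic statics).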
+const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; + +} // namespace cuda_core + diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 72618e40d6..2305199ffc 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,14 +5,14 @@ from __future__ import annotations cimport cpython +from cpython.pycapsule cimport PyCapsule_Import from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context +from cuda.core.experimental._resource_handles_cxx_api cimport ResourceHandlesCxxApiV1 from cuda.core.experimental._resource_handles cimport ( ContextHandle, EventHandle, - create_event_handle, - create_event_handle_ipc, intptr, native, py, @@ -34,6 +34,23 @@ if TYPE_CHECKING: import cuda.bindings +cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" +cdef const ResourceHandlesCxxApiV1* _handles = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles() except NULL: + global _handles + if _handles == NULL: + _handles = PyCapsule_Import(_CXX_API_NAME, 0) + if _handles == NULL: + raise ImportError("Failed to import cuda.core.experimental._resource_handles._CXX_API capsule") + if _handles.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles.struct_size < cython.sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles + + @dataclass cdef class EventOptions: """Customizable :obj:`~_event.Event` options. @@ -116,7 +133,8 @@ cdef class Event: if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") # C++ creates the event and returns owning handle with context dependency - cdef EventHandle h_event = create_event_handle(h_context, flags) + cdef const ResourceHandlesCxxApiV1* handles = _get_handles() + cdef EventHandle h_event = handles.create_event_handle(h_context, flags) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event @@ -199,7 +217,8 @@ cdef class Event: memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) # IPC events: the originating process owns the event and its context - cdef EventHandle h_event = create_event_handle_ipc(data) + cdef const ResourceHandlesCxxApiV1* handles = _get_handles() + cdef EventHandle h_event = handles.create_event_handle_ipc(data) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 6395f21e2a..47fc1dc1c4 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -5,3 +5,23 @@ # This module exists to compile _cpp/resource_handles.cpp into a shared library. # The helper functions (native, intptr, py) are implemented as inline C++ functions # in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. + +from cpython.pycapsule cimport PyCapsule_New + +from cuda.core.experimental._resource_handles_cxx_api cimport ( + ResourceHandlesCxxApiV1, + get_resource_handles_cxx_api_v1, +) + + +cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" + +# Export the C++ handles dispatch table as a PyCapsule. 
+# Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. +cdef const ResourceHandlesCxxApiV1* _handles_table = get_resource_handles_cxx_api_v1() +if _handles_table == NULL: + raise RuntimeError("Failed to initialize resource handles C++ API table") + +_CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) +if _CXX_API is None: + raise RuntimeError("Failed to create _CXX_API capsule") diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd new file mode 100644 index 0000000000..f8a4874908 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uint32_t +from libc.stddef cimport size_t + +from cuda.bindings cimport cydriver +from cuda.core.experimental._resource_handles cimport ( + ContextHandle, + DevicePtrHandle, + EventHandle, + MemoryPoolHandle, + StreamHandle, +) + + +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + From 
ab164569718066f8e34d1474c55a9418556663de Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 16 Dec 2025 14:31:54 -0800
Subject: [PATCH 24/38] Resolve CUDA driver entrypoints via cuda-bindings
 cuGetProcAddress

Use a lazy PyCapsule in _resource_handles to resolve and cache required
CUDA driver entrypoints via cuda.bindings.driver.cuGetProcAddress, and have
resource_handles.cpp consume that table on first use. This avoids
duplicating driver pathfinding logic and removes dlopen/dlsym linkage
requirements.
---
 cuda_core/build_hooks.py                      |  10 +-
 .../experimental/_cpp/resource_handles.cpp    | 192 +++++++++++-------
 .../core/experimental/_resource_handles.pyx   | 110 ++++++++++
 3 files changed, 229 insertions(+), 83 deletions(-)

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 80a96e0bc2..1f1197e9bc 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -12,7 +12,6 @@
 import os
 import re
 import subprocess
-import sys
 
 from Cython.Build import cythonize
 from setuptools import Extension
@@ -99,14 +98,7 @@ def get_sources(mod_name):
     def get_extension_kwargs(mod_name):
         """Return Extension kwargs (libraries, etc.) per module."""
 
-        kwargs = {}
-
-        # _resource_handles.cpp uses dlopen/dlsym on Linux, which requires -ldl on glibc < 2.34.
-        # (On Windows it uses LoadLibrary/GetProcAddress; on macOS dlopen is in libSystem.)
-        if sys.platform.startswith("linux") and mod_name == "_resource_handles":
-            kwargs["libraries"] = ["dl"]
-
-        return kwargs
+        return {}
 
     ext_modules = tuple(
         Extension(
diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
index 3ad8ea5dc4..a4f872566b 100644
--- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp
@@ -7,15 +7,7 @@
 #include "resource_handles.hpp"
 #include "resource_handles_cxx_api.hpp"
 #include
-#include
-
-#if defined(_WIN32)
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#include <atomic>
+#include <cstdint>
 #include
 #include
 
@@ -23,42 +15,13 @@ namespace cuda_core {
 
 // ============================================================================
-// CUDA driver dynamic loading (CPU-only import + MVC compatibility)
+// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC)
 // ============================================================================
 
 namespace {
 
-#if defined(_WIN32)
-using LibHandle = HMODULE;
-
-static LibHandle open_libcuda() noexcept {
-    // CUDA driver DLL
-    return LoadLibraryA("nvcuda.dll");
-}
-
-static void* get_symbol(LibHandle lib, const char* name) noexcept {
-    return reinterpret_cast<void*>(GetProcAddress(lib, name));
-}
-#else
-using LibHandle = void*;
-
-static LibHandle open_libcuda() noexcept {
-    // Prefer the soname; fall back to the linker name.
-    LibHandle lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
-    if (!lib) {
-        lib = dlopen("libcuda.so", RTLD_NOW | RTLD_LOCAL);
-    }
-    return lib;
-}
-
-static void* get_symbol(LibHandle lib, const char* name) noexcept {
-    return dlsym(lib, name);
-}
-#endif
-
 static std::once_flag driver_load_once;
-static std::atomic<bool> driver_loaded{false};
-static LibHandle libcuda = nullptr;
+static bool driver_loaded = false;
 
 #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr
 
@@ -94,55 +57,136 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer);
 
 #undef DECLARE_DRIVER_FN
 
-template <typename T>
-static bool load_symbol(const char* sym, T& fn) noexcept {
-    fn = reinterpret_cast<T>(get_symbol(libcuda, sym));
-    return fn != nullptr;
-}
-
 static bool load_driver_api() noexcept {
-    libcuda = open_libcuda();
-    if (!libcuda) {
+    if (!Py_IsInitialized() || _Py_IsFinalizing()) {
+        return false;
+    }
+
+    struct CudaDriverApiV1 {
+        std::uint32_t abi_version;
+        std::uint32_t struct_size;
+
+        std::uintptr_t cuDevicePrimaryCtxRetain;
+        std::uintptr_t cuDevicePrimaryCtxRelease;
+        std::uintptr_t cuCtxGetCurrent;
+
+        std::uintptr_t cuStreamCreateWithPriority;
+        std::uintptr_t cuStreamDestroy;
+
+        std::uintptr_t cuEventCreate;
+        std::uintptr_t cuEventDestroy;
+        std::uintptr_t cuIpcOpenEventHandle;
+
+        std::uintptr_t cuDeviceGetCount;
+
+        std::uintptr_t cuMemPoolSetAccess;
+        std::uintptr_t cuMemPoolDestroy;
+        std::uintptr_t cuMemPoolCreate;
+        std::uintptr_t cuDeviceGetMemPool;
+        std::uintptr_t cuMemPoolImportFromShareableHandle;
+
+        std::uintptr_t cuMemAllocFromPoolAsync;
+        std::uintptr_t cuMemAllocAsync;
+        std::uintptr_t cuMemAlloc;
+        std::uintptr_t cuMemAllocHost;
+
+        std::uintptr_t cuMemFreeAsync;
+        std::uintptr_t cuMemFree;
+        std::uintptr_t cuMemFreeHost;
+
+        std::uintptr_t cuMemPoolImportPointer;
+    };
+
+    static constexpr const char* capsule_name =
+        "cuda.core.experimental._resource_handles._CUDA_DRIVER_API_V1";
+
+    PyGILState_STATE gstate = PyGILState_Ensure();
+
+    // `_resource_handles` is already loaded (it exports the handle API capsule),
+    // so avoid import machinery and just grab the module object. 
+    PyObject* mod = PyImport_AddModule("cuda.core.experimental._resource_handles");  // borrowed
+    if (!mod) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+
+    PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule");  // new ref
+    if (!fn) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+
+    PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr);
+    Py_DECREF(fn);
+    if (!cap) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
         return false;
     }
 
-    bool ok = true;
-    ok &= load_symbol("cuDevicePrimaryCtxRetain", p_cuDevicePrimaryCtxRetain);
-    ok &= load_symbol("cuDevicePrimaryCtxRelease", p_cuDevicePrimaryCtxRelease);
-    ok &= load_symbol("cuCtxGetCurrent", p_cuCtxGetCurrent);
+    const auto* api = static_cast<const CudaDriverApiV1*>(PyCapsule_GetPointer(cap, capsule_name));
+    Py_DECREF(cap);
 
-    ok &= load_symbol("cuStreamCreateWithPriority", p_cuStreamCreateWithPriority);
-    ok &= load_symbol("cuStreamDestroy", p_cuStreamDestroy);
+    if (!api) {
+        PyErr_Clear();
+        PyGILState_Release(gstate);
+        return false;
+    }
+    if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) {
+        PyGILState_Release(gstate);
+        return false;
+    }
 
-    ok &= load_symbol("cuEventCreate", p_cuEventCreate);
-    ok &= load_symbol("cuEventDestroy", p_cuEventDestroy);
-    ok &= load_symbol("cuIpcOpenEventHandle", p_cuIpcOpenEventHandle);
+#define LOAD_ADDR(name)                                    \
+    do {                                                   \
+        if (api->name == 0) {                              \
+            PyGILState_Release(gstate);                    \
+            return false;                                  \
+        }                                                  \
+        p_##name = reinterpret_cast<name##_t>(api->name);  \
+    } while (0)
 
-    ok &= load_symbol("cuDeviceGetCount", p_cuDeviceGetCount);
+    LOAD_ADDR(cuDevicePrimaryCtxRetain);
+    LOAD_ADDR(cuDevicePrimaryCtxRelease);
+    LOAD_ADDR(cuCtxGetCurrent);
 
-    ok &= load_symbol("cuMemPoolSetAccess", p_cuMemPoolSetAccess);
-    ok &= load_symbol("cuMemPoolDestroy", p_cuMemPoolDestroy);
-    ok &= load_symbol("cuMemPoolCreate", p_cuMemPoolCreate);
-    ok &= load_symbol("cuDeviceGetMemPool", p_cuDeviceGetMemPool);
-    ok &= load_symbol("cuMemPoolImportFromShareableHandle", p_cuMemPoolImportFromShareableHandle);
+    LOAD_ADDR(cuStreamCreateWithPriority);
+    LOAD_ADDR(cuStreamDestroy);
 
-    ok &= load_symbol("cuMemAllocFromPoolAsync", p_cuMemAllocFromPoolAsync);
-    ok &= load_symbol("cuMemAllocAsync", p_cuMemAllocAsync);
-    ok &= load_symbol("cuMemAlloc", p_cuMemAlloc);
-    ok &= load_symbol("cuMemAllocHost", p_cuMemAllocHost);
+    LOAD_ADDR(cuEventCreate);
+    LOAD_ADDR(cuEventDestroy);
+    LOAD_ADDR(cuIpcOpenEventHandle);
 
-    ok &= load_symbol("cuMemFreeAsync", p_cuMemFreeAsync);
-    ok &= load_symbol("cuMemFree", p_cuMemFree);
-    ok &= load_symbol("cuMemFreeHost", p_cuMemFreeHost);
+    LOAD_ADDR(cuDeviceGetCount);
 
-    ok &= load_symbol("cuMemPoolImportPointer", p_cuMemPoolImportPointer);
+    LOAD_ADDR(cuMemPoolSetAccess);
+    LOAD_ADDR(cuMemPoolDestroy);
+    LOAD_ADDR(cuMemPoolCreate);
+    LOAD_ADDR(cuDeviceGetMemPool);
+    LOAD_ADDR(cuMemPoolImportFromShareableHandle);
 
-    return ok;
+    LOAD_ADDR(cuMemAllocFromPoolAsync);
+    LOAD_ADDR(cuMemAllocAsync);
+    LOAD_ADDR(cuMemAlloc);
+    LOAD_ADDR(cuMemAllocHost);
+
+    LOAD_ADDR(cuMemFreeAsync);
+    LOAD_ADDR(cuMemFree);
+    LOAD_ADDR(cuMemFreeHost);
+
+    LOAD_ADDR(cuMemPoolImportPointer);
+
+#undef LOAD_ADDR
+
+    PyGILState_Release(gstate);
+    return true;
 }
 
 static bool ensure_driver_loaded() noexcept {
-    std::call_once(driver_load_once, []() { driver_loaded.store(load_driver_api()); });
-    return driver_loaded.load();
+    std::call_once(driver_load_once, []() { driver_loaded = load_driver_api(); });
+    return driver_loaded;
 }
 
 } // namespace
diff --git 
a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 47fc1dc1c4..8ddb44c175 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -7,14 +7,18 @@ # in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. from cpython.pycapsule cimport PyCapsule_New +from libc.stdint cimport uint32_t, uint64_t, uintptr_t from cuda.core.experimental._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) +import cython + cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" +cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core.experimental._resource_handles._CUDA_DRIVER_API_V1" # Export the C++ handles dispatch table as a PyCapsule. # Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. @@ -25,3 +29,109 @@ if _handles_table == NULL: _CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) if _CXX_API is None: raise RuntimeError("Failed to create _CXX_API capsule") + + +cdef struct CudaDriverApiV1: + uint32_t abi_version + uint32_t struct_size + + uintptr_t cuDevicePrimaryCtxRetain + uintptr_t cuDevicePrimaryCtxRelease + uintptr_t cuCtxGetCurrent + + uintptr_t cuStreamCreateWithPriority + uintptr_t cuStreamDestroy + + uintptr_t cuEventCreate + uintptr_t cuEventDestroy + uintptr_t cuIpcOpenEventHandle + + uintptr_t cuDeviceGetCount + + uintptr_t cuMemPoolSetAccess + uintptr_t cuMemPoolDestroy + uintptr_t cuMemPoolCreate + uintptr_t cuDeviceGetMemPool + uintptr_t cuMemPoolImportFromShareableHandle + + uintptr_t cuMemAllocFromPoolAsync + uintptr_t cuMemAllocAsync + uintptr_t cuMemAlloc + uintptr_t cuMemAllocHost + + uintptr_t cuMemFreeAsync + uintptr_t cuMemFree + uintptr_t cuMemFreeHost + + uintptr_t cuMemPoolImportPointer + + +cdef CudaDriverApiV1 _cuda_driver_api_v1 +cdef bint _cuda_driver_api_v1_inited = False + + +cdef inline uintptr_t _as_addr(object pfn) except 0: + return int(pfn) + + +cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: + err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) + if int(err) != 0 or pfn is None: + raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") + return _as_addr(pfn) + + +def _get_cuda_driver_api_v1_capsule(): + """Return a PyCapsule containing cached CUDA driver entrypoints. + + This is evaluated lazily on first use so cuda-core remains importable on + CPU-only machines. 
+ """ + global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 + if not _cuda_driver_api_v1_inited: + import cuda.bindings.driver as d + + err, ver = d.cuDriverGetVersion() + if int(err) != 0: + raise RuntimeError(f"cuDriverGetVersion failed: {err}") + driver_ver = int(ver) + + flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT + + _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") + _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") + _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") + + _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") + _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") + + _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") + _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") + _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") + + _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") + + _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") + _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") + _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") + _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") + _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( + d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" + ) + + _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") + _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") + _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, b"cuMemAlloc") + _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") + + _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") + _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") + _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") + + _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") + + _cuda_driver_api_v1.abi_version = 1 + _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) + _cuda_driver_api_v1_inited = True + + return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) From 3fafe926952699b825b4db316c5696550640a453 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 15:21:09 -0800 Subject: [PATCH 25/38] Centralize resource handles capsule dispatch in _resource_handles.pxd Hide _CXX_API capsule import/version checks behind inline pxd wrappers so call sites stay clean, and remove redundant ensure_driver_loaded() checks in C++ deleters. 
--- .../experimental/_cpp/resource_handles.cpp | 47 +-- cuda_core/cuda/core/experimental/_event.pyx | 27 +- .../cuda/core/experimental/_memoryview.pyx | 4 +- .../core/experimental/_resource_handles.pxd | 344 +++++++++++------- cuda_core/cuda/core/experimental/_stream.pyx | 7 +- 5 files changed, 227 insertions(+), 202 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index a4f872566b..9f4e21ca24 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -319,9 +319,7 @@ ContextHandle get_primary_context(int device_id) noexcept { new ContextBox{ctx}, [device_id](const ContextBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuDevicePrimaryCtxRelease(device_id); - } + p_cuDevicePrimaryCtxRelease(device_id); delete b; } ); @@ -374,9 +372,7 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int p new StreamBox{stream}, [h_ctx](const StreamBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuStreamDestroy(b->resource); - } + p_cuStreamDestroy(b->resource); delete b; } ); @@ -436,9 +432,7 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { new EventBox{event}, [h_ctx](const EventBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuEventDestroy(b->resource); - } + p_cuEventDestroy(b->resource); delete b; } ); @@ -464,9 +458,7 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { new EventBox{event}, [](const EventBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuEventDestroy(b->resource); - } + p_cuEventDestroy(b->resource); delete b; } ); @@ -484,9 +476,6 @@ struct MemoryPoolBox { // Helper to clear peer access before destroying a memory pool. // Works around nvbug 5698116: recycled pool handles inherit peer access state. 
static void clear_mempool_peer_access(CUmemoryPool pool) { - if (!ensure_driver_loaded()) { - return; - } int device_count = 0; if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { return; @@ -507,9 +496,7 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { [](const MemoryPoolBox* b) { GILReleaseGuard gil; clear_mempool_peer_access(b->resource); - if (ensure_driver_loaded()) { - p_cuMemPoolDestroy(b->resource); - } + p_cuMemPoolDestroy(b->resource); delete b; } ); @@ -600,9 +587,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -624,9 +609,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { new DevicePtrBox{ptr, h_stream}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -648,9 +631,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) { new DevicePtrBox{ptr, StreamHandle{}}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFree(b->resource); - } + p_cuMemFree(b->resource); delete b; } ); @@ -672,9 +653,7 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) { new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, [](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeHost(reinterpret_cast(b->resource)); - } + p_cuMemFreeHost(reinterpret_cast(b->resource)); delete b; } ); @@ -754,9 +733,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export ipc_ptr_cache.erase(ptr); } GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); @@ -769,9 +746,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export new DevicePtrBox{ptr, h_stream}, [h_pool](DevicePtrBox* b) { GILReleaseGuard gil; - if (ensure_driver_loaded()) { - p_cuMemFreeAsync(b->resource, native(b->h_stream)); - } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); delete b; } ); diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 2305199ffc..72618e40d6 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -5,14 +5,14 @@ from __future__ import annotations cimport cpython -from cpython.pycapsule cimport PyCapsule_Import from libc.string cimport memcpy from cuda.bindings cimport cydriver from cuda.core.experimental._context cimport Context -from cuda.core.experimental._resource_handles_cxx_api cimport ResourceHandlesCxxApiV1 from cuda.core.experimental._resource_handles cimport ( ContextHandle, EventHandle, + create_event_handle, + create_event_handle_ipc, intptr, native, py, @@ -34,23 +34,6 @@ if TYPE_CHECKING: import cuda.bindings -cdef const char* _CXX_API_NAME = b"cuda.core.experimental._resource_handles._CXX_API" -cdef const ResourceHandlesCxxApiV1* _handles = NULL - - -cdef inline const ResourceHandlesCxxApiV1* _get_handles() except NULL: - global _handles - if _handles == NULL: - _handles = PyCapsule_Import(_CXX_API_NAME, 0) - if _handles == NULL: - raise ImportError("Failed to import 
cuda.core.experimental._resource_handles._CXX_API capsule") - if _handles.abi_version != 1: - raise ImportError("Unsupported resource handles C++ API version") - if _handles.struct_size < cython.sizeof(ResourceHandlesCxxApiV1): - raise ImportError("Resource handles C++ API table is too small") - return _handles - - @dataclass cdef class EventOptions: """Customizable :obj:`~_event.Event` options. @@ -133,8 +116,7 @@ cdef class Event: if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") # C++ creates the event and returns owning handle with context dependency - cdef const ResourceHandlesCxxApiV1* handles = _get_handles() - cdef EventHandle h_event = handles.create_event_handle(h_context, flags) + cdef EventHandle h_event = create_event_handle(h_context, flags) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event @@ -217,8 +199,7 @@ cdef class Event: memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) # IPC events: the originating process owns the event and its context - cdef const ResourceHandlesCxxApiV1* handles = _get_handles() - cdef EventHandle h_event = handles.create_event_handle_ipc(data) + cdef EventHandle h_event = create_event_handle_ipc(data) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9f1119894a..443929b27e 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -16,7 +16,7 @@ import numpy from cuda.bindings cimport cydriver from cuda.core.experimental._resource_handles cimport ( EventHandle, - create_event_handle, + create_event_handle_noctx, native, ) from cuda.core.experimental._utils.cuda_utils import handle_return, driver @@ -596,7 +596,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord( native(h_event), producer_s)) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pxd b/cuda_core/cuda/core/experimental/_resource_handles.pxd index 6aa204efc6..5bfc1821e4 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles.pxd @@ -2,169 +2,237 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t +from libc.stddef cimport size_t +from libc.stdint cimport intptr_t, uint32_t from libcpp.memory cimport shared_ptr +from cpython.pycapsule cimport PyCapsule_Import + from cuda.bindings cimport cydriver -# Declare the C++ namespace and types +# Declare the C++ namespace and types (inline helpers live in the header). 
cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": - # ======================================================================== - # Thread-local error handling - # ======================================================================== - cydriver.CUresult get_last_error() nogil - cydriver.CUresult peek_last_error() nogil - void clear_last_error() nogil - - # ======================================================================== - # Context Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle - - # Function to create a non-owning context handle (references existing context) - ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) nogil - - # Context acquisition functions (pure C++, nogil-safe with thread-local caching) - ContextHandle get_primary_context(int device_id) nogil - ContextHandle get_current_context() nogil - - # ======================================================================== - # Stream Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUstream] StreamHandle - - # Create an owning stream handle via cuStreamCreateWithPriority - # Context handle establishes structural dependency (context outlives stream) - # Returns empty handle on error (caller must check) - StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) nogil - - # Create a non-owning stream handle (stream NOT destroyed when handle released) - # Caller is responsible for keeping the stream's context alive - StreamHandle create_stream_handle_ref(cydriver.CUstream stream) nogil - - # Create non-owning handle that prevents Python owner from being GC'd - # Owner is responsible for keeping the stream's context alive - StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) - - # Get non-owning handle to the legacy default stream (no context dependency) - StreamHandle get_legacy_stream() nogil - - # Get non-owning handle to the per-thread default stream (no context dependency) - StreamHandle get_per_thread_stream() nogil - - # ======================================================================== - # Event Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUevent] EventHandle - - # Create an owning event handle via cuEventCreate - # Context handle establishes structural dependency (context outlives event) - # Returns empty handle on error (caller must check) - EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) nogil - - # Create an owning event handle without context dependency - # Use for temporary events that are created and destroyed in the same scope - # Returns empty handle on error (caller must check) - EventHandle create_event_handle(unsigned int flags) nogil - - # Create an owning event handle from IPC handle - # The originating process owns the event and its context - # Returns empty handle on error (caller must check) - EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) nogil - - # ======================================================================== - # Memory Pool Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle - - # Create an owning memory pool handle via cuMemPoolCreate - # Memory pools are device-scoped (not context-scoped) - # 
Returns empty handle on error (caller must check) - MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) nogil - - # Create a non-owning memory pool handle (pool NOT destroyed when released) - # Use for device default/current pools managed by the driver - MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) nogil - - # Get non-owning handle to the current memory pool for a device - # Returns empty handle on error (caller must check) - MemoryPoolHandle get_device_mempool(int device_id) nogil - - # Create an owning memory pool handle from IPC import - # File descriptor NOT owned by this handle (caller manages FD separately) - # Returns empty handle on error (caller must check) - MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil - - # ======================================================================== - # Device Pointer Handle - # ======================================================================== ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle - # Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync - # Pool handle is captured in deleter to keep pool alive - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_from_pool( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) nogil - - # Allocate device memory asynchronously via cuMemAllocAsync - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) nogil - - # Allocate device memory synchronously via cuMemAlloc - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc(size_t size) nogil - - # Allocate pinned host memory via cuMemAllocHost - # Returns empty handle on error (caller must check) - DevicePtrHandle deviceptr_alloc_host(size_t size) nogil - - # Create a non-owning device pointer handle (pointer NOT freed when released) - # Use for foreign pointers from external libraries - DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) nogil - - # Create non-owning handle that prevents Python owner from being GC'd - # Pointer NOT freed when released; owner's refcount decremented on release - # If owner is None, equivalent to deviceptr_create_ref - DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) - - # Import a device pointer from IPC via cuMemPoolImportPointer - # Note: Does not yet implement reference counting for nvbug 5570902 - # On error, returns empty handle and sets thread-local error (use get_last_error()) - DevicePtrHandle deviceptr_import_ipc( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream) nogil - - # Access the deallocation stream for a device pointer handle (read-only) - StreamHandle deallocation_stream(const DevicePtrHandle& h) nogil - - # Set the deallocation stream for a device pointer handle - void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) nogil - - # ======================================================================== - # Overloaded helper functions (C++ handles dispatch by type) - # ======================================================================== - - # native() - extract the raw CUDA handle + # native() - extract the raw CUDA handle (inline C++) cydriver.CUcontext native(ContextHandle h) nogil cydriver.CUstream native(StreamHandle h) nogil cydriver.CUevent native(EventHandle h) nogil cydriver.CUmemoryPool 
native(MemoryPoolHandle h) nogil cydriver.CUdeviceptr native(DevicePtrHandle h) nogil - # intptr() - extract handle as intptr_t for Python interop - # Using signed intptr_t per C standard convention and issue #1342 + # intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t intptr(ContextHandle h) nogil intptr_t intptr(StreamHandle h) nogil intptr_t intptr(EventHandle h) nogil intptr_t intptr(MemoryPoolHandle h) nogil intptr_t intptr(DevicePtrHandle h) nogil - # py() - convert handle to Python driver wrapper object (requires GIL) + # py() - convert handle to Python driver wrapper object (inline C++; requires GIL) object py(ContextHandle h) object py(StreamHandle h) object py(EventHandle h) object py(MemoryPoolHandle h) object py(DevicePtrHandle h) + + +# The resource handles API table is exported from `cuda.core.experimental._resource_handles` +# as a PyCapsule named: +# +# "cuda.core.experimental._resource_handles._CXX_API" +# +# Consumers dispatch through this table to avoid relying on RTLD_GLOBAL and to +# ensure a single owner of correctness-critical static/thread_local state. +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + + +cdef const 
ResourceHandlesCxxApiV1* _handles_table = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogil: + global _handles_table + if _handles_table == NULL: + with gil: + if _handles_table == NULL: + _handles_table = PyCapsule_Import( + b"cuda.core.experimental._resource_handles._CXX_API", 0 + ) + if _handles_table == NULL: + raise ImportError("Failed to import cuda.core.experimental._resource_handles._CXX_API capsule") + if _handles_table.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles_table.struct_size < sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles_table + + +# ----------------------------------------------------------------------------- +# Dispatch wrappers (hide capsule init from consumers) +# ----------------------------------------------------------------------------- + +cdef inline cydriver.CUresult get_last_error() except * nogil: + return _get_handles_table().get_last_error() + + +cdef inline cydriver.CUresult peek_last_error() except * nogil: + return _get_handles_table().peek_last_error() + + +cdef inline void clear_last_error() except * nogil: + _get_handles_table().clear_last_error() + + +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except * nogil: + return _get_handles_table().create_context_handle_ref(ctx) + + +cdef inline ContextHandle get_primary_context(int device_id) except * nogil: + return _get_handles_table().get_primary_context(device_id) + + +cdef inline ContextHandle get_current_context() except * nogil: + return _get_handles_table().get_current_context() + + +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) except * nogil: + return _get_handles_table().create_stream_handle(h_ctx, flags, priority) + + +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) except * nogil: + return _get_handles_table().create_stream_handle_ref(stream) + + +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) except *: + return _get_handles_table().create_stream_handle_with_owner(stream, owner) + + +cdef inline StreamHandle get_legacy_stream() except * nogil: + return _get_handles_table().get_legacy_stream() + + +cdef inline StreamHandle get_per_thread_stream() except * nogil: + return _get_handles_table().get_per_thread_stream() + + +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) except * nogil: + return _get_handles_table().create_event_handle(h_ctx, flags) + + +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) except * nogil: + return _get_handles_table().create_event_handle_noctx(flags) + + +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) except * nogil: + return _get_handles_table().create_event_handle_ipc(ipc_handle) + + +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) except * nogil: + return _get_handles_table().create_mempool_handle(props) + + +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) except * nogil: + return _get_handles_table().create_mempool_handle_ref(pool) + + +cdef inline MemoryPoolHandle get_device_mempool(int device_id) except * nogil: + return _get_handles_table().get_device_mempool(device_id) + + +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, 
cydriver.CUmemAllocationHandleType handle_type) except * nogil: + return _get_handles_table().create_mempool_handle_ipc(fd, handle_type) + + +cdef inline DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_alloc_from_pool(size, h_pool, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_alloc_async(size, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) except * nogil: + return _get_handles_table().deviceptr_alloc(size) + + +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) except * nogil: + return _get_handles_table().deviceptr_alloc_host(size) + + +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except * nogil: + return _get_handles_table().deviceptr_create_ref(ptr) + + +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except *: + return _get_handles_table().deviceptr_create_with_owner(ptr, owner) + + +cdef inline DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) except * nogil: + return _get_handles_table().deviceptr_import_ipc(h_pool, export_data, h_stream) + + +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) except * nogil: + return _get_handles_table().deallocation_stream(h) + + +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) except * nogil: + _get_handles_table().set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 87d78eba17..6fb4c79bd5 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -30,7 +30,7 @@ from cuda.core.experimental._resource_handles cimport ( EventHandle, StreamHandle, create_context_handle_ref, - create_event_handle, + create_event_handle_noctx, create_stream_handle, create_stream_handle_with_owner, get_current_context, @@ -303,7 +303,7 @@ cdef class Stream: ) from e # Wait on stream via temporary event - h_event = create_event_handle(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? @@ -414,7 +414,8 @@ cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: cdef cydriver.CUcontext ctx if not self._h_context: HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) - self._h_context = create_context_handle_ref(ctx) + with gil: + self._h_context = create_context_handle_ref(ctx) return 0 From ba139f34976e806be48f68251565e9d3bd2151a9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 15:25:15 -0800 Subject: [PATCH 26/38] Drop RTLD_GLOBAL import for _resource_handles Resource handle consumers now dispatch through the exported PyCapsule table, so _resource_handles no longer needs to be loaded with RTLD_GLOBAL. 
--- cuda_core/cuda/core/experimental/__init__.py | 20 +++++-------------- .../_cpp/resource_handles_cxx_api.hpp | 1 - .../_resource_handles_cxx_api.pxd | 1 - 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index ac0627222b..ead15c92e2 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,21 +14,11 @@ import importlib import sys -# Load _resource_handles with RTLD_GLOBAL so its C++ symbols are available -# to other extension modules that depend on them (_context, _device, etc.) -# This must happen before importing any dependent modules. -if sys.platform != "win32": - import os - - _old_dlopen_flags = sys.getdlopenflags() - sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_NOW) - try: - from cuda.core.experimental import _resource_handles # noqa: F401 - finally: - sys.setdlopenflags(_old_dlopen_flags) - del _old_dlopen_flags, os -else: - from cuda.core.experimental import _resource_handles # noqa: F401 +# Import the resource handles module early. +# +# Other extension modules access its functionality via the exported PyCapsule +# dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). +from cuda.core.experimental import _resource_handles # noqa: F401 subdir = f"cu{cuda_major}" try: diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp index 5436b761f5..11e458603b 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles_cxx_api.hpp @@ -77,4 +77,3 @@ struct ResourceHandlesCxxApiV1 { const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; } // namespace cuda_core - diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd index f8a4874908..f14fa7e730 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -66,4 +66,3 @@ cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil - From bac302b4a07e07ad6a9f2808e5a18614c30e199f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 16 Dec 2025 16:20:15 -0800 Subject: [PATCH 27/38] Fix Python 3.13 finalization check Use public Py_IsFinalizing() API instead of removed _Py_IsFinalizing(). 
--- cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 9f4e21ca24..5c2a70eced 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -58,7 +58,7 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return false; } @@ -225,7 +225,7 @@ class GILReleaseGuard { public: GILReleaseGuard() : tstate_(nullptr), released_(false) { // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return; } // PyGILState_Check() returns 1 if the GIL is held by this thread. @@ -256,7 +256,7 @@ class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (!Py_IsInitialized() || Py_IsFinalizing()) { return; } gstate_ = PyGILState_Ensure(); From 9d5a010ba37ea394b0f9c331f1568aafbefe004b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 06:57:03 -0800 Subject: [PATCH 28/38] Fix finalization check across Python versions Use Py_IsFinalizing() on Python 3.13+ and fall back to _Py_IsFinalizing() on older versions. --- .../experimental/_cpp/resource_handles.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp index 5c2a70eced..62c5b0eff7 100644 --- a/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/experimental/_cpp/resource_handles.cpp @@ -23,6 +23,20 @@ namespace { static std::once_flag driver_load_once; static bool driver_loaded = false; +#if PY_VERSION_HEX < 0x030D0000 +extern "C" int _Py_IsFinalizing(void); +#endif + +static inline bool py_is_finalizing() noexcept { +#if PY_VERSION_HEX >= 0x030D0000 + return Py_IsFinalizing(); +#else + // Python < 3.13 does not expose Py_IsFinalizing() publicly. Use the private + // API that exists in those versions. + return _Py_IsFinalizing() != 0; +#endif +} + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -58,7 +72,7 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return false; } @@ -225,7 +239,7 @@ class GILReleaseGuard { public: GILReleaseGuard() : tstate_(nullptr), released_(false) { // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return; } // PyGILState_Check() returns 1 if the GIL is held by this thread. 
@@ -256,7 +270,7 @@ class GILAcquireGuard { public: GILAcquireGuard() : acquired_(false) { // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || Py_IsFinalizing()) { + if (!Py_IsInitialized() || py_is_finalizing()) { return; } gstate_ = PyGILState_Ensure(); From 3b45f7c4c48a27d06a0cd26082d1730568779b9f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 07:53:36 -0800 Subject: [PATCH 29/38] Fix circular import for _resource_handles Use a relative import in cuda.core.experimental.__init__ to avoid failing imports from partially-initialized packages during test collection. --- cuda_core/cuda/core/experimental/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 7eb3611a03..008426740e 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -18,7 +18,11 @@ # # Other extension modules access its functionality via the exported PyCapsule # dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). -from cuda.core.experimental import _resource_handles # noqa: F401 +# +# Use a relative import to avoid circular-import issues when `cuda.core.experimental` +# is still being initialized (e.g. when importing submodules like +# `cuda.core.experimental._utils.cuda_utils`). +from . import _resource_handles # noqa: F401 subdir = f"cu{cuda_major}" try: From dd07ea88ff4cf1853aaa0b39327f23c6e6e96f95 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 11:21:30 -0800 Subject: [PATCH 30/38] Fix circular import in _resource_handles module Use relative cimports instead of fully-qualified cimports to prevent Cython from generating code that imports the parent package during module initialization, which caused circular import errors. --- cuda_core/cuda/core/experimental/_resource_handles.pyx | 2 +- cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_resource_handles.pyx b/cuda_core/cuda/core/experimental/_resource_handles.pyx index 8ddb44c175..ed6d286abe 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles.pyx +++ b/cuda_core/cuda/core/experimental/_resource_handles.pyx @@ -9,7 +9,7 @@ from cpython.pycapsule cimport PyCapsule_New from libc.stdint cimport uint32_t, uint64_t, uintptr_t -from cuda.core.experimental._resource_handles_cxx_api cimport ( +from ._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) diff --git a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd index f14fa7e730..da3d8d4fd3 100644 --- a/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd +++ b/cuda_core/cuda/core/experimental/_resource_handles_cxx_api.pxd @@ -6,7 +6,7 @@ from libc.stdint cimport uint32_t from libc.stddef cimport size_t from cuda.bindings cimport cydriver -from cuda.core.experimental._resource_handles cimport ( +from ._resource_handles cimport ( ContextHandle, DevicePtrHandle, EventHandle, From 280665f2147cc42a9c4c739a3aa8cc4dbb09aebd Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 17 Dec 2025 14:00:33 -0800 Subject: [PATCH 31/38] Fix circular import by using importlib.import_module Replace relative import `from . 
import _resource_handles` with `importlib.import_module("cuda.core.experimental._resource_handles")` to avoid circular import issues during package initialization. The relative import can fail with "partially initialized module" errors on some Python versions (e.g., Python 3.10) when the package is still being initialized. Using importlib.import_module with an absolute path bypasses the relative import machinery and avoids this issue. --- cuda_core/cuda/core/experimental/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 008426740e..b6ed4df302 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -14,15 +14,14 @@ import importlib import sys -# Import the resource handles module early. +# The _resource_handles module exports a PyCapsule dispatch table that other +# extension modules access via PyCapsule_Import. We import it here to ensure +# it's loaded before other modules try to use it. # -# Other extension modules access its functionality via the exported PyCapsule -# dispatch table, so we don't rely on RTLD_GLOBAL (POSIX-only behavior). -# -# Use a relative import to avoid circular-import issues when `cuda.core.experimental` -# is still being initialized (e.g. when importing submodules like -# `cuda.core.experimental._utils.cuda_utils`). -from . import _resource_handles # noqa: F401 +# We use importlib.import_module with the full path to avoid triggering +# circular import issues that can occur with relative imports during +# package initialization. +_resource_handles = importlib.import_module("cuda.core.experimental._resource_handles") subdir = f"cu{cuda_major}" try: From 0f89baa92770402a20802b6560017d01cee1d915 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 10:17:49 -0800 Subject: [PATCH 32/38] Fix wheel merge script to keep _resource_handles module The wheel merge script was removing _resource_handles.cpython-*.so during the merge process because it only kept a small set of files at the cuda/core/ top level. However, _resource_handles is shared code (not CUDA-version-specific) and must remain at the top level because it's imported early in __init__.py before versioned code. Also keep _cpp/ directory for Cython development headers. --- ci/tools/merge_cuda_core_wheels.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 87e2df13a7..e5320e9142 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -150,15 +150,21 @@ def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool "__init__.py", "_version.py", "_include", + "_cpp", # Headers for Cython development "cu12", "cu13", ) + # _resource_handles is shared (not CUDA-version-specific) and must stay + # at top level. It's imported early in __init__.py before versioned code. 
+    items_to_keep_prefix = ("_resource_handles",)
     all_items = os.scandir(base_wheel / base_dir)
     removed_count = 0
     for f in all_items:
         f_abspath = f.path
         if f.name in items_to_keep:
             continue
+        if any(f.name.startswith(prefix) for prefix in items_to_keep_prefix):
+            continue
         if f.is_dir():
             print(f"  Removing directory: {f.name}", file=sys.stderr)
             shutil.rmtree(f_abspath)

From 5e437b294fa2e527dadc63d8cd9b146cbd232f72 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 10:38:29 -0800
Subject: [PATCH 33/38] Fix IPC pointer cache to use export data as key

The cache was using the returned pointer as the key, but checking the
cache after calling cuMemPoolImportPointer. This caused duplicate
imports to fail with CUDA_ERROR_ALREADY_MAPPED before the cache check.

Fix by using the export_data bytes (CUmemPoolPtrExportData) as the
cache key and checking the cache BEFORE calling cuMemPoolImportPointer.
---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 62 +++++++++++++++----
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 4f660fe8ef..538c220f98 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -8,6 +8,7 @@
 #include "resource_handles_cxx_api.hpp"
 #include
 #include
+#include <cstring>
 #include
 #include
 #include
@@ -704,58 +705,97 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
 // The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
 // was imported multiple times. We work around this by caching imported pointers
 // and returning the same handle for duplicate imports.
+//
+// The cache key is the export_data bytes (CUmemPoolPtrExportData), not the
+// returned pointer, because we must check the cache BEFORE calling
+// cuMemPoolImportPointer (which fails with CUDA_ERROR_ALREADY_MAPPED if
+// the pointer is already imported).
 // TODO: When driver fix is available, add version check here to bypass cache.
 static bool use_ipc_ptr_cache() {
     return true;
 }
 
+// Wrapper for CUmemPoolPtrExportData to use as map key
+struct ExportDataKey {
+    CUmemPoolPtrExportData data;
+
+    bool operator==(const ExportDataKey& other) const {
+        return std::memcmp(&data, &other.data, sizeof(data)) == 0;
+    }
+};
+
+struct ExportDataKeyHash {
+    std::size_t operator()(const ExportDataKey& key) const {
+        // Simple hash of the bytes
+        std::size_t h = 0;
+        const auto* bytes = reinterpret_cast<const unsigned char*>(&key.data);
+        for (std::size_t i = 0; i < sizeof(key.data); ++i) {
+            h = h * 31 + bytes[i];
+        }
+        return h;
+    }
+};
+
 static std::mutex ipc_ptr_cache_mutex;
-static std::unordered_map<CUdeviceptr, std::weak_ptr<DevicePtrBox>> ipc_ptr_cache;
+static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
 
 DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
     if (!ensure_driver_loaded()) {
         err = CUDA_ERROR_NOT_INITIALIZED;
         return {};
     }
-    GILReleaseGuard gil;
-    CUdeviceptr ptr;
+
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
-    if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
-        return {};
-    }
 
     if (use_ipc_ptr_cache()) {
+        // Check cache BEFORE calling cuMemPoolImportPointer
+        ExportDataKey key;
+        std::memcpy(&key.data, data, sizeof(key.data));
+
         std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-        // Check for existing handle
-        auto it = ipc_ptr_cache.find(ptr);
+        auto it = ipc_ptr_cache.find(key);
         if (it != ipc_ptr_cache.end()) {
             if (auto box = it->second.lock()) {
+                // Cache hit - return existing handle
                 return DevicePtrHandle(box, &box->resource);
             }
             ipc_ptr_cache.erase(it);  // Expired entry
         }
 
+        // Cache miss - import the pointer
+        GILReleaseGuard gil;
+        CUdeviceptr ptr;
+        if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+            return {};
+        }
+
         // Create new handle with cache-clearing deleter
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
-            [h_pool, ptr](DevicePtrBox* b) {
+            [h_pool, key](DevicePtrBox* b) {
                 {
                     std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-                    ipc_ptr_cache.erase(ptr);
+                    ipc_ptr_cache.erase(key);
                 }
                 GILReleaseGuard gil;
                 p_cuMemFreeAsync(b->resource, native(b->h_stream));
                 delete b;
             }
         );
-        ipc_ptr_cache[ptr] = box;
+        ipc_ptr_cache[key] = box;
         return DevicePtrHandle(box, &box->resource);
     } else {
         // No caching - simple handle creation
+        GILReleaseGuard gil;
+        CUdeviceptr ptr;
+        if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
+            return {};
+        }
+
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
             [h_pool](DevicePtrBox* b) {

From 9bde6a27347227c0c39c753e2a410eda6dbda86b Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 11:14:00 -0800
Subject: [PATCH 34/38] Improve IPC pointer cache comments and fix race condition

- Clarify that the cache handles two different memory type behaviors:
  memory pool allocations (nvbug 5570902) and pinned memory (ALREADY_MAPPED)
- Fix race condition in deleter: only erase cache entry if expired,
  avoiding erasure of a new entry created by another thread
- Move GILReleaseGuard before mutex acquisition in deleter
---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 538c220f98..88a3f1ff05 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -699,19 +699,31 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
 }
 // ============================================================================
-// IPC Pointer Cache (workaround for nvbug 5570902)
+// IPC Pointer Cache
 // ============================================================================
-// IPC-imported pointers are not correctly reference counted by the driver.
-// The first cuMemFreeAsync incorrectly unmaps the memory even when the pointer
-// was imported multiple times. We work around this by caching imported pointers
-// and returning the same handle for duplicate imports.
+// This cache handles duplicate IPC imports, which behave differently depending
+// on the memory type:
+//
+// 1. Memory pool allocations (DeviceMemoryResource):
+//    Multiple imports of the same allocation succeed and return duplicate
+//    pointers. However, the driver has a reference counting bug (nvbug 5570902)
+//    where the first cuMemFreeAsync incorrectly unmaps the memory even when
+//    imported multiple times. A driver fix is expected.
+//
+// 2. Pinned memory allocations (PinnedMemoryResource):
+//    Duplicate imports result in CUDA_ERROR_ALREADY_MAPPED.
+//
+// The cache solves both issues by checking the cache before calling
+// cuMemPoolImportPointer and returning the existing handle for duplicate
+// imports. This provides a consistent user experience where the same IPC
+// descriptor can be imported multiple times regardless of memory type.
 //
 // The cache key is the export_data bytes (CUmemPoolPtrExportData), not the
-// returned pointer, because we must check the cache BEFORE calling
-// cuMemPoolImportPointer (which fails with CUDA_ERROR_ALREADY_MAPPED if
-// the pointer is already imported).
+// returned pointer, because we must check before calling the driver API.
 
-// TODO: When driver fix is available, add version check here to bypass cache.
+// TODO: When driver fix for nvbug 5570902 is available, consider whether
+// the cache is still needed for memory pool allocations (it will still be
+// needed for pinned memory).
 static bool use_ipc_ptr_cache() {
     return true;
 }
@@ -750,7 +762,7 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
 
     if (use_ipc_ptr_cache()) {
-        // Check cache BEFORE calling cuMemPoolImportPointer
+        // Check cache before calling cuMemPoolImportPointer
         ExportDataKey key;
         std::memcpy(&key.data, data, sizeof(key.data));
 
@@ -776,11 +788,16 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
         auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
             [h_pool, key](DevicePtrBox* b) {
+                GILReleaseGuard gil;
                 {
                     std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-                    ipc_ptr_cache.erase(key);
+                    // Only erase if expired - avoids race where another thread
+                    // replaced the entry with a new import before we acquired the lock.
+                    auto it = ipc_ptr_cache.find(key);
+                    if (it != ipc_ptr_cache.end() && it->second.expired()) {
+                        ipc_ptr_cache.erase(it);
+                    }
                 }
-                GILReleaseGuard gil;
                 p_cuMemFreeAsync(b->resource, native(b->h_stream));
                 delete b;
             }

From 90ab0a59d8ba1de814492f2f05e187b142a18bf1 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Thu, 18 Dec 2025 11:29:29 -0800
Subject: [PATCH 35/38] Refactor load_driver_api to use RAII GIL guard

Replace raw PyGILState_Ensure/Release calls with a simple GILGuard
class, eliminating manual release on each early return path.
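
The resulting pattern in load_driver_api() is roughly

    GILGuard gil;
    if (!gil.acquired()) {
        return false;
    }
    // ... every subsequent early return releases the GIL via ~GILGuard()

instead of pairing each early return with an explicit
PyGILState_Release(gstate).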
--- cuda_core/cuda/core/_cpp/resource_handles.cpp | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 88a3f1ff05..3d35b0b498 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -38,6 +38,30 @@ static inline bool py_is_finalizing() noexcept { #endif } +// Simple RAII guard to acquire the GIL. Used in load_driver_api. +class GILGuard { +public: + GILGuard() : acquired_(false) { + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + ~GILGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + bool acquired() const { return acquired_; } + GILGuard(const GILGuard&) = delete; + GILGuard& operator=(const GILGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -73,10 +97,6 @@ DECLARE_DRIVER_FN(cuMemPoolImportPointer); #undef DECLARE_DRIVER_FN static bool load_driver_api() noexcept { - if (!Py_IsInitialized() || py_is_finalizing()) { - return false; - } - struct CudaDriverApiV1 { std::uint32_t abi_version; std::uint32_t struct_size; @@ -115,21 +135,22 @@ static bool load_driver_api() noexcept { static constexpr const char* capsule_name = "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - PyGILState_STATE gstate = PyGILState_Ensure(); + GILGuard gil; + if (!gil.acquired()) { + return false; + } // `_resource_handles` is already loaded (it exports the handle API capsule), // so avoid import machinery and just grab the module object. 
PyObject* mod = PyImport_AddModule("cuda.core._resource_handles"); // borrowed if (!mod) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule"); // new ref if (!fn) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } @@ -137,7 +158,6 @@ static bool load_driver_api() noexcept { Py_DECREF(fn); if (!cap) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } @@ -146,18 +166,15 @@ static bool load_driver_api() noexcept { if (!api) { PyErr_Clear(); - PyGILState_Release(gstate); return false; } if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) { - PyGILState_Release(gstate); return false; } #define LOAD_ADDR(name) \ do { \ if (api->name == 0) { \ - PyGILState_Release(gstate); \ return false; \ } \ p_##name = reinterpret_cast(api->name); \ @@ -195,7 +212,6 @@ static bool load_driver_api() noexcept { #undef LOAD_ADDR - PyGILState_Release(gstate); return true; } From 675ca24071ef9ef9b62eb034d12223db0ef18df6 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 14:38:07 -0800 Subject: [PATCH 36/38] Add DESIGN.md and optimize GIL usage in resource handle wrappers and CUDA operations - Change resource handle wrapper functions from `except * nogil` to `noexcept nogil` to avoid GIL acquisition on every call - Add `_init_handles_table()` for consumers to initialize at module level - Move CUDA operations into nogil blocks: cuMemcpyAsync, deviceptr_alloc_*, create_event_handle_noctx - Add Buffer._clear() to properly reset the handle shared_ptr - Add DESIGN.md documenting the resource handles architecture --- cuda_core/cuda/core/_cpp/DESIGN.md | 286 ++++++++++++++++++ cuda_core/cuda/core/_device.pyx | 3 + cuda_core/cuda/core/_event.pyx | 3 + cuda_core/cuda/core/_memory/_buffer.pyx | 21 +- .../core/_memory/_graph_memory_resource.pyx | 6 +- cuda_core/cuda/core/_memory/_ipc.pyx | 3 + cuda_core/cuda/core/_memory/_memory_pool.pyx | 6 +- cuda_core/cuda/core/_memoryview.pyx | 5 +- cuda_core/cuda/core/_resource_handles.pxd | 118 ++++---- cuda_core/cuda/core/_stream.pyx | 5 +- .../memory_ipc/test_ipc_duplicate_import.py | 2 +- 11 files changed, 390 insertions(+), 68 deletions(-) create mode 100644 cuda_core/cuda/core/_cpp/DESIGN.md diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md new file mode 100644 index 0000000000..003dcfd945 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -0,0 +1,286 @@ +# Resource Handles Design + +This document describes the resource handle abstraction in cuda.core, which provides +robust lifetime management for CUDA resources. + +## Overview + +The cuda-core Python library provides a high-level interface to CUDA resources such as +Context, Device, Stream, and Event. These objects correspond to resources managed by +the CUDA Driver API, each having explicit creation and destruction routines. Several +of these CUDA resources also participate in non-trivial ownership hierarchies (e.g., +a stream belongs to a context), and releasing them may require additional arguments +or other resources (e.g., a device pointer freed through a specific stream). + +### Goals + +The goal of the handle abstraction is to provide a robust, explicit, and Python-agnostic +layer for ownership and lifetime management of CUDA resources. The intent is to use +handles as the backbone of the cuda-core resource hierarchy, enabling cuda-core Python +objects to manipulate handles rather than work directly with raw CUDA resources. 
+
+While Python-facing objects expose convenient APIs and additional behaviors, the handle
+layer isolates all concerns related to resource lifetime. By cleanly separating these
+responsibilities, we achieve:
+
+- **Clearer architecture** with minimal cross-layer coupling
+- **Safe transfer of resource ownership** between Python and other domains, including C++
+- **Ability to preserve resource validity** independent of Python
+- **Well-specified semantics** for immutability, ownership, and reachability
+- **Simplified reasoning about resource lifetimes**, especially with nested or dependent resources
+
+### Handle Semantics
+
+Resource handles provide **referentially transparent** wrappers around CUDA resources:
+
+- **No rebinding**: A handle always refers to the same resource.
+- **No invalidation**: If a handle exists, its resource is valid.
+- **Structural dependencies**: If resource A depends on resource B, A's handle
+  embeds B's handle, automatically extending B's lifetime.
+
+This eliminates global lifetime analysis. Correctness is enforced structurally: if you
+have a handle, you have a valid resource.
+
+## Handle Types
+
+All handles are `std::shared_ptr` aliases that expose only the raw CUDA resource:
+
+```cpp
+using ContextHandle = std::shared_ptr<const CUcontext>;
+using StreamHandle = std::shared_ptr<const CUstream>;
+using EventHandle = std::shared_ptr<const CUevent>;
+using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
+using DevicePtrHandle = std::shared_ptr<const CUdeviceptr>;
+```
+
+Internally, handles use **shared pointer aliasing**: the actual managed object is a
+"box" containing the resource, its dependencies, and any state needed for destruction.
+The public handle points only to the raw resource field, keeping the API minimal.
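+
+A minimal sketch of the box-and-alias pattern (illustrative only; the real boxes
+carry additional destruction state, and the helper name below is not part of the
+API):
+
+```cpp
+struct StreamBox {
+    CUstream resource;
+    ContextHandle h_context;  // structural dependency keeps the context alive
+};
+
+StreamHandle make_stream_handle_sketch(CUstream s, ContextHandle h_ctx) {
+    auto box = std::shared_ptr<StreamBox>(
+        new StreamBox{s, h_ctx},
+        [](StreamBox* b) {
+            // Destruction logic is captured here (e.g., cuStreamDestroy).
+            delete b;
+        });
+    // Aliasing constructor: share ownership of the box, expose only the resource.
+    return StreamHandle(box, &box->resource);
+}
+```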
+
+Example usage from Cython:
+
+```cython
+# Get raw handle for CUDA API calls
+cdef CUstream raw_stream = native(h_stream)  # cuda.bindings.cydriver.CUstream
+
+# Get as integer for other use cases
+return hash(intptr(h_stream))
+
+# Get Python wrapper for returning to user
+return py(h_stream)  # cuda.bindings.driver.CUstream
+```
+
+## Code Structure
+
+### Directory Layout
+
+```
+cuda/core/
+├── _resource_handles.pyx   # Cython module (compiles resource_handles.cpp)
+├── _resource_handles.pxd   # Cython declarations and dispatch wrappers
+└── _cpp/
+    ├── resource_handles.hpp          # C++ API declarations
+    ├── resource_handles.cpp          # C++ implementation
+    └── resource_handles_cxx_api.hpp  # Capsule struct definition
+```
+
+### Build Implications
+
+The `_cpp/` subdirectory contains C++ source files that are compiled into the
+`_resource_handles` extension module. Other Cython modules in cuda.core do **not**
+link against this code directly—they access it through a capsule mechanism
+(explained below).
+
+## Capsule Architecture
+
+The implementation uses **two separate capsule mechanisms** for different purposes:
+
+### Capsule 1: C++ API Table (`_CXX_API`)
+
+**Problem**: Cython extension modules compile independently. If multiple modules
+(`_memory.pyx`, `_ipc.pyx`, etc.) each linked `resource_handles.cpp`, they would
+each have their own copies of:
+
+- Static driver function pointers
+- Thread-local error state
+- Other static data, including global caches
+
+**Solution**: Only `_resource_handles.so` links the C++ code. It exports a capsule
+containing function pointers:
+
+```cpp
+struct ResourceHandlesCxxApiV1 {
+    uint32_t abi_version;
+    uint32_t struct_size;
+
+    // Thread-local error handling
+    CUresult (*get_last_error)() noexcept;
+    CUresult (*peek_last_error)() noexcept;
+    void (*clear_last_error)() noexcept;
+
+    // Handle creation functions
+    ContextHandle (*get_primary_context)(int device_id) noexcept;
+    StreamHandle (*create_stream_handle)(...) noexcept;
+    // ... etc
+};
+```
+
+Other Cython modules import this capsule at runtime and call through the function
+pointers. The `.pxd` file provides inline wrappers that hide this indirection:
+
+```cython
+cdef inline StreamHandle create_stream_handle(...) noexcept nogil:
+    return _handles_table.create_stream_handle(...)
+```
+
+Importing modules are expected to call `_init_handles_table()` (typically at module
+level) before calling any wrapper functions.
+
+### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`)
+
+**Problem**: cuda.core cannot directly call CUDA driver functions because:
+
+1. We don't want to link against `libcuda.so` at build time.
+2. The driver symbols must be resolved dynamically through cuda-bindings.
+
+**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver
+function pointers obtained from cuda-bindings:
+
+```cpp
+struct CudaDriverApiV1 {
+    uint32_t abi_version;
+    uint32_t struct_size;
+
+    uintptr_t cuDevicePrimaryCtxRetain;
+    uintptr_t cuDevicePrimaryCtxRelease;
+    uintptr_t cuStreamCreateWithPriority;
+    uintptr_t cuStreamDestroy;
+    // ... etc
+};
+```
+
+The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the
+function pointers for subsequent use.
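+
+Schematically, the consuming side looks like the following. This is a simplified
+sketch of the `DECLARE_DRIVER_FN`/`LOAD_ADDR` pattern used in `resource_handles.cpp`;
+the `destroy_stream` wrapper is hypothetical and shown only to illustrate dispatch:
+
+```cpp
+// One cached pointer per driver entry point, typed via decltype:
+using cuStreamDestroy_t = decltype(&cuStreamDestroy);
+static cuStreamDestroy_t p_cuStreamDestroy = nullptr;
+
+// After load_driver_api() copies the capsule's uintptr_t value into the
+// pointer, every driver call dispatches through the resolved symbol:
+static CUresult destroy_stream(CUstream s) noexcept {
+    return p_cuStreamDestroy(s);
+}
+```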
+
+### Why Two Capsules?
+
+| Capsule | Direction | Purpose |
+|---------|-----------|---------|
+| `_CXX_API` | C++ → Cython | Share handle functions across modules |
+| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols |
+
+## Key Implementation Details
+
+### Structural Dependencies
+
+When a resource depends on another, its handle embeds the dependency:
+
+```cpp
+struct StreamBox {
+    CUstream resource;
+    ContextHandle h_context;  // Keeps context alive
+};
+```
+
+The shared pointer's custom deleter captures any additional state needed for
+destruction. This ensures resources are always destroyed in the correct order.
+
+### GIL Management
+
+Handle destructors may run from any thread. The implementation includes RAII guards
+(`GILReleaseGuard`, `GILAcquireGuard`) that:
+
+- Release the GIL before calling CUDA APIs (for parallelism)
+- Handle Python finalization gracefully (avoid GIL operations during shutdown)
+- Ensure Python object manipulation happens with the GIL held
+
+The handle API functions are safe to call with or without the GIL held. They
+will release the GIL (if necessary) before calling CUDA driver API functions.
+
+### Error Handling
+
+Handle API functions do not raise Python exceptions. Instead, they return an empty
+handle (null `shared_ptr`) on failure and store the error code in thread-local state.
+Callers should check for failure and retrieve the error using `get_last_error()`:
+
+```cython
+cdef StreamHandle h = create_stream_handle(h_ctx, flags, priority)
+cdef CUresult err
+if not h:
+    # Handle creation failed - get the CUDA error code
+    err = get_last_error()
+    # ... handle error (e.g., raise Python exception)
+```
+
+This design allows handle functions to be called from `nogil` blocks without requiring
+GIL acquisition for exception handling on the success path. The error state is
+thread-local, so concurrent calls from different threads do not interfere.
+
+Related functions:
+- `get_last_error()`: Returns and clears the most recent error
+- `peek_last_error()`: Returns the error without clearing it
+- `clear_last_error()`: Clears the error state
+
+## Usage from Cython
+
+```cython
+from cuda.bindings cimport cydriver
+from cuda.core._resource_handles cimport (
+    StreamHandle,
+    create_stream_handle,
+    native,
+    intptr,
+    py,
+    get_last_error,
+    _init_handles_table,
+)
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
+
+_init_handles_table()  # prerequisite before calling handle API functions
+
+# Create a stream
+cdef StreamHandle h_stream = create_stream_handle(h_ctx, flags, priority)
+if not h_stream:
+    HANDLE_RETURN(get_last_error())
+
+# Use in CUDA API
+cydriver.cuStreamSynchronize(native(h_stream))
+
+# Return to Python
+return py(h_stream)
+```
+
+## Summary
+
+The resource handle design:
+
+1. **Separates resource management** into its own layer, independent of Python objects.
+2. **Encodes lifetimes structurally** via embedded handle dependencies.
+3. **Uses capsules** to solve two distinct problems:
+   - Sharing C++ code across Cython modules without duplicate statics.
+   - Resolving CUDA driver symbols dynamically through cuda-bindings.
+4. **Provides overloaded accessors** (`native`, `intptr`, `py`) since handles cannot
+   have attributes without unnecessary Python object wrappers.
+
+This architecture ensures CUDA resources are managed correctly regardless of Python
+garbage collection timing, interpreter shutdown, or cross-language usage patterns.
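+
+## Appendix: Aliasing Sketch
+
+For concreteness, the box-plus-aliasing pattern behind points 1 and 2 of the Summary
+can be written as follows. This is a hedged sketch, not the library code: the real
+implementation routes `cuStreamDestroy` through the driver capsule and a GIL release
+guard, and the `wrap_stream` name is hypothetical.
+
+```cpp
+StreamHandle wrap_stream(CUstream s, ContextHandle h_ctx) {
+    // The box owns the raw stream plus its context dependency.
+    auto box = std::shared_ptr<StreamBox>(
+        new StreamBox{s, std::move(h_ctx)},
+        [](StreamBox* b) {
+            cuStreamDestroy(b->resource);  // context is still alive via b->h_context
+            delete b;                      // dropping the box releases the context ref
+        });
+    // Aliasing constructor: share ownership of the box, expose only the raw field.
+    return StreamHandle(box, &box->resource);
+}
+```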
diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index a8387273c7..014b7dae78 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -17,10 +17,13 @@ from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._resource_handles cimport ( ContextHandle, + _init_handles_table, create_context_handle_ref, get_primary_context, native, ) + +_init_handles_table() from cuda.core._graph import GraphBuilder from cuda.core._stream import IsStreamT, Stream, StreamOptions from cuda.core._utils.clear_error_support import assert_type diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index faff47bec9..1dec487665 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -11,12 +11,15 @@ from cuda.core._context cimport Context from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, + _init_handles_table, create_event_handle, create_event_handle_ipc, intptr, native, py, ) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index d7e6252ab4..3e2c4c4d05 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -16,11 +16,14 @@ from cuda.core._memory cimport _ipc from cuda.core._resource_handles cimport ( DevicePtrHandle, StreamHandle, + _init_handles_table, deviceptr_create_with_owner, intptr, native, set_deallocation_stream, ) + +_init_handles_table() from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -61,7 +64,7 @@ cdef class Buffer: self._clear() def _clear(self): - # _h_ptr is default-initialized (empty shared_ptr) by C++ + self._h_ptr.reset() # Release the handle self._size = 0 self._memory_resource = None self._ipc_data = None @@ -171,22 +174,23 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) + cdef Stream s = Stream_accept(stream) cdef size_t src_size = self._size if dst is None: if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._memory_resource.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, s) cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - err, = driver.cuMemcpyAsync(native(dst._h_ptr), native(self._h_ptr), src_size, stream.handle) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemcpyAsync( + native(dst._h_ptr), native(self._h_ptr), src_size, native(s._h_stream))) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -201,7 +205,7 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) + cdef Stream s = Stream_accept(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -209,8 +213,9 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - err, = driver.cuMemcpyAsync(native(self._h_ptr), native(src._h_ptr), dst_size, stream.handle) - raise_if_driver_error(err) + with nogil: + HANDLE_RETURN(cydriver.cuMemcpyAsync( + native(self._h_ptr), native(src._h_ptr), dst_size, native(s._h_stream))) def 
fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): """Fill this buffer with a repeating byte pattern. diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 3ad20fdabb..daa38a1216 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -10,9 +10,12 @@ from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core._resource_handles cimport ( DevicePtrHandle, + _init_handles_table, deviceptr_alloc_async, native, ) + +_init_handles_table() from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -192,9 +195,10 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_capturing(s) - cdef DevicePtrHandle h_ptr = deviceptr_alloc_async(size, stream._h_stream) + h_ptr = deviceptr_alloc_async(size, stream._h_stream) if not h_ptr: raise RuntimeError("Failed to allocate memory asynchronously") return Buffer_from_deviceptr_handle(h_ptr, size, self, None) diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx index 3c0eee3300..99608f55db 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -10,11 +10,14 @@ from cuda.core._memory._memory_pool cimport _MemPool from cuda.core._stream cimport Stream from cuda.core._resource_handles cimport ( DevicePtrHandle, + _init_handles_table, create_mempool_handle_ipc, deviceptr_import_ipc, get_last_error, native, ) + +_init_handles_table() from cuda.core._stream cimport default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import check_multiprocessing_start_method diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index db994c09b0..7a255ebb3d 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -16,6 +16,7 @@ from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._resource_handles cimport ( MemoryPoolHandle, DevicePtrHandle, + _init_handles_table, create_mempool_handle, create_mempool_handle_ref, get_device_mempool, @@ -23,6 +24,8 @@ from cuda.core._resource_handles cimport ( native, py, ) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -424,9 +427,10 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_not_capturing(s) - cdef DevicePtrHandle h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) if not h_ptr: raise RuntimeError("Failed to allocate memory from pool") return Buffer_from_deviceptr_handle(h_ptr, size, self, None) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 082fc7b130..41321c8722 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -16,9 +16,12 @@ import numpy 
from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( EventHandle, + _init_handles_table, create_event_handle_noctx, native, ) + +_init_handles_table() from cuda.core._utils.cuda_utils import handle_return, driver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -608,8 +611,8 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) HANDLE_RETURN(cydriver.cuEventRecord( native(h_event), producer_s)) HANDLE_RETURN(cydriver.cuStreamWaitEvent( diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 06d41bf170..801d354958 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -121,118 +121,126 @@ cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogi # ----------------------------------------------------------------------------- -# Dispatch wrappers (hide capsule init from consumers) +# Dispatch wrappers +# +# These wrappers assume _handles_table has been initialized. Consumers must call +# _init_handles_table() at module level before using these functions in nogil blocks. # ----------------------------------------------------------------------------- -cdef inline cydriver.CUresult get_last_error() except * nogil: - return _get_handles_table().get_last_error() +cdef inline void _init_handles_table() except *: + """Initialize the handles table. Call at module level before using wrappers.""" + _get_handles_table() + + +cdef inline cydriver.CUresult get_last_error() noexcept nogil: + return _handles_table.get_last_error() -cdef inline cydriver.CUresult peek_last_error() except * nogil: - return _get_handles_table().peek_last_error() +cdef inline cydriver.CUresult peek_last_error() noexcept nogil: + return _handles_table.peek_last_error() -cdef inline void clear_last_error() except * nogil: - _get_handles_table().clear_last_error() +cdef inline void clear_last_error() noexcept nogil: + _handles_table.clear_last_error() -cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) except * nogil: - return _get_handles_table().create_context_handle_ref(ctx) +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil: + return _handles_table.create_context_handle_ref(ctx) -cdef inline ContextHandle get_primary_context(int device_id) except * nogil: - return _get_handles_table().get_primary_context(device_id) +cdef inline ContextHandle get_primary_context(int device_id) noexcept nogil: + return _handles_table.get_primary_context(device_id) -cdef inline ContextHandle get_current_context() except * nogil: - return _get_handles_table().get_current_context() +cdef inline ContextHandle get_current_context() noexcept nogil: + return _handles_table.get_current_context() -cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) except * nogil: - return _get_handles_table().create_stream_handle(h_ctx, flags, priority) +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil: + return _handles_table.create_stream_handle(h_ctx, flags, priority) -cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) except * nogil: - 
return _get_handles_table().create_stream_handle_ref(stream) +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil: + return _handles_table.create_stream_handle_ref(stream) -cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) except *: - return _get_handles_table().create_stream_handle_with_owner(stream, owner) +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner): + return _handles_table.create_stream_handle_with_owner(stream, owner) -cdef inline StreamHandle get_legacy_stream() except * nogil: - return _get_handles_table().get_legacy_stream() +cdef inline StreamHandle get_legacy_stream() noexcept nogil: + return _handles_table.get_legacy_stream() -cdef inline StreamHandle get_per_thread_stream() except * nogil: - return _get_handles_table().get_per_thread_stream() +cdef inline StreamHandle get_per_thread_stream() noexcept nogil: + return _handles_table.get_per_thread_stream() -cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) except * nogil: - return _get_handles_table().create_event_handle(h_ctx, flags) +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle(h_ctx, flags) -cdef inline EventHandle create_event_handle_noctx(unsigned int flags) except * nogil: - return _get_handles_table().create_event_handle_noctx(flags) +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle_noctx(flags) -cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) except * nogil: - return _get_handles_table().create_event_handle_ipc(ipc_handle) +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil: + return _handles_table.create_event_handle_ipc(ipc_handle) -cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) except * nogil: - return _get_handles_table().create_mempool_handle(props) +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) noexcept nogil: + return _handles_table.create_mempool_handle(props) -cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) except * nogil: - return _get_handles_table().create_mempool_handle_ref(pool) +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil: + return _handles_table.create_mempool_handle_ref(pool) -cdef inline MemoryPoolHandle get_device_mempool(int device_id) except * nogil: - return _get_handles_table().get_device_mempool(device_id) +cdef inline MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil: + return _handles_table.get_device_mempool(device_id) -cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) except * nogil: - return _get_handles_table().create_mempool_handle_ipc(fd, handle_type) +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil: + return _handles_table.create_mempool_handle_ipc(fd, handle_type) cdef inline DevicePtrHandle deviceptr_alloc_from_pool( size_t size, MemoryPoolHandle h_pool, - StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_alloc_from_pool(size, h_pool, h_stream) + StreamHandle h_stream) noexcept nogil: + 
return _handles_table.deviceptr_alloc_from_pool(size, h_pool, h_stream) -cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_alloc_async(size, h_stream) +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_async(size, h_stream) -cdef inline DevicePtrHandle deviceptr_alloc(size_t size) except * nogil: - return _get_handles_table().deviceptr_alloc(size) +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc(size) -cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) except * nogil: - return _get_handles_table().deviceptr_alloc_host(size) +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc_host(size) -cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except * nogil: - return _get_handles_table().deviceptr_create_ref(ptr) +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil: + return _handles_table.deviceptr_create_ref(ptr) -cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except *: - return _get_handles_table().deviceptr_create_with_owner(ptr, owner) +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner): + return _handles_table.deviceptr_create_with_owner(ptr, owner) cdef inline DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, - StreamHandle h_stream) except * nogil: - return _get_handles_table().deviceptr_import_ipc(h_pool, export_data, h_stream) + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_import_ipc(h_pool, export_data, h_stream) -cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) except * nogil: - return _get_handles_table().deallocation_stream(h) +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil: + return _handles_table.deallocation_stream(h) -cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) except * nogil: - _get_handles_table().set_deallocation_stream(h, h_stream) +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil: + _handles_table.set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 4626fbf109..aecf24b06e 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -29,6 +29,7 @@ from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, StreamHandle, + _init_handles_table, create_context_handle_ref, create_event_handle_noctx, create_stream_handle, @@ -40,6 +41,8 @@ from cuda.core._resource_handles cimport ( native, py, ) + +_init_handles_table() from cuda.core._graph import GraphBuilder @@ -303,8 +306,8 @@ cdef class Stream: ) from e # Wait on stream via temporary event - h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) # TODO: support flags other than 0? 
HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 096b3a2abd..ca4ecc0749 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -13,7 +13,7 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Buffer, Device +from cuda.core import Buffer, Device from helpers.logging import TimestampedLogger CHILD_TIMEOUT_SEC = 20 From ccf9a3b217242773b9e9996a6ca1825b7e1140dc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 15:23:48 -0800 Subject: [PATCH 37/38] linter fix --- cuda_core/cuda/core/_memory/_buffer.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 3e2c4c4d05..32fe28bab4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -25,10 +25,7 @@ from cuda.core._resource_handles cimport ( _init_handles_table() from cuda.core._stream cimport Stream_accept, Stream -from cuda.core._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, - HANDLE_RETURN, -) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import sys from typing import TypeVar, Union From 6c82cb65feece3b09d3c09e17faa585f98e8c734 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 18 Dec 2025 16:30:05 -0800 Subject: [PATCH 38/38] Consolidate GIL helper classes at top of resource_handles.cpp Move GILReleaseGuard and GILAcquireGuard to the top of the file before first use, and remove redundant GILGuard class that duplicated GILAcquireGuard functionality. --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 127 +++++++----------- 1 file changed, 52 insertions(+), 75 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 3d35b0b498..5ffc84145c 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -38,30 +38,75 @@ static inline bool py_is_finalizing() noexcept { #endif } -// Simple RAII guard to acquire the GIL. Used in load_driver_api. -class GILGuard { +// ============================================================================ +// GIL management helpers +// ============================================================================ + +// Helper to release the GIL while calling into the CUDA driver. +// This guard is *conditional*: if the caller already dropped the GIL, +// we avoid calling PyEval_SaveThread (which requires holding the GIL). +// It also handles the case where Python is finalizing and GIL operations +// are no longer safe. +class GILReleaseGuard { +public: + GILReleaseGuard() : tstate_(nullptr), released_(false) { + // Don't try to manipulate GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + // PyGILState_Check() returns 1 if the GIL is held by this thread. 
+ if (PyGILState_Check()) { + tstate_ = PyEval_SaveThread(); + released_ = true; + } + } + + ~GILReleaseGuard() { + if (released_) { + PyEval_RestoreThread(tstate_); + } + } + + // Non-copyable, non-movable + GILReleaseGuard(const GILReleaseGuard&) = delete; + GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; + +private: + PyThreadState* tstate_; + bool released_; +}; + +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +class GILAcquireGuard { public: - GILGuard() : acquired_(false) { + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing if (!Py_IsInitialized() || py_is_finalizing()) { return; } gstate_ = PyGILState_Ensure(); acquired_ = true; } - ~GILGuard() { + + ~GILAcquireGuard() { if (acquired_) { PyGILState_Release(gstate_); } } + bool acquired() const { return acquired_; } - GILGuard(const GILGuard&) = delete; - GILGuard& operator=(const GILGuard&) = delete; + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; private: PyGILState_STATE gstate_; bool acquired_; }; + #define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); @@ -135,7 +180,7 @@ static bool load_driver_api() noexcept { static constexpr const char* capsule_name = "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - GILGuard gil; + GILAcquireGuard gil; if (!gil.acquired()) { return false; } @@ -243,74 +288,6 @@ void clear_last_error() noexcept { err = CUDA_SUCCESS; } -// ============================================================================ -// GIL management helpers -// ============================================================================ - -// Helper to release the GIL while calling into the CUDA driver. -// This guard is *conditional*: if the caller already dropped the GIL, -// we avoid calling PyEval_SaveThread (which requires holding the GIL). -// It also handles the case where Python is finalizing and GIL operations -// are no longer safe. -class GILReleaseGuard { -public: - GILReleaseGuard() : tstate_(nullptr), released_(false) { - // Don't try to manipulate GIL if Python is finalizing - if (!Py_IsInitialized() || py_is_finalizing()) { - return; - } - // PyGILState_Check() returns 1 if the GIL is held by this thread. - if (PyGILState_Check()) { - tstate_ = PyEval_SaveThread(); - released_ = true; - } - } - - ~GILReleaseGuard() { - if (released_) { - PyEval_RestoreThread(tstate_); - } - } - - // Non-copyable, non-movable - GILReleaseGuard(const GILReleaseGuard&) = delete; - GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; - -private: - PyThreadState* tstate_; - bool released_; -}; - -// Helper to acquire the GIL when we might not hold it. -// Use in C++ destructors that need to manipulate Python objects. 
-class GILAcquireGuard { -public: - GILAcquireGuard() : acquired_(false) { - // Don't try to acquire GIL if Python is finalizing - if (!Py_IsInitialized() || py_is_finalizing()) { - return; - } - gstate_ = PyGILState_Ensure(); - acquired_ = true; - } - - ~GILAcquireGuard() { - if (acquired_) { - PyGILState_Release(gstate_); - } - } - - bool acquired() const { return acquired_; } - - // Non-copyable, non-movable - GILAcquireGuard(const GILAcquireGuard&) = delete; - GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; - -private: - PyGILState_STATE gstate_; - bool acquired_; -}; - // ============================================================================ // Context Handles // ============================================================================