diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index da6972f372..451ca25dda 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -85,9 +85,12 @@ cdef class DeviceProperties: cdef inline int _get_cached_attribute(self, attr, default=0) except? -2: """Retrieve the attribute value, using cache if applicable.""" - if attr not in self._cache: - self._cache[attr] = self._get_attribute(attr, default) - return self._cache[attr] + cached = self._cache.get(attr) + if cached is not None: + return cached + cdef int value = self._get_attribute(attr, default) + self._cache[attr] = value # setdefault not needed for ints + return value @property def max_threads_per_block(self) -> int: @@ -1131,11 +1134,11 @@ class Device: def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" cdef DeviceProperties prop = self.properties - if "compute_capability" in prop._cache: - return prop._cache["compute_capability"] + cached = prop._cache.get("compute_capability") + if cached is not None: + return cached cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor) - prop._cache["compute_capability"] = cc - return cc + return prop._cache.setdefault("compute_capability", cc) @property def arch(self) -> str: diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd index d618c24cf1..98f91ab473 100644 --- a/cuda_core/cuda/core/_device_resources.pxd +++ b/cuda_core/cuda/core/_device_resources.pxd @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cython + from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle @@ -15,6 +17,7 @@ cdef class SMResource: unsigned int _flags bint _is_usable object __weakref__ + cython.pymutex _split_mutex @staticmethod cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index ecd9e00bf0..bafc462c93 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -498,10 +498,11 @@ cdef class SMResource: ) _resolve_group_count(opts) _check_green_ctx_support() - if _can_use_structured_sm_split(): - return _split_with_general_api(self, opts, dry_run) - # SplitByCount requires the same 12.4+ as green ctx support (already checked above) - return _split_with_count_api(self, opts, dry_run) + with self._split_mutex: + if _can_use_structured_sm_split(): + return _split_with_general_api(self, opts, dry_run) + # SplitByCount requires the same 12.4+ as green ctx support (already checked above) + return _split_with_count_api(self, opts, dry_run) cdef class WorkqueueResource: diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 98c4b50db3..83dcd4f68c 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t +from libcpp cimport bool as cpp_bool +from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle @@ -18,13 +20,13 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - DevicePtrHandle _h_ptr - MemoryResource _memory_resource - object _ipc_data - object _owner - _MemAttrs _mem_attrs - bint _mem_attrs_inited - object __weakref__ + DevicePtrHandle _h_ptr + MemoryResource _memory_resource + object _ipc_data + object _owner + _MemAttrs _mem_attrs + std_atomic[cpp_bool] _mem_attrs_inited + object __weakref__ cdef public: # Python code in _memory/_virtual_memory_resource.py needs to update # this value, though it is technically private. diff --git a/cuda_core/cuda/core/_memory/_buffer.pyi b/cuda_core/cuda/core/_memory/_buffer.pyi index 728853c4bc..7118a3a1e0 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyi +++ b/cuda_core/cuda/core/_memory/_buffer.pyi @@ -2,6 +2,7 @@ from __future__ import annotations +import cython from cuda.core._memory._device_memory_resource import DeviceMemoryResource from cuda.core._memory._ipc import IPCBufferDescriptor from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource @@ -88,6 +89,7 @@ class Buffer: """ @property + @cython.critical_section def ipc_descriptor(self) -> IPCBufferDescriptor: """Descriptor for sharing this buffer with other processes.""" diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 88f9054385..00359c1f0b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -96,7 +96,7 @@ cdef class Buffer: self._memory_resource = None self._ipc_data = None self._owner = None - self._mem_attrs_inited = False + self._mem_attrs_inited.store(False) def __init__(self, *args, **kwargs) -> None: raise RuntimeError("Buffer objects cannot be instantiated directly. " @@ -126,7 +126,7 @@ cdef class Buffer: self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None self._owner = owner - self._mem_attrs_inited = False + self._mem_attrs_inited.store(False) return self @staticmethod @@ -191,6 +191,7 @@ cdef class Buffer: return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream) @property + @cython.critical_section def ipc_descriptor(self) -> IPCBufferDescriptor: """Descriptor for sharing this buffer with other processes.""" if self._ipc_data is None: @@ -445,11 +446,12 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ +@cython.critical_section cdef inline void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" - if not self._mem_attrs_inited: + if not self._mem_attrs_inited.load(memory_order_acquire): _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) - self._mem_attrs_inited = True + self._mem_attrs_inited.store(True, memory_order_release) cdef inline int _query_memory_attrs( @@ -597,7 +599,7 @@ cdef Buffer Buffer_from_deviceptr_handle( buf._memory_resource = mr buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None buf._owner = None - buf._mem_attrs_inited = False + buf._mem_attrs_inited.store(False) return buf diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi index 4ff85eb597..b34f968fdc 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from functools import cache - from cuda.core._device import Device from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._stream import Stream @@ -113,7 +111,6 @@ class GraphMemoryResource(cyGraphMemoryResource): ... @classmethod - @cache def _create(cls, device_id: int) -> GraphMemoryResource: ... __all__ = ['GraphMemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 479322ab01..e845a47b08 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -18,7 +18,6 @@ from cuda.core._resource_handles cimport ( from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from functools import cache from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -161,6 +160,8 @@ cdef class cyGraphMemoryResource(MemoryResource): return False +cdef dict _mem_resource_cache = {} + class GraphMemoryResource(cyGraphMemoryResource): """ A memory resource for memory related to graphs. @@ -185,9 +186,16 @@ class GraphMemoryResource(cyGraphMemoryResource): return cls._create(c_device_id) @classmethod - @cache def _create(cls, int device_id) -> GraphMemoryResource: - return cyGraphMemoryResource.__new__(cls, device_id) + # we use a dict currently, because functools.cache is currently less + # thread-safe see also: https://github.com/python/cpython/issues/150708 + res = _mem_resource_cache.get(device_id) + if res is not None: + return res + + # create new instance, but in case of a race may return another: + new = cyGraphMemoryResource.__new__(cls, device_id) + return _mem_resource_cache.setdefault(device_id, new) # Raise an exception if the given stream is capturing. diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyi b/cuda_core/cuda/core/_memory/_memory_pool.pyi index 3d15d9f679..7f8c64aedd 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyi +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyi @@ -4,6 +4,7 @@ from __future__ import annotations import uuid +import cython from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._stream import Stream from cuda.core.graph import GraphBuilder @@ -97,6 +98,7 @@ class _MemPool(MemoryResource): """ @property + @cython.critical_section def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index c6276f0f3d..ddcac2d606 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t from libc.string cimport memset @@ -177,6 +178,7 @@ cdef class _MemPool(MemoryResource): _MP_deallocate(self, ptr, size, s) @property + @cython.critical_section def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" if self._attributes is None: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index c65107ae27..260980c1da 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from ._dlpack cimport * from ._dlpack import classify_dl_device from libc.stdint cimport intptr_t @@ -80,7 +81,7 @@ cdef inline bint _is_torch_tensor(object obj): cdef str mod = tp.__module__ or "" cdef bint result = mod.startswith("torch") and hasattr(obj, "data_ptr") \ and _torch_version_check() - _torch_type_cache[tp] = result + _torch_type_cache[tp] = result # setdefault not needed for bools return result @@ -539,6 +540,7 @@ cdef class StridedMemoryView: + f" readonly={self.readonly},\n" + f" exporting_obj={get_simple_repr(self.exporting_obj)})") + @cython.critical_section cdef inline _StridedLayout get_layout(self): if self._layout is None: if self.dl_tensor: @@ -549,6 +551,7 @@ cdef class StridedMemoryView: raise ValueError("Cannot infer layout from the exporting object") return self._layout + @cython.critical_section cdef inline object get_buffer(self): """ Returns Buffer instance with the underlying data. @@ -562,6 +565,7 @@ cdef class StridedMemoryView: self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj) return self._buffer + @cython.critical_section cdef inline object get_dtype(self): if self._dtype is None: if self.dl_tensor != NULL: diff --git a/cuda_core/cuda/core/_module.pyi b/cuda_core/cuda/core/_module.pyi index caf6b09b71..5125b99131 100644 --- a/cuda_core/cuda/core/_module.pyi +++ b/cuda_core/cuda/core/_module.pyi @@ -5,6 +5,7 @@ from __future__ import annotations from collections import namedtuple from os import PathLike +import cython from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig from cuda.core._stream import Stream @@ -253,6 +254,7 @@ class Kernel: ... @property + @cython.critical_section def attributes(self) -> KernelAttributes: """Get the read-only attributes of this kernel.""" @@ -265,6 +267,7 @@ class Kernel: """list[ParamInfo]: (offset, size) for each argument of this function""" @property + @cython.critical_section def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel.""" diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 5cb1b7f005..91c8ad4389 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from libc.stddef cimport size_t from collections import namedtuple @@ -83,7 +84,7 @@ cdef class KernelAttributes: cdef int result with nogil: HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id)) - self._cache[cache_key] = result + self._cache[cache_key] = result # setdefault not needed for ints return result def __getitem__(self, device: Device | int) -> KernelAttributes: @@ -454,6 +455,7 @@ cdef class Kernel: return ker @property + @cython.critical_section def attributes(self) -> KernelAttributes: """Get the read-only attributes of this kernel.""" if self._attributes is None: @@ -501,6 +503,7 @@ cdef class Kernel: return param_info @property + @cython.critical_section def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel.""" if self._occupancy is None: @@ -742,6 +745,7 @@ cdef class ObjectCode: # TODO: do we want to unload in a finalizer? Probably not.. + @cython.critical_section cdef int _lazy_load_module(self) except -1: if self._h_library: return 0 diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index f627edf9bb..d3d684aff3 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -78,8 +78,7 @@ _node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueD cdef inline GraphNode _registered(GraphNode n): - _node_registry[n._h_node.get()] = n - return n + return _node_registry.setdefault(n._h_node.get(), n) cdef class GraphNode: @@ -162,10 +161,11 @@ cdef class GraphNode: cdef cydriver.CUgraphNode node = as_cu(self._h_node) if node == NULL: return - with nogil: - HANDLE_RETURN(cydriver.cuGraphDestroyNode(node)) + _node_registry.pop(self._h_node.get(), None) invalidate_graph_node(self._h_node) + with nogil: + HANDLE_RETURN(cydriver.cuGraphDestroyNode(node)) @property def pred(self) -> AdjacencySetProxy: