diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 35c5e6c3734..a57107bb052 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -130,21 +130,21 @@ jobs: path: ./cuda_pathfinder - name: Download cuda-python build artifacts - if: ${{ env.USE_BACKPORT_BINDINGS == '0' }} + if: ${{ env.BINDINGS_SOURCE == 'main' }} uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: cuda-python-wheel path: . - name: Download cuda.bindings build artifacts - if: ${{ env.USE_BACKPORT_BINDINGS == '0' }} + if: ${{ env.BINDINGS_SOURCE == 'main' }} uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - name: Download cuda-python & cuda.bindings build artifacts from the prior branch - if: ${{ env.USE_BACKPORT_BINDINGS == '1' }} + if: ${{ env.BINDINGS_SOURCE == 'backport' }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | @@ -184,6 +184,7 @@ jobs: ls -lahR . 
- name: Display structure of downloaded cuda.bindings artifacts + if: ${{ env.BINDINGS_SOURCE != 'published' }} run: | pwd ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR @@ -285,6 +286,7 @@ jobs: run: run-tests core - name: Ensure cuda-python installable + if: ${{ env.BINDINGS_SOURCE == 'main' }} run: | if [[ "${{ matrix.LOCAL_CTK }}" == 1 ]]; then pip install --only-binary=:all: cuda_python*.whl diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 765823c6bfc..b9e03d12851 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -125,21 +125,21 @@ jobs: path: ./cuda_pathfinder - name: Download cuda-python build artifacts - if: ${{ env.USE_BACKPORT_BINDINGS == '0' }} + if: ${{ env.BINDINGS_SOURCE == 'main' }} uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: cuda-python-wheel path: . - name: Download cuda.bindings build artifacts - if: ${{ env.USE_BACKPORT_BINDINGS == '0' }} + if: ${{ env.BINDINGS_SOURCE == 'main' }} uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - name: Download cuda-python & cuda.bindings build artifacts from the prior branch - if: ${{ env.USE_BACKPORT_BINDINGS == '1' }} + if: ${{ env.BINDINGS_SOURCE == 'backport' }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | @@ -170,6 +170,7 @@ jobs: Get-ChildItem -Recurse -Force | Select-Object Mode, LastWriteTime, Length, FullName - name: Display structure of downloaded cuda.bindings artifacts + if: ${{ env.BINDINGS_SOURCE != 'published' }} run: | Get-Location Get-ChildItem -Recurse -Force $env:CUDA_BINDINGS_ARTIFACTS_DIR | Select-Object Mode, LastWriteTime, Length, FullName @@ -261,6 +262,7 @@ jobs: run: run-tests core - name: Ensure cuda-python installable + if: ${{ env.BINDINGS_SOURCE == 'main' }} run: | if ('${{ matrix.LOCAL_CTK }}' -eq '1') 
{ pip install --only-binary=:all: (Get-ChildItem -Filter cuda_python*.whl).FullName diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 17db607c29b..30fac1cdce8 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -52,34 +52,38 @@ elif [[ "${1}" == "test" ]]; then BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${BUILD_CUDA_VER})" TEST_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})" CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${BUILD_CUDA_VER}-${HOST_PLATFORM}" - # USE_BACKPORT_BINDINGS flags the CTK-major-mismatch case where the - # current-run bindings wheel was built for a different CTK major than the - # one under test, so we must pull the bindings wheel from the backport - # branch instead. This is independent of whether bindings tests run. - # SKIP_CUDA_BINDINGS_TEST is the test-time gate: it is set when the CTK - # majors differ OR when the caller tells us to skip for path-filter - # reasons via SKIP_BINDINGS_TEST_OVERRIDE. + + # BINDINGS_SOURCE controls which cuda-bindings to install at test time: + # main — use the just-built bindings wheel from this CI run + # backport — fetch bindings from the prior (N-1) branch + # published — install from PyPI (cuda-bindings==${TEST_CUDA_MAJOR}.${TEST_CUDA_MINOR}.*) + # + # SKIP_CUDA_BINDINGS_TEST / SKIP_CYTHON_TEST control which *tests* to run + # (they do NOT affect installation — that's BINDINGS_SOURCE's job). + + BUILD_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${BUILD_CUDA_VER})" + TEST_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${CUDA_VER})" + if [[ ${BUILD_CUDA_MAJOR} != ${TEST_CUDA_MAJOR} ]]; then - USE_BACKPORT_BINDINGS=1 + # Major mismatch (e.g. build=13.x, test=12.x): use the backport branch. + BINDINGS_SOURCE=backport + SKIP_CUDA_BINDINGS_TEST=1 + SKIP_CYTHON_TEST=1 + elif [[ ${BUILD_CUDA_MINOR} != ${TEST_CUDA_MINOR} ]]; then + # Same major, minor mismatch (e.g. build=13.2, test=13.0): use published + # bindings from PyPI to test the real-world backward-compat scenario. 
+ BINDINGS_SOURCE=published SKIP_CUDA_BINDINGS_TEST=1 SKIP_CYTHON_TEST=1 else - USE_BACKPORT_BINDINGS=0 - # Path-filter override only skips bindings tests, NOT cython tests - # for other modules (e.g. cuda.core). Cython skip is driven solely - # by the build/test CTK minor-version mismatch. + # Exact match: use the just-built bindings wheel. + BINDINGS_SOURCE=main if [[ "${SKIP_BINDINGS_TEST_OVERRIDE:-0}" == "1" ]]; then SKIP_CUDA_BINDINGS_TEST=1 else SKIP_CUDA_BINDINGS_TEST=0 fi - BUILD_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${BUILD_CUDA_VER})" - TEST_CUDA_MINOR="$(cut -d '.' -f 2 <<< ${CUDA_VER})" - if [[ ${BUILD_CUDA_MINOR} != ${TEST_CUDA_MINOR} ]]; then - SKIP_CYTHON_TEST=1 - else - SKIP_CYTHON_TEST=0 - fi + SKIP_CYTHON_TEST=0 fi # We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix @@ -93,10 +97,11 @@ elif [[ "${1}" == "test" ]]; then fi { echo "SETUP_SANITIZER=${SETUP_SANITIZER}" + echo "BINDINGS_SOURCE=${BINDINGS_SOURCE}" echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" echo "SKIP_CYTHON_TEST=${SKIP_CYTHON_TEST}" echo "TEST_CUDA_MAJOR=${TEST_CUDA_MAJOR}" - echo "USE_BACKPORT_BINDINGS=${USE_BACKPORT_BINDINGS}" + echo "TEST_CUDA_MINOR=${TEST_CUDA_MINOR}" } >> $GITHUB_ENV fi diff --git a/ci/tools/run-tests b/ci/tools/run-tests index d42634a7073..eb741f4f61f 100755 --- a/ci/tools/run-tests +++ b/ci/tools/run-tests @@ -54,10 +54,12 @@ elif [[ "${test_module}" == "bindings" ]]; then fi popd elif [[ "${test_module}" == "core" ]]; then - # If build/test majors match: cuda.bindings is installed in the previous step. - # If mismatch: cuda.bindings is installed from the backport branch. - if [[ "${SKIP_CUDA_BINDINGS_TEST}" == 1 ]]; then - echo "Installing bindings wheel" + # Install cuda.bindings for core tests based on BINDINGS_SOURCE. 
+ if [[ "${BINDINGS_SOURCE}" == "published" ]]; then + echo "Installing published cuda-bindings==${TEST_CUDA_MAJOR}.${TEST_CUDA_MINOR}.* from PyPI" + pip install "cuda-bindings==${TEST_CUDA_MAJOR}.${TEST_CUDA_MINOR}.*" + elif [[ "${BINDINGS_SOURCE}" == "backport" || "${BINDINGS_SOURCE}" == "main" ]]; then + echo "Installing bindings wheel (source: ${BINDINGS_SOURCE})" if [[ "${LOCAL_CTK}" == 1 ]]; then pip install "${CUDA_BINDINGS_ARTIFACTS_DIR}"/*.whl else @@ -83,6 +85,8 @@ elif [[ "${test_module}" == "core" ]]; then # Constrain cuda-toolkit to the requested CTK version to avoid # pip pulling in a newer nvidia-cuda-runtime that conflicts with it. pip install "${WHL_EXTRA[@]}" --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*" + echo "Installed packages before core tests:" + pip list echo "Running core tests" ${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/ # Currently our CI always installs the latest bindings (from either major version). 
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index ff0e0db5066..f846308af70 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -77,6 +77,13 @@ decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr; decltype(&cuGraphicsUnmapResources) p_cuGraphicsUnmapResources = nullptr; decltype(&cuGraphicsUnregisterResource) p_cuGraphicsUnregisterResource = nullptr; +// SM resource split (13.1+ — may be null on older drivers/bindings) +#if CUDA_VERSION >= 13010 +decltype(&cuDevSmResourceSplit) p_cuDevSmResourceSplit = nullptr; +#else +void* p_cuDevSmResourceSplit = nullptr; +#endif + // NVRTC function pointers decltype(&nvrtcDestroyProgram) p_nvrtcDestroyProgram = nullptr; @@ -1319,4 +1326,27 @@ FileDescriptorHandle create_fd_handle_ref(int fd) { #endif } +// ============================================================================ +// SM resource split wrapper +// ============================================================================ + +CUresult sm_resource_split(CUdevResource* result, unsigned int nbGroups, + const CUdevResource* input, CUdevResource* remainder, + unsigned int flags, void* groupParams) { +#if CUDA_VERSION >= 13010 + if (!p_cuDevSmResourceSplit) { + return CUDA_ERROR_NOT_SUPPORTED; + } + return p_cuDevSmResourceSplit( + result, nbGroups, input, remainder, flags, + static_cast<CU_DEV_SM_RESOURCE_GROUP_PARAMS*>(groupParams)); +#else + return CUDA_ERROR_NOT_SUPPORTED; +#endif +} + +bool has_sm_resource_split() noexcept { + return p_cuDevSmResourceSplit != nullptr; +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 3c195f1f1ad..1162ec89843 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -108,6 +108,15 @@ extern decltype(&cuLinkDestroy) p_cuLinkDestroy; extern decltype(&cuGraphicsUnmapResources) 
p_cuGraphicsUnmapResources; extern decltype(&cuGraphicsUnregisterResource) p_cuGraphicsUnregisterResource; +// SM resource split (13.1+ — may be null on older drivers/bindings) +#if CUDA_VERSION >= 13010 +extern decltype(&cuDevSmResourceSplit) p_cuDevSmResourceSplit; +#else +// cuDevSmResourceSplit doesn't exist in CUDA < 13.1 headers, so use a +// void* placeholder. The pointer is always null when built against 12.x. +extern void* p_cuDevSmResourceSplit; +#endif + // ============================================================================ // NVRTC function pointers // @@ -747,4 +756,22 @@ inline PyObject* as_py(const FileDescriptorHandle& h) noexcept { return PyLong_FromSsize_t(as_intptr(h)); } +// ============================================================================ +// SM resource split wrapper (13.1+) +// +// Calls through p_cuDevSmResourceSplit if available, otherwise returns +// CUDA_ERROR_NOT_SUPPORTED. This avoids a direct Cython cimport of the +// cydriver cdef function, which would fail at module init on cuda-bindings +// < 13.1 (see https://github.com/NVIDIA/cuda-python/issues/2063). +// ============================================================================ + +// groupParams is void* so the Cython declaration doesn't reference +// CU_DEV_SM_RESOURCE_GROUP_PARAMS (absent from cuda-bindings 13.0 .pxd). +CUresult sm_resource_split(CUdevResource* result, unsigned int nbGroups, + const CUdevResource* input, CUdevResource* remainder, + unsigned int flags, void* groupParams); + +// Returns true if the cuDevSmResourceSplit function pointer is available. +bool has_sm_resource_split() noexcept; + } // namespace cuda_core diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index 40c0a874d05..5ddc76dcacd 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -203,6 +203,9 @@ cdef inline unsigned int _to_sm_count(object value) except? 
0: return <unsigned int>(value) +IF CUDA_CORE_BUILD_MAJOR >= 13: + from cuda.core._resource_handles cimport sm_resource_split, has_sm_resource_split + cdef int _structured_split_checked = 0 cdef inline bint _can_use_structured_sm_split(): @@ -211,7 +214,9 @@ cdef inline bint _can_use_structured_sm_split(): if _structured_split_checked != 0: return _structured_split_checked == 1 IF CUDA_CORE_BUILD_MAJOR >= 13: - if cy_driver_version() >= (13, 1, 0) and cy_binding_version() >= (13, 1, 0): + if (has_sm_resource_split() + and cy_driver_version() >= (13, 1, 0) + and cy_binding_version() >= (13, 1, 0)): _structured_split_checked = 1 return True _structured_split_checked = -1 @@ -300,13 +305,13 @@ IF CUDA_CORE_BUILD_MAJOR >= 13: memset(&remaining, 0, sizeof(cydriver.CUdevResource)) with nogil: - HANDLE_RETURN(cydriver.cuDevSmResourceSplit( + HANDLE_RETURN(sm_resource_split( result, <unsigned int>(n_groups), &sm._resource, &remaining, 0, - params, + <void*>params, )) if result != NULL: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 8d07c27dedb..8d11ce4c735 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -222,3 +222,12 @@ cdef CuLinkHandle create_culink_handle_ref(cydriver.CUlinkState state) except+ n # File descriptor handles cdef FileDescriptorHandle create_fd_handle(int fd) except+ nogil cdef FileDescriptorHandle create_fd_handle_ref(int fd) except+ nogil + +# SM resource split (13.1+ — calls through function pointer, safe on older bindings) +# groupParams is void* here to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS +# (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it. 
+cdef cydriver.CUresult sm_resource_split( + cydriver.CUdevResource* result, unsigned int nbGroups, + const cydriver.CUdevResource* input, cydriver.CUdevResource* remainder, + unsigned int flags, void* groupParams) nogil +cdef bint has_sm_resource_split() noexcept nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index a1dc05464ac..cbb0fdb8433 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -208,6 +208,15 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": FileDescriptorHandle create_fd_handle_ref "cuda_core::create_fd_handle_ref" ( int fd) except+ nogil + # SM resource split (13.1+ wrapper — avoids direct cydriver cimport) + # groupParams is void* to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS + # (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it. + cydriver.CUresult sm_resource_split "cuda_core::sm_resource_split" ( + cydriver.CUdevResource* result, unsigned int nbGroups, + const cydriver.CUdevResource* input, cydriver.CUdevResource* remainder, + unsigned int flags, void* groupParams) nogil + bint has_sm_resource_split "cuda_core::has_sm_resource_split" () noexcept nogil + # ============================================================================= # CUDA Driver API capsule @@ -290,6 +299,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void* p_cuGraphicsUnmapResources "reinterpret_cast<void*&>(cuda_core::p_cuGraphicsUnmapResources)" void* p_cuGraphicsUnregisterResource "reinterpret_cast<void*&>(cuda_core::p_cuGraphicsUnregisterResource)" + # SM resource split (13.1+) + void* p_cuDevSmResourceSplit "reinterpret_cast<void*&>(cuda_core::p_cuDevSmResourceSplit)" + # NVRTC void* p_nvrtcDestroyProgram "reinterpret_cast<void*&>(cuda_core::p_nvrtcDestroyProgram)" @@ -372,6 +384,9 @@ p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") p_cuGraphicsUnmapResources = _get_driver_fn("cuGraphicsUnmapResources") 
p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") +# SM resource split (13.1+ — may not exist in older cuda-bindings) +p_cuDevSmResourceSplit = _get_optional_driver_fn("cuDevSmResourceSplit") + # ============================================================================= # NVRTC function pointer initialization # ============================================================================= diff --git a/cuda_core/docs/source/release/1.0.1-notes.rst b/cuda_core/docs/source/release/1.0.1-notes.rst index 8654f70e7fd..b3cc3b44965 100644 --- a/cuda_core/docs/source/release/1.0.1-notes.rst +++ b/cuda_core/docs/source/release/1.0.1-notes.rst @@ -10,6 +10,11 @@ Fixes and enhancements ---------------------- +- Fixed ``ImportError`` when importing ``cuda.core`` with + ``cuda-bindings`` 13.0.x due to an unavailable driver function + (``cuDevSmResourceSplit``). + (`#2063 <https://github.com/NVIDIA/cuda-python/issues/2063>`__, + `#2064 <https://github.com/NVIDIA/cuda-python/pull/2064>`__) - When iterating over MIG devices with ``cuda.core.system.Device.mig.get_all_devices``, only available MIG devices will be returned. Previously, if any MIG device was unavailable, an exception would