-
Notifications
You must be signed in to change notification settings - Fork 297
CI: allow specifying custom driver versions in test matrix #2176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b1b6070
3e016b5
c0ca869
4a23b23
d33a928
00896dc
0d5f0e9
3dfaa84
701cf2f
a3f1573
c5fef92
f17dd7f
6412f4f
2b34f1f
8d8a9ef
d2c25eb
fa7940a
8dfd06e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,7 +13,16 @@ | |
| # Windows entries also include DRIVER_MODE. | ||
| # | ||
| # Notes: | ||
| # - DRIVER accepts: | ||
| # * 'latest' - use the runner's pre-installed latest driver (no install step) | ||
| # * 'earliest' - use the runner's pre-installed earliest driver (no install step) | ||
| # * a version string (e.g. '580.65.06') | ||
| # - install that version via ci/tools/install_gpu_driver.sh (Linux) | ||
| # or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the | ||
| # job. The matrix row is routed to the 'latest' runner image (the | ||
| # install scripts swap the driver themselves). | ||
| # - DRIVER: 'earliest' does not work with CUDA 12.9.1 | ||
| # - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux. | ||
|
|
||
| linux: | ||
| pull-request: | ||
|
|
@@ -29,10 +38,10 @@ linux: | |
| - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
|
|
@@ -77,7 +86,7 @@ linux: | |
| - { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } | ||
| # nightly-numba-cuda | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| # nightly-standard (arm64 l4×2 — nightly-only per runner team request) | ||
|
|
@@ -116,4 +125,4 @@ windows: | |
| - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } | ||
| # nightly-numba-cuda | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI | ||
| # runner and cycle the display devices so the new mode takes effect | ||
| # without rebooting. Always runs (whether or not install_gpu_driver.ps1 | ||
| # just ran). When install_gpu_driver.ps1 has run, this single device | ||
| # cycle also activates the freshly-installed driver. | ||
| # | ||
| # Inputs (env): | ||
| # DRIVER_MODE One of WDDM, TCC, MCDM. | ||
|
|
||
| # Set-DriverMode -- apply the driver mode named in $env:DRIVER_MODE, restart | ||
| # the NVIDIA display devices, then wait for nvidia-smi to succeed again. | ||
| # | ||
| # Steps, in order: | ||
| # 1. Translate DRIVER_MODE (WDDM / TCC / MCDM) into `nvidia-smi -fdm` 0 / 1 / 2. | ||
| # Any other value prints a message and exits with code 1. | ||
| # 2. Disable and re-enable every Display-class PnP device whose friendly | ||
| # name matches "NVIDIA*" via pnputil. | ||
| # 3. Probe nvidia-smi every 2 seconds until it exits 0 or a 60-second | ||
| # deadline passes; on timeout, write an error and exit with code 1. | ||
| # | ||
| # Takes no parameters and emits only progress messages. | ||
| function Set-DriverMode { | ||
|
|
||
| # Map matrix DRIVER_MODE to nvidia-smi -fdm code. | ||
| # This assumes prior knowledge of which GPU supports which mode. | ||
| # PowerShell's -eq is case-insensitive for strings, so e.g. "tcc" also matches. | ||
| # NOTE(review): the exit code of `nvidia-smi -fdm` is not checked here, so a | ||
| # failed mode switch is not detected by this function -- confirm intentional. | ||
| $driver_mode = $env:DRIVER_MODE | ||
| if ($driver_mode -eq "WDDM") { | ||
| Write-Output "Setting driver mode to WDDM..." | ||
| nvidia-smi -fdm 0 | ||
| } elseif ($driver_mode -eq "TCC") { | ||
| Write-Output "Setting driver mode to TCC..." | ||
| nvidia-smi -fdm 1 | ||
| } elseif ($driver_mode -eq "MCDM") { | ||
| Write-Output "Setting driver mode to MCDM..." | ||
| nvidia-smi -fdm 2 | ||
| } else { | ||
| Write-Output "Unknown driver mode: $driver_mode" | ||
| exit 1 | ||
| } | ||
|
|
||
| # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) | ||
| # NOTE(review): pnputil exit codes are not checked either; a failed | ||
| # disable/enable only surfaces if the nvidia-smi poll below times out. | ||
| $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" | ||
| foreach ($device in $nvidia_devices) { | ||
| Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" | ||
| pnputil /disable-device "$($device.InstanceId)" | ||
| pnputil /enable-device "$($device.InstanceId)" | ||
| } | ||
|
leofang marked this conversation as resolved.
|
||
|
|
||
| # Poll nvidia-smi until NVML can initialize, or give up after ~60s. | ||
| # A fixed sleep is not enough on multi-GPU rows that are slow to come | ||
| # back up (e.g. 2x H100 MCDM), where pnputil enable returns before NVML | ||
| # is ready. Pattern borrowed from the runner-team `nvgha-driver.ps1`. | ||
| Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." | ||
| $deadline = (Get-Date).AddSeconds(60) | ||
| # do/while: the body always runs at least once, and it sleeps before each | ||
| # probe. nvidia-smi output is discarded; only $LASTEXITCODE is consulted. | ||
| do { | ||
| Start-Sleep -Seconds 2 | ||
| & nvidia-smi.exe 2>&1 | Out-Null | ||
| } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline) | ||
| # A non-zero code here means the loop ended on the deadline, not on success. | ||
| if ($LASTEXITCODE -ne 0) { | ||
| Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle" | ||
| exit 1 | ||
| } | ||
| } | ||
|
|
||
| # Entry point: apply the driver mode requested via the DRIVER_MODE env var. | ||
| Set-DriverMode | ||
Uh oh!
There was an error while loading. Please reload this page.