[CUDA] Add sm_121/Blackwell to known targets (#24523)
## Summary
Add initial CUDA known-target support for `sm_121` (NVIDIA GB10,
Blackwell).
The CUDA execution limits are based on local cudaDeviceProp results from
an sm_121 device. Existing NVIDIA MMA ops are reused as a conservative
baseline until Blackwell-specific MMA intrinsics are modeled.
Related to #24477.
#24477 reports that IREE does not currently recognize newer Blackwell
CUDA targets such as `sm_120`. This PR addresses the same
target-enablement path for `sm_121`, which is the Blackwell target I can
validate locally on NVIDIA GB10.
It intentionally does not add `sm_120` support because I do not have
`sm_120` hardware to confirm the device limits or runtime behavior.
## Testing
Tested locally on NVIDIA GB10 / `sm_121`. `sm_121` requires PTX 8.8, so the
commands below pass `+ptx88`; with that feature, compilation succeeds.
Compiled and ran a local abs.mlir smoke test:
```
../iree-build/tools/iree-compile abs.mlir \
--iree-hal-target-device=cuda \
--iree-cuda-target=sm_121 \
--iree-cuda-target-features=+ptx88 \
-o abs_cuda.vmfb
../iree-build/tools/iree-run-module \
--device=cuda \
--module=abs_cuda.vmfb \
--function=abs \
--input=4xf32=-1,-2,3,-4
```
Results:
```
4xf32=1 2 3 4
```
Compiled and ran a local matmul.mlir smoke test:
```
../iree-build/tools/iree-compile matmul.mlir \
--iree-hal-target-device=cuda \
--iree-cuda-target=sm_121 \
--iree-cuda-target-features=+ptx88 \
-o matmul_cuda.vmfb
../iree-build/tools/iree-run-module \
--device=cuda \
--module=matmul_cuda.vmfb \
--function=matmul \
--input=128x256xf16=1 \
--input=256x128xf16=1
```
Result: all values of the 128x128xf32 output are 256, as expected.
---------
Signed-off-by: Charlie-Tsai1123 <charlie1123tsai@gmail.com>
diff --git a/compiler/plugins/target/CUDA/test/target_device_features.mlir b/compiler/plugins/target/CUDA/test/target_device_features.mlir
index aa83c67..5cc399c 100644
--- a/compiler/plugins/target/CUDA/test/target_device_features.mlir
+++ b/compiler/plugins/target/CUDA/test/target_device_features.mlir
@@ -6,6 +6,8 @@
// RUN: --iree-cuda-target=rtx4090 %s | FileCheck %s --check-prefix=SM89
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=cuda},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-cuda-target=sm_89 --iree-cuda-target-features=+ptx80 %s | FileCheck %s --check-prefix=PTX80
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=cuda},iree-hal-transformation-pipeline{serialize-executables=false})' \
+// RUN: --iree-cuda-target=sm_121 --iree-cuda-target-features=+ptx88 %s | FileCheck %s --check-prefix=SM121
// SM89: target_info = #iree_gpu.target<arch = "sm_89", features = "+ptx78",
// SM89-SAME: wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8,
@@ -16,6 +18,7 @@
// SM89-SAME: max_workgroup_counts = [2147483647, 65535, 65535]>>
// PTX80: target_info = #iree_gpu.target<arch = "sm_89", features = "+ptx80",
+// SM121: target_info = #iree_gpu.target<arch = "sm_121", features = "+ptx88",
stream.executable public @target_device_features {
stream.executable.export @target_device_features workgroups()
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
index fc48dd5..8894df9 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -973,6 +973,32 @@
// cooperative matrix layouts are opaque. We need to create NVIDIA specific WMMA
// intrinsics if we need to have explicit layout analysis and register mapping.
+// Returns the static capability table used for the sm_121 (NVIDIA GB10) target.
+// Limits follow GB10 device properties; MMA ops are a reused NVIDIA baseline.
+const WgpDetails *getSM121WgpDetails() {
+ static const MMAIntrinsic mmaOps[] = { // no Blackwell-specific entries yet
+ MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_F16,
+ MMAIntrinsic::NV_MMA_SYNC_F16_16x8x16_F16,
+ MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_BF16,
+ MMAIntrinsic::NV_WMMA_F32_16x16x16_F16,
+ MMAIntrinsic::NV_WMMA_F16_16x16x16_F16,
+ };
+ static const WgpDetails sm121Wgp = {allComputeBits,
+ allStorageBits,
+ allSubgroupOps,
+ allDotProductOps,
+ std::size(mmaOps), // number of entries in mmaOps
+ mmaOps,
+ 0, // second count/pointer pair left empty
+ nullptr, // NOTE(review): presumably scaled MMA ops - confirm
+ {32, 32}, // presumably subgroup size choices - confirm
+ {1024, 1024, 64}, // presumably per-dimension workgroup size limits
+ 1024, // presumably max threads per workgroup - confirm
+ 99 * 1024, // presumably max workgroup memory in bytes
+ {0x7fffffff, 0xffff, 0xffff}}; // presumably max workgroup counts
+ return &sm121Wgp; // function-local static, so the pointer stays valid
+}
+
// Reports Ampere-class NVIDIA tensor core capabilities for GPU target
// selection.
const WgpDetails *getAmpereWgpDetails() {
@@ -1060,6 +1086,7 @@
// Maps NVIDIA target aliases to the GPU capability model used by codegen.
std::optional<TargetDetails> getNVIDIAGPUTargetDetails(StringRef target) {
+ const WgpDetails *sm121Wgp = getSM121WgpDetails();
const WgpDetails *ampereWgp = getAmpereWgpDetails();
const WgpDetails *turingWgp = getTuringWgpDetails();
const WgpDetails *voltaWgp = getVoltaWgpDetails();
@@ -1098,6 +1125,10 @@
.Case("rtx3070ti", TargetDetails{ampereWgp, &rtx3070tiChip})
// https://www.techpowerup.com/gpu-specs/geforce-rtx-3070.c3674
.Case("rtx3070", TargetDetails{ampereWgp, &rtx3070Chip})
+ // Initial support for sm_121 / GB10. Other Blackwell compute
+ // capabilities, including sm_120, are intentionally left for follow-up
+ // validation.
+ .Case("sm_121", TargetDetails{sm121Wgp, nullptr})
.Cases({"ada", "sm_89"}, TargetDetails{ampereWgp, nullptr})
.Cases({"ampere", "sm_80", "sm_86", "sm_87"},
TargetDetails{ampereWgp, nullptr})