[CUDA] Add sm_121/Blackwell to known targets (#24523)

## Summary
Add initial CUDA known-target support for `sm_121` (Blackwell, NVIDIA
GB10).

The CUDA execution limits are based on local cudaDeviceProp results from
an sm_121 device. Existing NVIDIA MMA ops are reused as a conservative
baseline until Blackwell-specific MMA intrinsics are modeled.

Related to #24477.
#24477 reports that IREE does not currently recognize newer Blackwell
CUDA targets such as `sm_120`. This PR addresses the same
target-enablement path for `sm_121`, which is the Blackwell target I can
validate locally on NVIDIA GB10.
It intentionally does not add `sm_120` support because I do not have
`sm_120` hardware to confirm the device limits or runtime behavior.

## Testing
Tested locally on NVIDIA GB10 (`sm_121`). `sm_121` requires PTX 8.8;
compilation succeeds when `+ptx88` is passed as a target feature.

Compiled and ran a local abs.mlir smoke test:
```
../iree-build/tools/iree-compile abs.mlir \
  --iree-hal-target-device=cuda \
  --iree-cuda-target=sm_121 \
  --iree-cuda-target-features=+ptx88 \
  -o abs_cuda.vmfb

../iree-build/tools/iree-run-module \
  --device=cuda \
  --module=abs_cuda.vmfb \
  --function=abs \
  --input=4xf32=-1,-2,3,-4
```

Results:
```
4xf32=1 2 3 4
```

Compiled and ran a local matmul.mlir smoke test:
```
../iree-build/tools/iree-compile matmul.mlir \
  --iree-hal-target-device=cuda \
  --iree-cuda-target=sm_121 \
  --iree-cuda-target-features=+ptx88 \
  -o matmul_cuda.vmfb

../iree-build/tools/iree-run-module \
  --device=cuda \
  --module=matmul_cuda.vmfb \
  --function=matmul \
  --input=128x256xf16=1 \
  --input=256x128xf16=1
```
Result: every element of the 128x128xf32 output is 256, as expected (each is the sum of 256 products of 1 x 1).

---------

Signed-off-by: Charlie-Tsai1123 <charlie1123tsai@gmail.com>
diff --git a/compiler/plugins/target/CUDA/test/target_device_features.mlir b/compiler/plugins/target/CUDA/test/target_device_features.mlir
index aa83c67..5cc399c 100644
--- a/compiler/plugins/target/CUDA/test/target_device_features.mlir
+++ b/compiler/plugins/target/CUDA/test/target_device_features.mlir
@@ -6,6 +6,8 @@
 // RUN:   --iree-cuda-target=rtx4090 %s | FileCheck %s --check-prefix=SM89
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=cuda},iree-hal-transformation-pipeline{serialize-executables=false})' \
 // RUN:   --iree-cuda-target=sm_89 --iree-cuda-target-features=+ptx80 %s | FileCheck %s --check-prefix=PTX80
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=cuda},iree-hal-transformation-pipeline{serialize-executables=false})' \
+// RUN:   --iree-cuda-target=sm_121 --iree-cuda-target-features=+ptx88 %s | FileCheck %s --check-prefix=SM121
 
 // SM89: target_info = #iree_gpu.target<arch = "sm_89", features = "+ptx78",
 // SM89-SAME: wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8,
@@ -16,6 +18,7 @@
 // SM89-SAME:         max_workgroup_counts = [2147483647, 65535, 65535]>>
 
 // PTX80: target_info = #iree_gpu.target<arch = "sm_89", features = "+ptx80",
+// SM121: target_info = #iree_gpu.target<arch = "sm_121", features = "+ptx88",
 
 stream.executable public @target_device_features {
   stream.executable.export @target_device_features workgroups()
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
index fc48dd5..8894df9 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -973,6 +973,32 @@
 // cooperative matrix layouts are opaque. We need to create NVIDIA specific WMMA
 // intrinsics if we need to have explicit layout analysis and register mapping.
 
+// Returns the static WGP capability table for sm_121 (Blackwell, GB10); limits
+// come from GB10 device properties and MMA ops reuse existing NV intrinsics.
+const WgpDetails *getSM121WgpDetails() {
+  static const MMAIntrinsic mmaOps[] = {
+      MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_F16,
+      MMAIntrinsic::NV_MMA_SYNC_F16_16x8x16_F16,
+      MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_BF16,
+      MMAIntrinsic::NV_WMMA_F32_16x16x16_F16,
+      MMAIntrinsic::NV_WMMA_F16_16x16x16_F16,
+  };
+  static const WgpDetails sm121Wgp = {allComputeBits,
+                                      allStorageBits,
+                                      allSubgroupOps,
+                                      allDotProductOps,
+                                      std::size(mmaOps),
+                                      mmaOps,
+                                      0,
+                                      nullptr,
+                                      {32, 32},
+                                      {1024, 1024, 64},
+                                      1024,
+                                      99 * 1024,
+                                      {0x7fffffff, 0xffff, 0xffff}};
+  return &sm121Wgp;
+}
+
 // Reports Ampere-class NVIDIA tensor core capabilities for GPU target
 // selection.
 const WgpDetails *getAmpereWgpDetails() {
@@ -1060,6 +1086,7 @@
 
 // Maps NVIDIA target aliases to the GPU capability model used by codegen.
 std::optional<TargetDetails> getNVIDIAGPUTargetDetails(StringRef target) {
+  const WgpDetails *sm121Wgp = getSM121WgpDetails();
   const WgpDetails *ampereWgp = getAmpereWgpDetails();
   const WgpDetails *turingWgp = getTuringWgpDetails();
   const WgpDetails *voltaWgp = getVoltaWgpDetails();
@@ -1098,6 +1125,10 @@
       .Case("rtx3070ti", TargetDetails{ampereWgp, &rtx3070tiChip})
       // https://www.techpowerup.com/gpu-specs/geforce-rtx-3070.c3674
       .Case("rtx3070", TargetDetails{ampereWgp, &rtx3070Chip})
+      // Initial support for sm_121 / GB10. Other Blackwell compute
+      // capabilities, including sm_120, are intentionally left for follow-up
+      // validation.
+      .Case("sm_121", TargetDetails{sm121Wgp, nullptr})
       .Cases({"ada", "sm_89"}, TargetDetails{ampereWgp, nullptr})
       .Cases({"ampere", "sm_80", "sm_86", "sm_87"},
              TargetDetails{ampereWgp, nullptr})