[Transform] Add subgroup size setting when mapping to gpu threads (#14309)

For targets that support a configurable subgroup size, the requested size is
passed down to the runtime through a `subgroup_size` attribute on the
`hal.executable.export` op, mirroring the way `workgroup_size` is specified.
This allows transform-dialect-based strategies to set the subgroup size at
the same time as the workgroup size.
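
For example, a strategy can now request a subgroup size alongside the
workgroup and warp dimensions; this matches the updated test below, with the
`%func` handle name used purely for illustration:

    %func = transform.structured.match ops{["func.func"]} in %variant_op
      : (!transform.any_op) -> !transform.any_op
    transform.iree.map_nested_forall_to_gpu_threads %func
      workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] subgroup_size = 32
      : (!transform.any_op) -> ()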
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 5dc95f9..f101d90 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -109,10 +109,28 @@
   auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
   rewriter.startRootUpdate(exportOp);
   exportOp->setAttr(exportOp.getWorkgroupSizeAttrName(), newAttr);
+  if (std::optional<int64_t> subgroupSize = getSubgroupSize()) {
+    auto subgroupSizeAttr = rewriter.getIndexAttr(*subgroupSize);
+    exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
+  }
   rewriter.finalizeRootUpdate(exportOp);
   return DiagnosedSilenceableFailure::success();
 }
 
+void transform_dialect::MapNestedForallToGpuThreadsOp::build(
+    OpBuilder &builder, OperationState &state, Value target,
+    ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims) {
+  build(builder, state, {}, target, workgroupDims, warpDims, IntegerAttr());
+}
+
+void transform_dialect::MapNestedForallToGpuThreadsOp::build(
+    OpBuilder &builder, OperationState &state, Value target,
+    ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims,
+    int64_t subgroupSize) {
+  build(builder, state, {}, target, workgroupDims, warpDims,
+        builder.getI64IntegerAttr(subgroupSize));
+}
+
 void transform_dialect::MapNestedForallToGpuThreadsOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   transform::onlyReadsHandle(getTarget(), effects);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index ef673de..8956a6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -89,18 +89,29 @@
 
   let arguments = (ins TransformHandleTypeInterface:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
-                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims);
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+                   OptionalAttr<I64Attr>:$subgroup_size);
   let results = (outs);
 
   let assemblyFormat = [{
     $target
     `workgroup_dims` `=` $workgroup_dims
     (`warp_dims` `=` $warp_dims^)?
+    (`subgroup_size` `=` $subgroup_size^)?
     attr-dict
     `:` functional-type($target, results)
   }];
   let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
 
+  let builders = [
+    OpBuilder<(ins "Value":$target,
+                   "ArrayRef<int64_t>":$workgroup_dims,
+                   "ArrayRef<int64_t>":$warp_dims)>,
+    OpBuilder<(ins "Value":$target,
+                   "ArrayRef<int64_t>":$workgroup_dims,
+                   "ArrayRef<int64_t>":$warp_dims,
+                   "int64_t":$subgroupSize)>
+  ];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::transform::TransformRewriter &rewriter,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
index 1dd83ee..d29f891 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
@@ -1,5 +1,6 @@
 // RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter,transform-dialect-drop-schedule))" | FileCheck %s
 
+// CHECK: #[[$DIV32MOD8:.*]] = affine_map<()[s0] -> ((s0 floordiv 32) mod 8)>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
 #map = affine_map<()[s0] -> (s0 * 8)>
 #map1 = affine_map<(d0) -> (d0)>
@@ -7,6 +8,9 @@
 #translation = #iree_codegen.translation_info<TransformDialectCodegen>
 hal.executable private @distribute {
   hal.executable.variant public @cuda_nvptx_fb, target = #executable_target_cuda_nvptx_fb {
+// CHECK: hal.executable.export {{.*}} attributes
+// CHECK-SAME: subgroup_size = 32
+// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
     hal.executable.export public @distribute ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
       %c1 = arith.constant 1 : index
@@ -14,7 +18,6 @@
     }
     builtin.module {
 
-//       CHECK: #[[$DIV32MOD8:.*]] = affine_map<()[s0] -> ((s0 floordiv 32) mod 8)>
 // CHECK-LABEL: func.func @distribute
       func.func @distribute() {
         %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
@@ -49,7 +52,7 @@
         %17 = transform.structured.match ops{["func.func"]} in %variant_op 
           : (!transform.any_op) -> !transform.any_op
         transform.iree.map_nested_forall_to_gpu_threads %17 
-          workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] : (!transform.any_op) -> ()
+          workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
 
         // Late canonicalizations to cleanup and pass the checks.
         // Needs to occur on the whole variant to perform cse on the workgroup_count region