[Transform] Add subgroup size setting when mapping to gpu threads (#14309)
For targets that support a configurable subgroup size, the requested size is
passed down to the runtime through a `subgroup_size` attribute on the
`hal.executable.export` op, similar to the way `workgroup_size` is specified.
This allows transform-dialect-based strategies to set the subgroup size at the
same time as the workgroup size.
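With this change a transform-dialect script can request a subgroup size at the
point where it maps `scf.forall` ops to GPU threads. A minimal sketch of the
new syntax, adapted from the updated test below (the handle name `%func` and
the surrounding script are illustrative):

  %func = transform.structured.match ops{["func.func"]} in %variant_op
      : (!transform.any_op) -> !transform.any_op
  transform.iree.map_nested_forall_to_gpu_threads %func
      workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] subgroup_size = 32
      : (!transform.any_op) -> ()

The attribute is optional: when present it is copied onto the enclosing
`hal.executable.export` op alongside `workgroup_size`; when omitted the op
behaves as before.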
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 5dc95f9..f101d90 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -109,10 +109,28 @@
auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
rewriter.startRootUpdate(exportOp);
exportOp->setAttr(exportOp.getWorkgroupSizeAttrName(), newAttr);
+ if (std::optional<int64_t> subgroupSize = getSubgroupSize()) {
+ auto subgroupSizeAttr = rewriter.getIndexAttr(*subgroupSize);
+ exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
+ }
rewriter.finalizeRootUpdate(exportOp);
return DiagnosedSilenceableFailure::success();
}

+void transform_dialect::MapNestedForallToGpuThreadsOp::build(
+ OpBuilder &builder, OperationState &state, Value target,
+ ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims) {
+ build(builder, state, {}, target, workgroupDims, warpDims, IntegerAttr());
+}
+
+void transform_dialect::MapNestedForallToGpuThreadsOp::build(
+ OpBuilder &builder, OperationState &state, Value target,
+ ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims,
+ int64_t subgroupSize) {
+ build(builder, state, {}, target, workgroupDims, warpDims,
+ builder.getI64IntegerAttr(subgroupSize));
+}
+
void transform_dialect::MapNestedForallToGpuThreadsOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
transform::onlyReadsHandle(getTarget(), effects);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index ef673de..8956a6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -89,18 +89,29 @@
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
- DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims);
+ DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+ OptionalAttr<I64Attr>:$subgroup_size);
let results = (outs);
let assemblyFormat = [{
$target
`workgroup_dims` `=` $workgroup_dims
(`warp_dims` `=` $warp_dims^)?
+ (`subgroup_size` `=` $subgroup_size^)?
attr-dict
`:` functional-type($target, results)
}];
let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
+ let builders = [
+ OpBuilder<(ins "Value":$target,
+ "ArrayRef<int64_t>":$workgroup_dims,
+ "ArrayRef<int64_t>":$warp_dims)>,
+ OpBuilder<(ins "Value":$target,
+ "ArrayRef<int64_t>":$workgroup_dims,
+ "ArrayRef<int64_t>":$warp_dims,
+ "int64_t":$subgroupSize)>
+ ];
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
index 1dd83ee..d29f891 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
@@ -1,5 +1,6 @@
// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter,transform-dialect-drop-schedule))" | FileCheck %s
+// CHECK: #[[$DIV32MOD8:.*]] = affine_map<()[s0] -> ((s0 floordiv 32) mod 8)>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
#map = affine_map<()[s0] -> (s0 * 8)>
#map1 = affine_map<(d0) -> (d0)>
@@ -7,6 +8,9 @@
#translation = #iree_codegen.translation_info<TransformDialectCodegen>
hal.executable private @distribute {
hal.executable.variant public @cuda_nvptx_fb, target = #executable_target_cuda_nvptx_fb {
+// CHECK: hal.executable.export {{.*}} attributes
+// CHECK-SAME: subgroup_size = 32
+// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
hal.executable.export public @distribute ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
@@ -14,7 +18,6 @@
}
builtin.module {
-// CHECK: #[[$DIV32MOD8:.*]] = affine_map<()[s0] -> ((s0 floordiv 32) mod 8)>
// CHECK-LABEL: func.func @distribute
func.func @distribute() {
%cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
@@ -49,7 +52,7 @@
%17 = transform.structured.match ops{["func.func"]} in %variant_op
: (!transform.any_op) -> !transform.any_op
transform.iree.map_nested_forall_to_gpu_threads %17
- workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] : (!transform.any_op) -> ()
+ workgroup_dims = [256, 1, 1] warp_dims = [8, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
// Late canonicalizations to cleanup and pass the checks.
// Needs to occur on the whole variant to perform cse on the workgroup_count region