[Codegen][LLVMGPU] Add option to control sync after map_nested_forall_to_gpu_threads (#16247)
map_nested_forall_to_gpu_threads needs to synchronize because the
number of threads being distributed may not be equal to the workgroup
size. This patch adds an option, `sync_after_distribution`, to disable
these barriers, which is useful when the number of threads equals the
workgroup size.
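For example, the barriers can now be skipped when distribution already
uses the full workgroup. A sketch mirroring the updated
attention_transform_spec.mlir test below (`%variant_op` stands in for a
handle to the surrounding variant op):

  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
  transform.iree.forall_to_workgroup %func : (!transform.any_op) -> ()
  // No trailing gpu.barrier is inserted after each distributed
  // scf.forall when sync_after_distribution = false.
  transform.iree.map_nested_forall_to_gpu_threads %func workgroup_dims = [4, 8, 4] subgroup_size = 32 sync_after_distribution = false : (!transform.any_op) -> ()

The attribute defaults to true, so existing specs that omit it keep the
previous behavior of inserting barriers after each scf.forall op.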
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 1307b85..3aa3299 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -97,7 +97,7 @@
DiagnosedSilenceableFailure diag =
mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, transformOp, target, getWorkgroupDims(), getSubgroupSize(),
- true);
+ getSyncAfterDistribution());
if (!diag.succeeded())
return diag;
auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index 5c3a268..3c8ffc3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -36,7 +36,8 @@
If necessary, scf.forall that do not use the whole thread range
result in predicated computations.
- Barriers are inserted after each scf.forall op for now.
+ Barriers are inserted after each scf.forall op
+ if `sync_after_distribution` is true.
Return modes:
=============
@@ -89,13 +90,15 @@
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
- DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size);
+ DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size,
+ DefaultValuedOptionalAttr<BoolAttr, "true">:$sync_after_distribution);
let results = (outs);
let assemblyFormat = [{
$target
`workgroup_dims` `=` $workgroup_dims
(`subgroup_size` `=` $subgroup_size^)?
+ (`sync_after_distribution` `=` $sync_after_distribution^)?
attr-dict
`:` functional-type($target, results)
}];
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
index 845adb6..ce450ba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -90,9 +90,6 @@
// CHECK-DAG: %[[D6:.+]] = gpu.thread_id y
// CHECK-DAG: %[[D7:.+]] = gpu.thread_id z
// CHECK-DAG: %[[D8:.+]] = affine.apply #[[MAP2]]()[%[[D5]], %[[D6]], %[[D7]]]
-// CHECK: gpu.barrier
-// CHECK: gpu.barrier
-// CHECK: gpu.barrier
// CHECK: %[[D9:.+]] = vector.transfer_read %[[ALLOC]][%[[C0]], %[[D8]], %[[C0]]], %[[CST_4]] {in_bounds = [true,
// CHECK-SAME: true]} : memref<1x128x64xf16, #[[GPU]].address_space<workgroup>>, vector<32x64xf16>
// CHECK: %[[D10:.+]] = arith.extf %[[D9]] : vector<32x64xf16> to vector<32x64xf32>
@@ -151,7 +148,6 @@
// CHECK: %[[D39:.+]] = vector.contract {indexing_maps = [#[[MAP4]], #[[MAP5]], #[[MAP6]]], iterator_types =
// CHECK-SAME: ["parallel", "parallel", "reduction"], kind = #[[VECTOR]].kind<add>} %[[D36]], %[[D37]], %[[D34]] :
// CHECK-SAME: vector<32x128xf32>, vector<64x128xf32> into vector<32x64xf32>
-// CHECK: gpu.barrier
// CHECK: scf.yield %[[D16]], %[[D24]], %[[D39]] : vector<32xf32>, vector<32xf32>, vector<32x64xf32>
// CHECK: }
// CHECK: %[[DSCALE1:.+]] = vector.broadcast %[[D11]]#1 : vector<32xf32> to vector<64x32xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
index 627b6d3..117e2c9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -128,7 +128,7 @@
// ===========================================================================
%func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 sync_after_distribution = false : (!transform.any_op) -> ()
transform.apply_patterns to %func_7 {
transform.apply_patterns.memref.fold_memref_alias_ops
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
index 49ee57c..fb56ef9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
@@ -84,7 +84,7 @@
// CHECK: transform.memref.erase_dead_alloc_and_stores {{.*}} : (!transform.any_op) -> ()
// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op
// CHECK: transform.iree.forall_to_workgroup {{.*}} : (!transform.any_op) -> ()
-// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] subgroup_size = 32 : (!transform.any_op) -> ()
+// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] subgroup_size = 32 sync_after_distribution = true : (!transform.any_op) -> ()
// CHECK: transform.apply_patterns.vector.lower_masks
// CHECK: transform.apply_patterns.vector.materialize_masks
// CHECK: apply_patterns to %{{.*}} {