[Codegen][LLVMGPU] Add option to control sync after map_nested_forall_to_gpu_threads (#16247)
map_nested_forall_to_gpu_threads needs to synchronize because the
number of threads being distributed may not be equal to the workgroup
size. This patch adds an option, `sync_after_distribution`, to disable
these barriers, which is useful when the number of threads equals the
workgroup size.
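For example, the barriers can now be skipped when distribution already
uses the full workgroup. A sketch mirroring the updated
attention_transform_spec.mlir test below (`%variant_op` stands in for a
handle to the surrounding variant op):

  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
  transform.iree.forall_to_workgroup %func : (!transform.any_op) -> ()
  // No trailing gpu.barrier is inserted after each distributed
  // scf.forall when sync_after_distribution = false.
  transform.iree.map_nested_forall_to_gpu_threads %func workgroup_dims = [4, 8, 4] subgroup_size = 32 sync_after_distribution = false : (!transform.any_op) -> ()

The attribute defaults to true, so existing specs that omit it keep the
previous behavior of inserting barriers after each scf.forall op.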
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 1307b85..3aa3299 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -97,7 +97,7 @@
DiagnosedSilenceableFailure diag =
mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, transformOp, target, getWorkgroupDims(), getSubgroupSize(),
- true);
+ getSyncAfterDistribution());
if (!diag.succeeded())
return diag;
auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index 5c3a268..3c8ffc3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -36,7 +36,8 @@
If necessary, scf.forall that do not use the whole thread range
result in predicated computations.
- Barriers are inserted after each scf.forall op for now.
+ Barriers are inserted after each scf.forall op
+ if `sync_after_distribution` is true.
Return modes:
=============
@@ -89,13 +90,15 @@
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
- DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size);
+ DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size,
+ DefaultValuedOptionalAttr<BoolAttr, "true">:$sync_after_distribution);
let results = (outs);
let assemblyFormat = [{
$target
`workgroup_dims` `=` $workgroup_dims
(`subgroup_size` `=` $subgroup_size^)?
+ (`sync_after_distribution` `=` $sync_after_distribution^)?
attr-dict
`:` functional-type($target, results)
}];
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
index 845adb6..ce450ba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -90,9 +90,6 @@
// CHECK-DAG: %[[D6:.+]] = gpu.thread_id y
// CHECK-DAG: %[[D7:.+]] = gpu.thread_id z
// CHECK-DAG: %[[D8:.+]] = affine.apply #[[MAP2]]()[%[[D5]], %[[D6]], %[[D7]]]
-// CHECK: gpu.barrier
-// CHECK: gpu.barrier
-// CHECK: gpu.barrier
// CHECK: %[[D9:.+]] = vector.transfer_read %[[ALLOC]][%[[C0]], %[[D8]], %[[C0]]], %[[CST_4]] {in_bounds = [true,
// CHECK-SAME: true]} : memref<1x128x64xf16, #[[GPU]].address_space<workgroup>>, vector<32x64xf16>
// CHECK: %[[D10:.+]] = arith.extf %[[D9]] : vector<32x64xf16> to vector<32x64xf32>
@@ -151,7 +148,6 @@
// CHECK: %[[D39:.+]] = vector.contract {indexing_maps = [#[[MAP4]], #[[MAP5]], #[[MAP6]]], iterator_types =
// CHECK-SAME: ["parallel", "parallel", "reduction"], kind = #[[VECTOR]].kind<add>} %[[D36]], %[[D37]], %[[D34]] :
// CHECK-SAME: vector<32x128xf32>, vector<64x128xf32> into vector<32x64xf32>
-// CHECK: gpu.barrier
// CHECK: scf.yield %[[D16]], %[[D24]], %[[D39]] : vector<32xf32>, vector<32xf32>, vector<32x64xf32>
// CHECK: }
// CHECK: %[[DSCALE1:.+]] = vector.broadcast %[[D11]]#1 : vector<32xf32> to vector<64x32xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
index 627b6d3..117e2c9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -128,7 +128,7 @@
// ===========================================================================
%func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 sync_after_distribution = false : (!transform.any_op) -> ()
transform.apply_patterns to %func_7 {
transform.apply_patterns.memref.fold_memref_alias_ops
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
index 49ee57c..fb56ef9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
@@ -84,7 +84,7 @@
// CHECK: transform.memref.erase_dead_alloc_and_stores {{.*}} : (!transform.any_op) -> ()
// CHECK: {{.*}} = transform.structured.match ops{["func.func"]} in {{.*}} : (!transform.any_op) -> !transform.any_op
// CHECK: transform.iree.forall_to_workgroup {{.*}} : (!transform.any_op) -> ()
-// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] subgroup_size = 32 : (!transform.any_op) -> ()
+// CHECK: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [16, 16, 1] subgroup_size = 32 sync_after_distribution = true : (!transform.any_op) -> ()
// CHECK: transform.apply_patterns.vector.lower_masks
// CHECK: transform.apply_patterns.vector.materialize_masks
// CHECK: apply_patterns to %{{.*}} {