[GPU] Add barriers when resolving GPUMappedForall to fix race condition (#19635)

The barriers added here can be pessimistic, and we can look into
optimizing them later if needed. However, without them we end up with a
race.
In some local testing on an MI300 GPU, I did not find any significant
performance impact from these barriers. For example, with TileAndFuse
with padding support, an unaligned matmul + elementwise took 47us with
the barriers and 48us without, while the corresponding default path
takes 68us. The prefill stage of ToyLLAMA took 325us with the barriers
and 324us without, while the default path takes 461us.

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
index 334427c..63bff76 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
@@ -127,7 +127,12 @@
   Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, totalLoopTripCount);
   Value step =
       rewriter.create<arith::ConstantIndexOp>(loc, flatTotalNumWorkers);
+  // We need to add barriers before and after the distributed loop because the
+  // loop might have reads/writes to shared memory that can have a different
+// layout compared to the rest of the program.
+  rewriter.create<gpu::BarrierOp>(loc);
   auto forLoop = rewriter.create<scf::ForOp>(loc, lb, ub, step, ValueRange{});
+  rewriter.create<gpu::BarrierOp>(loc);
   Block *loopBody = forLoop.getBody();
 
   // Get the replacement IDs for the forall iterator ids.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
index 32bda8c..cc6b575 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
@@ -16,9 +16,11 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c1024 step %c128 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
 //       CHECK:     memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -38,9 +40,11 @@
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
 //       CHECK:   %[[WARPSPLIT:.+]]:2 = affine.delinearize_index %[[TFLAT]] into (4, 32)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c32 step %c4 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[WARPSPLIT]]#0]
 //       CHECK:     memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -76,7 +80,9 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[LINID:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -96,8 +102,10 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %[[TFLAT]] to %c1 step %c128 {
 //       CHECK:     memref.store {{.*}}[%[[I]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -117,8 +125,10 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %[[TFLAT]] to %[[C513]] step %c128 {
 //       CHECK:     memref.store {{.*}}[%[[I]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -137,11 +147,12 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c512 step %c128 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
 //       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (16, 8, 4) : index
 //       CHECK:     memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2]
-
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -157,5 +168,7 @@
 }
 
 // CHECK-LABEL: func @distribute_thread_forall_small_workgroup
-//   CHECK:   %[[TX:.+]] = gpu.thread_id x
-//   CHECK:   memref.store {{.*}}[%[[TX]]]
+//       CHECK:   %[[TX:.+]] = gpu.thread_id x
+//       CHECK:   gpu.barrier
+//       CHECK:   memref.store {{.*}}[%[[TX]]]
+//       CHECK:   gpu.barrier
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index f71add6..886b39b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -908,6 +908,7 @@
 // for loop.
 //       CHECK:       vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
 //  CHECK-NEXT:     }
+//  CHECK-NEXT:   gpu.barrier
 //  CHECK-NEXT:   } {mapping = [#iree_codegen.workgroup_mapping<x>]}
 //  CHECK-NEXT:   return