[GPU] Add barriers when resolving GPUMappedForall to fix race condition (#19635)

The barriers added here can be pessimistic, and we can look into
optimizing them later if needed. However, without them we end up with a
race.
In some local testing on an MI300 GPU, I did not find any significant
performance impact from these barriers. For example, with TileAndFuse
with padding support, an unaligned matmul + elementwise took 47us with
the barriers and 48us without, while the corresponding default path
takes 68us. The prefill stage of ToyLLAMA took 325us with the barriers
and 324us without, while the default path takes 461us.

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
index 334427c..63bff76 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
@@ -127,7 +127,12 @@
   Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, totalLoopTripCount);
   Value step =
       rewriter.create<arith::ConstantIndexOp>(loc, flatTotalNumWorkers);
+  // We need to add barriers before and after the distributed loop because the
+  // loop might have reads/writes to shared memory that can have a different
+// layout compared to the rest of the program.
+  rewriter.create<gpu::BarrierOp>(loc);
   auto forLoop = rewriter.create<scf::ForOp>(loc, lb, ub, step, ValueRange{});
+  rewriter.create<gpu::BarrierOp>(loc);
   Block *loopBody = forLoop.getBody();
 
   // Get the replacement IDs for the forall iterator ids.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
index 32bda8c..cc6b575 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir
@@ -16,9 +16,11 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c1024 step %c128 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
 //       CHECK:     memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -38,9 +40,11 @@
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
 //       CHECK:   %[[WARPSPLIT:.+]]:2 = affine.delinearize_index %[[TFLAT]] into (4, 32)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c32 step %c4 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[WARPSPLIT]]#0]
 //       CHECK:     memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -76,7 +80,9 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[LINID:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   memref.store {{.*}}[%[[LINID]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -96,8 +102,10 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %[[TFLAT]] to %c1 step %c128 {
 //       CHECK:     memref.store {{.*}}[%[[I]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -117,8 +125,10 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %[[TFLAT]] to %[[C513]] step %c128 {
 //       CHECK:     memref.store {{.*}}[%[[I]]]
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -137,11 +147,12 @@
 //   CHECK-DAG:   %[[TX:.+]] = gpu.thread_id x
 //   CHECK-DAG:   %[[TY:.+]] = gpu.thread_id y
 //       CHECK:   %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+//       CHECK:   gpu.barrier
 //       CHECK:   scf.for %[[I:.+]] = %c0 to %c512 step %c128 {
 //       CHECK:     %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
 //       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (16, 8, 4) : index
 //       CHECK:     memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2]
-
+//       CHECK:   gpu.barrier
 
 // -----
 
@@ -157,5 +168,7 @@
 }
 
 // CHECK-LABEL: func @distribute_thread_forall_small_workgroup
-//   CHECK:   %[[TX:.+]] = gpu.thread_id x
-//   CHECK:   memref.store {{.*}}[%[[TX]]]
+//       CHECK:   %[[TX:.+]] = gpu.thread_id x
+//       CHECK:   gpu.barrier
+//       CHECK:   memref.store {{.*}}[%[[TX]]]
+//       CHECK:   gpu.barrier
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index f71add6..886b39b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -908,6 +908,7 @@
 // for loop.
 //       CHECK:       vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
 //  CHECK-NEXT:     }
+//  CHECK-NEXT:   gpu.barrier
 //  CHECK-NEXT:   } {mapping = [#iree_codegen.workgroup_mapping<x>]}
 //  CHECK-NEXT:   return