Make barrier elimination more aggressive (#14293)

Contrary to the conceptual description of the barrier elimination
technique in the paper by Moses et al., barrier elimination in IREE is
performed by the greedy rewriter. Thus a redundant barrier is erased
immediately, and the updated IR is then considered for the next barrier.
Therefore, it is sufficient to consider the effects before and after
each barrier until hitting the next barrier, rather than pairwise
extending those until the parallel region boundary.

Specifically, the case that wouldn't be handled correctly with non-eager
rewriting,

```mlir
  store %A
  barrier  // useless because no effects after
  // nothing
  barrier  // useless because no effects before
  load %A
```

is handled correctly in IREE, which removes one of the barriers and sees
the other as required when re-analyzing the eagerly rewritten IR.

This change lets us eliminate more barriers.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 2ea8a69..04a50eb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -1424,36 +1424,17 @@
     LLVM_DEBUG(DBGS() << "checking the necessity of: " << barrier << " "
                       << barrier.getLoc() << "\n");
 
-    {
-      LLVM_DEBUG(DBGS() << "with respect to the barrier(s) before\n");
-      SmallVector<MemoryEffects::EffectInstance> beforeEffects;
-      getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true);
+    SmallVector<MemoryEffects::EffectInstance> beforeEffects;
+    getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true);
 
-      SmallVector<MemoryEffects::EffectInstance> afterEffects;
-      getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/false);
+    SmallVector<MemoryEffects::EffectInstance> afterEffects;
+    getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/true);
 
-      if (!haveConflictingEffects(beforeEffects, afterEffects)) {
-        LLVM_DEBUG(DBGS() << "the barrier(s) before is sufficient, removing "
-                          << barrier << "\n");
-        rewriter.eraseOp(barrier);
-        return success();
-      }
-    }
-
-    {
-      LLVM_DEBUG(DBGS() << "with respect to the barrier(s) after\n");
-      SmallVector<MemoryEffects::EffectInstance> beforeEffects;
-      getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier*/ false);
-
-      SmallVector<MemoryEffects::EffectInstance> afterEffects;
-      getEffectsAfter(barrier, afterEffects, /*stopAtBarrier*/ true);
-
-      if (!haveConflictingEffects(beforeEffects, afterEffects)) {
-        LLVM_DEBUG(DBGS() << "the barrier(s) after is sufficient, removing "
-                          << barrier << "\n");
-        rewriter.eraseOp(barrier);
-        return success();
-      }
+    if (!haveConflictingEffects(beforeEffects, afterEffects)) {
+      LLVM_DEBUG(DBGS() << "the surrounding barriers are sufficient, removing "
+                        << barrier << "\n");
+      rewriter.eraseOp(barrier);
+      return success();
     }
 
     LLVM_DEBUG(DBGS() << "barrier is necessary: " << barrier << " "
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
index 2fbbfae..51ce0c9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
@@ -178,3 +178,67 @@
   %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
   transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
 }
+
+// -----
+
+// CHECK-LABEL: @repeated_barrier
+func.func @repeated_barrier(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  %0 = memref.load %arg0[%arg1] : memref<?xf32>
+  // CHECK: gpu.barrier
+  gpu.barrier
+  // CHECK-NOT: gpu.barrier
+  gpu.barrier
+  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
+  return %0 : f32
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op):
+  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+}
+
+// -----
+
+// CHECK-LABEL: @symmetric_stop
+func.func @symmetric_stop(%val: f32) -> (f32, f32, f32, f32, f32)
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: %[[A:.+]] = memref.alloc
+  // CHECK: %[[B:.+]] = memref.alloc
+  // CHECK: %[[C:.+]] = memref.alloc
+  %A = memref.alloc() : memref<f32>
+  %B = memref.alloc() : memref<f32>
+  %C = memref.alloc() : memref<f32>
+  // CHECK: memref.store %{{.*}}, %[[A]]
+  memref.store %val, %A[] : memref<f32>
+  // CHECK: gpu.barrier
+  gpu.barrier
+  // CHECK: memref.load %[[A]]
+  %0 = memref.load %A[] : memref<f32>
+  // CHECK: memref.store %{{.*}}, %[[B]]
+  memref.store %val, %B[] : memref<f32>
+  // This barrier is eliminated because the surrounding barriers are sufficient
+  // to guard write/read on all memrefs.
+  // CHECK-NOT: gpu.barrier
+  gpu.barrier
+  // CHECK: memref.load %[[A]]
+  %1 = memref.load %A[] : memref<f32>
+  // CHECK: memref.store %{{.*}}, %[[C]]
+  memref.store %val, %C[] : memref<f32>
+  // CHECK: gpu.barrier
+  gpu.barrier
+  // CHECK: memref.load %[[A]]
+  // CHECK: memref.load %[[B]]
+  // CHECK: memref.load %[[C]]
+  %2 = memref.load %A[] : memref<f32>
+  %3 = memref.load %B[] : memref<f32>
+  %4 = memref.load %C[] : memref<f32>
+  return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op):
+  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+}