[Codegen][GPU] Change the location of barriers in forall fusion (#18542)

The way barriers are currently inserted for forall fusion is fragile
and (kind of) tries to model write-after-read ("WaR") conflicts on
tensors. Instead, we want to put the barrier around the whole fused
body of the producer scf.forall.

See this comment:
https://github.com/iree-org/iree/pull/18490#issuecomment-2346373879
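
For illustration, a schematic of the fused IR before and after this change
(types and indexing elided; names are illustrative, based on the updated
transform_fuse_forall.mlir tests below). Previously the barrier_region wrapped
the consumer-side extract_slice of the producer's result; now it wraps the
fused producer loop itself, taking the shared allocation as input and yielding
the fully written tensor:

    // Before: barrier_region wraps the consumer slice of the producer result.
    scf.forall (%idx, %idy) in (8, 8) shared_outs(%init = %empty) -> (tensor<128x128xf32>) {
      %loop = scf.for %i = %c0 to %c64 step %c64 iter_args(%iter = %alloc) -> (tensor<128x128xf32>) {
        // ... fused producer writes its slice of the shared tensor ...
        scf.yield %inserted : tensor<128x128xf32>
      }
      %slice = iree_gpu.barrier_region ins(%loop : tensor<128x128xf32>) {
      ^bb0(%intermediate: tensor<128x128xf32>):
        %s = tensor.extract_slice %intermediate[...] : tensor<128x128xf32> to tensor<16x16xf32>
        iree_gpu.yield %s : tensor<16x16xf32>
      } : tensor<16x16xf32>
      // ... consumer body uses %slice ...
    }

    // After: barrier_region wraps the whole fused producer body.
    scf.forall (%idx, %idy) in (8, 8) shared_outs(%init = %empty) -> (tensor<128x128xf32>) {
      %written = iree_gpu.barrier_region ins(%alloc : tensor<128x128xf32>) {
      ^bb0(%intermediate: tensor<128x128xf32>):
        %loop = scf.for %i = %c0 to %c64 step %c64 iter_args(%iter = %intermediate) -> (tensor<128x128xf32>) {
          // ... fused producer writes its slice of the shared tensor ...
          scf.yield %inserted : tensor<128x128xf32>
        }
        iree_gpu.yield %loop : tensor<128x128xf32>
      } : tensor<128x128xf32>
      %slice = tensor.extract_slice %written[...] : tensor<128x128xf32> to tensor<16x16xf32>
      // ... consumer body uses %slice ...
    }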
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
index c14530b..3ee4db9 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
@@ -200,8 +200,8 @@
                                      "extracted slice from the consumer loop");
   }
 
-  if (failed(GPU::fuseForallIntoSlice(rewriter, producer, consumer,
-                                      consumerChain))) {
+  if (failed(GPU::fuseForallIntoConsumer(rewriter, producer, consumer,
+                                         consumerChain))) {
     return mlir::emitDefiniteFailure(state.getTopLevel(),
                                      "failed to fuse forall ops");
   }
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir
index d1a3201..261f546 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_forall.mlir
@@ -47,26 +47,26 @@
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
+
+//       CHECK:     %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
+//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[INTERMEDIATE]]) -> (tensor<128x128xf32>)
+//       CHECK:         %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
+//       CHECK:         %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
+//       CHECK:         %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
+//       CHECK:         %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+//       CHECK:         %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+//       CHECK:         %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
+//       CHECK:         scf.yield %[[INSERT]]
+//       CHECK:       iree_gpu.yield %[[LOOP]]
+//       CHECK:     } : tensor<128x128xf32>
+
 //   CHECK-DAG:     %[[OUTID0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
 //   CHECK-DAG:     %[[OUTID1:.+]] = affine.apply #[[$MAP]](%[[IDY]])
-
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
-//       CHECK:       %[[LINEARID:.+]] = affine.apply #[[$MAP2]](%[[I]], %[[IDX]], %[[IDY]])
-//       CHECK:       %[[IDS:.+]]:2 = affine.delinearize_index %[[LINEARID]] into (%c1, %c64) : index, index
-//       CHECK:       %[[INID0:.+]] = affine.apply #[[$MAP3]](%[[IDS]]#1)
-//       CHECK:       %[[INSLICE0:.+]] = tensor.extract_slice %[[ARG0]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
-//       CHECK:       %[[INSLICE1:.+]] = tensor.extract_slice %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
-//       CHECK:       %[[COPY:.+]] = linalg.copy ins(%[[INSLICE0]] : tensor<2x128xf32>) outs(%[[INSLICE1]] : tensor<2x128xf32>) -> tensor<2x128xf32>
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[INID0]], %[[IDS]]#0] [2, 128] [1, 1]
-//       CHECK:       scf.yield %[[INSERT]]
-
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
-//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
-//       CHECK:       iree_gpu.yield %[[SLICE]]
-//       CHECK:     } : tensor<16x16xf32>
+//       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[BARRIER]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
 //       CHECK:     %[[OUTSLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
-//       CHECK:     %[[MM:.+]] = linalg.matmul ins(%[[SHUFFLE]], %[[SHUFFLE]] : tensor<16x16xf32>, tensor<16x16xf32>)
+//       CHECK:     %[[MM:.+]] = linalg.matmul ins(%[[SLICE]], %[[SLICE]] : tensor<16x16xf32>, tensor<16x16xf32>)
 //  CHECK-SAME:       outs(%[[OUTSLICE]] : tensor<16x16xf32>) -> tensor<16x16xf32>
 //       CHECK:     scf.forall.in_parallel {
 //       CHECK:       tensor.parallel_insert_slice %[[MM]] into %[[INIT]][%[[OUTID0]], %[[OUTID1]]] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
@@ -122,10 +122,10 @@
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
-//       CHECK:       } : tensor<16x16xf32>
+//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
+//       CHECK:       %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %{{.*}})
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
+//       CHECK:     } : tensor<128x128xf32>
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
 // -----
@@ -178,14 +178,14 @@
 //   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<128x128xf32>
 //   CHECK-DAG:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[ALLOC]])
-//       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
-//       CHECK:     %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
+//       CHECK:     %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
 //       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<128x128xf32>):
-//       CHECK:       %[[EXPAND:.+]] = tensor.expand_shape %[[INTERMEDIATE]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
-//       CHECK:       iree_gpu.yield %[[SLICE]]
-//       CHECK:       } : tensor<16x16xf32>
+//       CHECK:       %[[LOOP:.+]] = scf.for {{.*}} iter_args(%[[INIT:.+]] = %[[INTERMEDIATE]])
+//       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %{{.*}} into %[[INIT]]
+//       CHECK:       iree_gpu.yield %[[LOOP]]
+//       CHECK:     } : tensor<128x128xf32>
+//       CHECK:     %[[EXPAND:.+]] = tensor.expand_shape %[[BARRIER]] {{\[}}[0, 1], [2]{{\]}} output_shape [2, 64, 128]
+//       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[EXPAND]][0, %{{.*}}, %{{.*}}] [1, 16, 16] [1, 1, 1] : tensor<2x64x128xf32> to tensor<16x16xf32>
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
 // -----
@@ -245,16 +245,16 @@
 //       CHECK:   scf.forall (%[[W_IDX:.+]], %[[W_IDY:.+]]) in (2, 2) shared_outs(%[[INIT:.+]] = %[[EMPTY]]) -> (tensor<128x128xf32>) {
 //       CHECK:     scf.forall (%[[L_IDX:.+]], %[[L_IDY:.+]]) in (4, 4) {{.*}} -> (tensor<64x64xf32>)
 
-//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]]) -> (tensor<128x128xf32>)
+//       CHECK:       %[[BARRIER:.+]] = iree_gpu.barrier_region ins(%[[ALLOC]] : tensor<128x128xf32>)
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %c0 to %c64{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %{{.*}}) -> (tensor<128x128xf32>)
 //       CHECK:         %[[FLAT_ID:.+]] = affine.apply #[[$MAP4]](%[[I]], %[[L_IDY]], %[[L_IDX]], %[[W_IDX]], %[[W_IDY]])
 //       CHECK:         %[[IDS:.+]]:2 = affine.delinearize_index %[[FLAT_ID]] into (%c1, %c64) : index, index
 //       CHECK:         %[[IDX:.+]] = affine.apply #[[$MAP5]](%[[IDS]]#1)
 //       CHECK:         %[[COPY:.+]] = linalg.copy
 //       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[ITER]][%[[IDX]], %[[IDS]]#0] [2, 128]
 //       CHECK:         scf.yield %[[INSERT]]
+//       CHECK:       } : tensor<128x128xf32>
 
-//       CHECK:       %[[SHUFFLE:.+]] = iree_gpu.barrier_region ins(%[[LOOP]] : tensor<128x128xf32>)
-//       CHECK:       } : tensor<16x16xf32>
 //       CHECK:     } {mapping = [#iree_gpu.lane_id<1>, #iree_gpu.lane_id<0>]}
 //       CHECK:   } {mapping = [#gpu.warp<y>, #gpu.warp<x>]}
 
@@ -304,11 +304,11 @@
 
 //       CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
-//       CHECK:     %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
-//       CHECK:       %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
-//       CHECK:       scf.yield
-//       CHECK:     iree_gpu.barrier_region ins(%[[LOOP]]
+//       CHECK:     iree_gpu.barrier_region ins(%[[ALLOC]]
+//       CHECK:       %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
+//       CHECK:       scf.for %[[I:.+]] = %[[LINEARID]] to %c32{{.*}} step %c64{{.*}}
+//       CHECK:         %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%c32) : index
+//       CHECK:         scf.yield
 //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
 
 // -----
@@ -358,10 +358,10 @@
 
 //       CHECK:   %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<128x128xf32>
 //       CHECK:   scf.forall (%[[IDX:.+]], %[[IDY:.+]]) in (8, 8) {{.*}} -> (tensor<128x128xf32>) {
-//   CHECK-DAG:     %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
-//   CHECK-DAG:     %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
-//       CHECK:     %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}} iter_args(%[[ITER:.+]] = %[[ALLOC]])
-//       CHECK:       %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
-//       CHECK:       scf.yield
-//       CHECK:     iree_gpu.barrier_region ins(%[[LOOP]]
+//       CHECK:     iree_gpu.barrier_region ins(%[[ALLOC]]
+//   CHECK-DAG:       %[[LINEARID:.+]] = affine.apply #[[$MAP1]](%[[IDX]], %[[IDY]])
+//   CHECK-DAG:       %[[PRODCOUNT:.+]] = affine.apply #[[$MAP3]]()[%[[X]], %[[Y]], %[[Z]]]
+//       CHECK:       %[[LOOP:.+]] = scf.for %[[I:.+]] = %[[LINEARID]] to %[[PRODCOUNT]] step %c64{{.*}}
+//       CHECK:         %[[IDS:.+]] = affine.delinearize_index %[[I]] into (%[[Z]], %[[Y]], %[[X]]) : index
+//       CHECK:         scf.yield
 //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
index 07bd94b..d2a91e6 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
@@ -71,66 +71,3 @@
 //       CHECK:   %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
 //  CHECK-SAME:     : vector<4xf16>, vector<4xf16> into vector<4xf32>
 //       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
-
-// -----
-
-func.func @barrier_region(%init: tensor<6x6xf32>) -> tensor<3x2xf32> {
-  %0 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
-  ^bb0(%intermediate: tensor<6x6xf32>):
-    %slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
-    iree_gpu.yield %slice : tensor<3x2xf32>
-  } : tensor<3x2xf32>
-  return %0 : tensor<3x2xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @barrier_region
-//       CHECK:   %[[SHUFFLE:.+]] = iree_gpu.barrier_region
-//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
-//       CHECK:       %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
-//       CHECK:       iree_gpu.yield %[[READ]] : vector<3x2xf32>
-//       CHECK:   } : vector<3x2xf32>
-//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
-//       CHECK:   vector.transfer_write %[[SHUFFLE]], %[[EMPTY]]
-
-// -----
-
-func.func @multi_result_barrier_region(%init: tensor<6x6xf32>) -> (index, tensor<3x2xf32>) {
-  %0:2 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
-  ^bb0(%intermediate: tensor<6x6xf32>):
-    %slice = tensor.extract_slice %intermediate[0, 0] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
-    %c0 = arith.constant 0 : index
-    iree_gpu.yield %c0, %slice : index, tensor<3x2xf32>
-  } : index, tensor<3x2xf32>
-  return %0#0, %0#1 : index, tensor<3x2xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @multi_result_barrier_region
-//       CHECK:   %[[SHUFFLE:.+]]:2 = iree_gpu.barrier_region
-//       CHECK:     ^bb0(%[[INTERMEDIATE:.+]]: tensor<6x6xf32>):
-//       CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[INTERMEDIATE]][0, 0] [3, 2] [1, 1]
-//       CHECK:       %[[READ:.+]] = vector.transfer_read {{.*}} : tensor<3x2xf32>, vector<3x2xf32>
-//       CHECK:       iree_gpu.yield %c0, %[[READ]] : index, vector<3x2xf32>
-//       CHECK:   } : index, vector<3x2xf32>
-//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xf32>
-//       CHECK:   vector.transfer_write %[[SHUFFLE]]#1, %[[EMPTY]]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
index 01d0f42..17202ad 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
@@ -234,7 +234,7 @@
           YieldOpBufferizationInterface, IREE::GPU::YieldOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    return true;
+    return false;
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp
index df9736f..4c165d7 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/FuseAndHoistParallelLoops.cpp
@@ -75,8 +75,8 @@
       return failure();
     }
 
-    return fuseForallIntoSlice(rewriter, producerForall, sliceParent,
-                               consumerChain);
+    return fuseForallIntoConsumer(rewriter, producerForall, sliceParent,
+                                  consumerChain);
   }
 };
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
index db23682..672b43c 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
@@ -114,39 +114,11 @@
   return allocTensor.getResult();
 }
 
-static void replaceConsumerChain(RewriterBase &rewriter, Location loc,
-                                 Value source, Value replacement,
-                                 SmallVector<Operation *> consumerChain) {
-  auto extractSlice = cast<tensor::ExtractSliceOp>(consumerChain.back());
-  OpBuilder::InsertionGuard g(rewriter);
-
-  auto barrierRegionOp = rewriter.create<IREE::GPU::BarrierRegionOp>(
-      loc, extractSlice.getType(), replacement);
-  rewriter.setInsertionPointToStart(barrierRegionOp.getBody());
-  auto terminator =
-      rewriter.create<IREE::GPU::YieldOp>(loc, extractSlice.getResult());
-  for (auto consumer : consumerChain) {
-    rewriter.moveOpBefore(consumer, terminator);
-  }
-  (*consumerChain.begin())
-      ->replaceUsesOfWith(source, barrierRegionOp.getBody()->getArgument(0));
-  rewriter.replaceAllUsesExcept(extractSlice.getResult(),
-                                barrierRegionOp.getResult(0), terminator);
-}
-
-LogicalResult fuseForallIntoSlice(RewriterBase &rewriter,
-                                  scf::ForallOp producer,
-                                  scf::ForallOp consumer,
-                                  SmallVector<Operation *> consumerChain) {
-  if (consumerChain.empty()) {
-    return failure();
-  }
-
-  auto slice = dyn_cast<tensor::ExtractSliceOp>(consumerChain.back());
-  if (!slice) {
-    return failure();
-  }
-
+LogicalResult fuseForallIntoConsumer(RewriterBase &rewriter,
+                                     scf::ForallOp producer,
+                                     scf::ForallOp consumer,
+                                     SmallVector<Operation *> consumerChain) {
+  // TODO: Support multi-result producer loops.
   if (producer->getNumResults() != 1) {
     return failure();
   }
@@ -165,17 +137,33 @@
     return failure();
   }
 
-  rewriter.setInsertionPoint(slice);
-
   // Step 1. Get the destination of the producer loop as a shared memory
   // allocation.
-  FailureOr<Value> sharedDest =
-      createSharedAllocDestination(rewriter, producer);
-  if (failed(sharedDest)) {
+  rewriter.setInsertionPointToStart(consumer.getBody());
+  FailureOr<Value> maybeDest = createSharedAllocDestination(rewriter, producer);
+  if (failed(maybeDest)) {
     return failure();
   }
+  Value sharedDest = maybeDest.value();
 
-  // Step 2. Compute the producer IDs in terms of the consumer IDs.
+  // Step 2. Move the consumer chain to right before the last user in the
+  // chain.
+  if (!consumerChain.empty()) {
+    Operation *base = consumerChain.back();
+    for (Operation *op : consumerChain) {
+      if (op == base) {
+        continue;
+      }
+      rewriter.moveOpBefore(op, base);
+    }
+  }
+
+  // Step 3. Create the `iree_gpu.barrier_region` to wrap the fused producer.
+  auto barrierOp = rewriter.create<IREE::GPU::BarrierRegionOp>(
+      producer.getLoc(), sharedDest.getType(), sharedDest);
+  rewriter.setInsertionPointToStart(barrierOp.getBody());
+
+  // Step 4. Compute the producer IDs in terms of the consumer IDs.
   // The producer IDs are computed as follows:
   //
   // producer = [p0, ..., pn] ∈ [0, ..., 0] to [P0, ..., Pn]
@@ -235,7 +223,7 @@
       staticConsumerCount && staticProducerCount &&
       staticProducerCount.value() % staticConsumerCount.value() == 0;
 
-  // Step 3. Create the `scf.for` loop for the producer.
+  // Step 5. Create the `scf.for` loop for the producer.
   // If the consumer worker count perfectly divides the producer worker count,
   // then we can use a lower bound of 0 and keep the loop bounds static.
   Value lb = perfectlyDivides ? rewriter.create<arith::ConstantIndexOp>(loc, 0)
@@ -244,8 +232,8 @@
       getValueOrCreateConstantIndexOp(rewriter, loc, producerWorkerCount);
   Value step =
       getValueOrCreateConstantIndexOp(rewriter, loc, consumerWorkerCount);
-  auto newProducer =
-      rewriter.create<scf::ForOp>(loc, lb, ub, step, *sharedDest);
+  auto newProducer = rewriter.create<scf::ForOp>(
+      loc, lb, ub, step, barrierOp.getBody()->getArgument(0));
   Block *loopBody = newProducer.getBody();
 
   // Get the replacement IDs for the producer loop.
@@ -267,7 +255,7 @@
   newBlockArgs.append(newProducer.getRegionIterArgs().begin(),
                       newProducer.getRegionIterArgs().end());
 
-  // Step 4. Inline the region of the producer and replace the terminator.
+  // Step 6. Inline the region of the producer and replace the terminator.
   scf::InParallelOp terminator = producer.getTerminator();
   rewriter.mergeBlocks(producer.getBody(), loopBody, newBlockArgs);
 
@@ -287,13 +275,12 @@
   rewriter.eraseOp(parallelInsert);
   rewriter.eraseOp(terminator);
 
-  // Step 5. Replace the extract slice with a `barrier_region` op to indicate
-  // synchronization of the shared tensor.
-  rewriter.setInsertionPointAfter(newProducer);
-  replaceConsumerChain(rewriter, loc, producer.getResult(0),
-                       newProducer.getResult(0), consumerChain);
+  // Step 7. Yield the result of the loop from the barrier op and replace the
+  // producer.
+  rewriter.setInsertionPointToEnd(barrierOp.getBody());
+  rewriter.create<IREE::GPU::YieldOp>(loc, newProducer.getResults());
 
-  rewriter.eraseOp(producer);
+  rewriter.replaceOp(producer, barrierOp);
   return success();
 }
 
@@ -1086,100 +1073,8 @@
 };
 } // namespace
 
-static LogicalResult
-vectorizeStaticBarrierRegionResult(RewriterBase &rewriter,
-                                   IREE::GPU::BarrierRegionOp barrier) {
-  SmallVector<Type> newResultTypes(barrier->getResultTypes());
-  llvm::SmallBitVector vectorizationTargets(newResultTypes.size(), false);
-  for (auto [i, type] : llvm::enumerate(newResultTypes)) {
-    auto tensorResultType = dyn_cast<RankedTensorType>(type);
-    if (!tensorResultType || !tensorResultType.hasStaticShape()) {
-      continue;
-    }
-    vectorizationTargets[i] = true;
-    VectorType newResultType = VectorType::get(
-        tensorResultType.getShape(), tensorResultType.getElementType());
-    type = newResultType;
-  }
-
-  if (vectorizationTargets.none()) {
-    return failure();
-  }
-
-  auto newBarrier = rewriter.create<IREE::GPU::BarrierRegionOp>(
-      barrier.getLoc(), newResultTypes, barrier.getInputs());
-  auto currentTerminator =
-      cast<IREE::GPU::YieldOp>(barrier.getBody()->getTerminator());
-  rewriter.setInsertionPointToEnd(newBarrier.getBody());
-  rewriter.mergeBlocks(barrier.getBody(), newBarrier.getBody(),
-                       newBarrier.getBody()->getArguments());
-
-  // Create the tensor -> vector conversions within the body of the new op.
-  SmallVector<Value> newYields = currentTerminator.getOperands();
-  for (auto [i, val] : llvm::enumerate(newYields)) {
-    if (!vectorizationTargets[i]) {
-      continue;
-    }
-
-    auto resultType = cast<VectorType>(newResultTypes[i]);
-    auto paddingValue = rewriter.create<arith::ConstantOp>(
-        barrier.getLoc(), rewriter.getZeroAttr(resultType.getElementType()));
-
-    auto innerRead =
-        vector::createReadOrMaskedRead(rewriter, currentTerminator.getLoc(),
-                                       val, resultType.getShape(), paddingValue,
-                                       /*useInBoundsInsteadOfMasking=*/true);
-    val = innerRead;
-  }
-
-  rewriter.create<IREE::GPU::YieldOp>(currentTerminator->getLoc(), newYields);
-  rewriter.eraseOp(currentTerminator);
-
-  rewriter.setInsertionPointAfter(newBarrier);
-
-  // Create the writes back to tensor types.
-  SmallVector<Value> replacements = newBarrier.getResults();
-  for (auto [i, val] : llvm::enumerate(replacements)) {
-    if (!vectorizationTargets[i]) {
-      continue;
-    }
-
-    auto tensorResultType =
-        cast<RankedTensorType>(barrier->getResultTypes()[i]);
-    auto empty = rewriter.create<tensor::EmptyOp>(
-        barrier.getLoc(), tensorResultType.getShape(),
-        tensorResultType.getElementType());
-    int64_t rank = tensorResultType.getRank();
-    auto zero = rewriter.create<arith::ConstantIndexOp>(barrier.getLoc(), 0);
-    auto write = rewriter.create<vector::TransferWriteOp>(
-        barrier.getLoc(),
-        /*vector=*/val,
-        /*dest=*/empty,
-        /*indices=*/SmallVector<Value>(rank, zero),
-        /*inBounds=*/SmallVector<bool>(rank, true));
-    val = write->getResult(0);
-  }
-
-  rewriter.replaceOp(barrier, replacements);
-
-  return success();
-}
-
-namespace {
-struct VectorizeStaticBarrierRegionResultPattern
-    : public OpRewritePattern<IREE::GPU::BarrierRegionOp> {
-  using OpRewritePattern<IREE::GPU::BarrierRegionOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(IREE::GPU::BarrierRegionOp shuffle,
-                                PatternRewriter &rewriter) const override {
-    return vectorizeStaticBarrierRegionResult(rewriter, shuffle);
-  }
-};
-} // namespace
-
 void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns) {
   patterns.add<VectorizeStaticMultiMmaOpPattern>(patterns.getContext());
-  patterns.add<VectorizeStaticBarrierRegionResultPattern>(
-      patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
index 78ef32e..515ed84 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
@@ -35,19 +35,26 @@
 namespace mlir::iree_compiler::IREE::GPU {
 
 /// Function to fuse the given producer-consumer pair of forall loops into
-/// the single consumer loop at the given |slice| within the consumer of the
-/// producer. This is managed by inserting an `iree_gpu.barrier_region` at the
-/// boundary to synchronize the workers at the fusion point.
+/// the single consumer loop. This is managed by inserting an
+/// `iree_gpu.barrier_region` at the boundary to synchronize the workers at
+/// the fusion point.
+///
+/// Copy semantics of tensors means that having multiple threads (i.e. in an
+/// scf.forall) inserting into a tensor has unclear semantics without an op
+/// to separate contexts with different levels of parallelism. scf.forall
+/// does this through its terminator and `iree_gpu.barrier_region` does this
+/// by keeping code writing to shared memory in a distinct region. This allows
+/// us to always default to private memory when bufferizing.
 ///
 /// The mapping attributes of both the producer and consumer `scf.forall` ops
 /// must be in a relative descending order, for example:
 ///  [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>]
 /// or
 ///  [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]
-LogicalResult fuseForallIntoSlice(RewriterBase &rewriter,
-                                  scf::ForallOp producer,
-                                  scf::ForallOp consumer,
-                                  SmallVector<Operation *> consumerChain);
+LogicalResult fuseForallIntoConsumer(RewriterBase &rewriter,
+                                     scf::ForallOp producer,
+                                     scf::ForallOp consumer,
+                                     SmallVector<Operation *> consumerChain);
 
 // Helper to convert a contraction-like linalg op to an iree_gpu.multi_mma.
 FailureOr<IREE::GPU::MultiMmaOp>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp
index a816e43..02c9fd4 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp
@@ -28,7 +28,6 @@
   MLIRContext *context = &getContext();
   RewritePatternSet patterns(context);
   populateIREEGPUVectorizationPatterns(patterns);
-  populateIREEGPULowerBarrierRegionPatterns(patterns);
   if (failed(
           applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) {
     return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index ec6e710..d657985 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -406,6 +406,7 @@
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
   funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+  funcPassManager.addPass(IREE::GPU::createCombineBarrierRegionsPass());
 
   // Step 6. Lower special ops and vectorize.
   funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index faef890..6facd60 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -46,6 +46,7 @@
 //   CHECK-DAG:   memref.alloc() : memref<64x8xf16, #gpu.address_space<workgroup>>
 //   CHECK-DAG:   memref.alloc() : memref<64x8xf16, #gpu.address_space<workgroup>>
 //       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>)
+//       CHECK:     gpu.barrier
 //   CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16>
 //   CHECK-DAG:     vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]]
 //   CHECK-DAG:     %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2xf16>
@@ -53,7 +54,6 @@
 //       CHECK:     gpu.barrier
 //   CHECK-DAG:     %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<8x4xf16>
 //   CHECK-DAG:     %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf16>
-//       CHECK:     gpu.barrier
 //       CHECK:     %[[MM:.+]] = vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
 //       CHECK:     scf.yield %[[MM]]
 //       CHECK:   vector.transfer_write %[[LOOP]], %[[B2]]
@@ -102,16 +102,16 @@
 //   CHECK-DAG:   memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
 //   CHECK-DAG:   memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
 //       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
+//       CHECK:     gpu.barrier
 //   CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
 //   CHECK-DAG:     vector.transfer_write %[[LHS_RD]]
 //   CHECK-DAG:     %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
 //   CHECK-DAG:     vector.transfer_write %[[RHS_RD]]
 //       CHECK:     gpu.barrier
-//       CHECK:     %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
-//       CHECK:     %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
-//       CHECK:     gpu.barrier
-//   CHECK-DAG:     vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
-//   CHECK-DAG:     vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x4xf16>
+//   CHECK-DAG:     vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+//   CHECK-DAG:     vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<2x1x2x4xf16>
+//   CHECK-DAG:     vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
+//   CHECK-DAG:     vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16>
 // CHECK-COUNT-4:   amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
 //       CHECK:     scf.yield
 //       CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
@@ -184,15 +184,15 @@
 //     CHECK-DAG:   %[[C720:.+]] = arith.constant 720 : index
 //     CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //         CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x2x2x4x1xf32>)
+//         CHECK:     gpu.barrier
 //     CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
 //     CHECK-DAG:     vector.transfer_write %[[LHS_RD]]
 //     CHECK-DAG:     %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
 //     CHECK-DAG:     vector.transfer_write %[[RHS_RD]]
 //         CHECK:     gpu.barrier
-//         CHECK:     %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
-//         CHECK:     %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
-//         CHECK:     %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
-//         CHECK:     gpu.barrier
+//     CHECK-DAG:     %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
+//     CHECK-DAG:     %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
+//     CHECK-DAG:     %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
 //     CHECK-DAG:     vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
 //     CHECK-DAG:     vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
 // CHECK-COUNT-4:     amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
@@ -248,16 +248,16 @@
 //   CHECK-DAG:   memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
 //   CHECK-DAG:   memref.alloc() : memref<64x36xf16, #gpu.address_space<workgroup>>
 //       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x8x1x1xf32>)
+//       CHECK:     gpu.barrier
 //   CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2x8xf16>
 //   CHECK-DAG:     vector.transfer_write %[[LHS_RD]]
 //   CHECK-DAG:     %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<2x8xf16>
 //   CHECK-DAG:     vector.transfer_write %[[RHS_RD]]
 //       CHECK:     gpu.barrier
-//       CHECK:     %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16>
-//       CHECK:     %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x1x2x16xf16>
-//       CHECK:     gpu.barrier
-//   CHECK-DAG:     vector.transpose %[[LHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
-//   CHECK-DAG:     vector.transpose %[[RHS_MM]], [0, 2, 1, 3] : vector<2x1x2x16xf16>
+//   CHECK-DAG:     vector.transfer_read {{.*}} vector<2x1x2x16xf16>
+//   CHECK-DAG:     vector.transfer_read {{.*}} vector<2x1x2x16xf16>
+//   CHECK-DAG:     vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
+//   CHECK-DAG:     vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x16xf16>
 // CHECK-COUNT-8:   amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
 //       CHECK:     scf.yield
 //       CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 3, 1, 4] : vector<2x2x8x1x1xf32> to vector<2x8x1x2x1xf32>