[Codegen][GPU] Add support for bufferizing iree_gpu.barrier_region (#18497)
This adds direct support for bufferizing iree_gpu.barrier_region, so the op
can be handled during bufferization rather than requiring it to be
decomposed before lowering.
For now this simply bufferizes to a pair of barriers at the beginning and the
end of the inlined region. In the future we could opt to either keep the
region, allowing for some additional analysis, or drop the barriers in certain
cases. Those options are left as TODOs and this is kept simple for now.
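As a rough sketch (shapes borrowed from the tests below; the exact allocation
is whatever bufferization chooses), a barrier_region like

    %0 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
    ^bb0(%intermediate: tensor<6x6xf32>):
      ...
      iree_gpu.yield %read : vector<3x2xf32>
    } : vector<3x2xf32>

bufferizes to its body inlined between the two barriers:

    %alloc = memref.alloc() : memref<6x6xf32>
    gpu.barrier
    // ... region body, now reading from %alloc ...
    gpu.barrier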
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index 9b8d6b8..42dd373 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -2797,3 +2797,72 @@
// CHECK-NEXT: gpu.barrier
// CHECK: vector.transfer_write %{{.*}}, %[[ALLOC0]]
// CHECK-NEXT: vector.transfer_read %[[ALLOC1]]
+
+// -----
+
+func.func @barrier_region(%x: index, %y: index) -> vector<3x2xf32> {
+ %cst0 = arith.constant 0.0 : f32
+ %c0 = arith.constant 0 : index
+ %init = bufferization.alloc_tensor() : tensor<6x6xf32>
+ %0 = iree_gpu.barrier_region ins(%init : tensor<6x6xf32>) {
+ ^bb0(%intermediate: tensor<6x6xf32>):
+ %slice = tensor.extract_slice %intermediate[%x, %y] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
+ %read = vector.transfer_read %slice[%c0, %c0], %cst0 {in_bounds = [true, true]} : tensor<3x2xf32>, vector<3x2xf32>
+ iree_gpu.yield %read : vector<3x2xf32>
+ } : vector<3x2xf32>
+ return %0 : vector<3x2xf32>
+}
+
+// CHECK-LABEL: func @barrier_region
+// CHECK: %[[ALLOC:.+]] = memref.alloc()
+// CHECK: gpu.barrier
+// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]]
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[SUBVIEW]]
+// CHECK: gpu.barrier
+// CHECK: return %[[READ]]
+
+// -----
+
+func.func @barrier_region_tensor_result(%x: index) -> vector<3xf32> {
+ %cst0 = arith.constant 0.0 : f32
+ %c0 = arith.constant 0 : index
+ %init = bufferization.alloc_tensor() : tensor<6xf32>
+ %0 = iree_gpu.barrier_region ins(%init : tensor<6xf32>) {
+ ^bb0(%intermediate: tensor<6xf32>):
+ %slice = tensor.extract_slice %intermediate[%x] [3] [1] : tensor<6xf32> to tensor<3xf32>
+ iree_gpu.yield %slice : tensor<3xf32>
+ } : tensor<3xf32>
+ %read = vector.transfer_read %0[%c0], %cst0 {in_bounds = [true]} : tensor<3xf32>, vector<3xf32>
+ return %read : vector<3xf32>
+}
+
+// CHECK-LABEL: func @barrier_region_tensor_result
+// CHECK: %[[ALLOC:.+]] = memref.alloc()
+// CHECK: gpu.barrier
+// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]]
+// CHECK: gpu.barrier
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[SUBVIEW]]
+// CHECK: return %[[READ]]
+
+// -----
+
+func.func @barrier_region_in_place() -> vector<2x3xf32> {
+ %cst0 = arith.constant 0.0 : f32
+ %c0 = arith.constant 0 : index
+ %init = bufferization.alloc_tensor() : tensor<6xf32>
+ %0 = iree_gpu.barrier_region ins(%init : tensor<6xf32>) {
+ ^bb0(%intermediate: tensor<6xf32>):
+ %slice = tensor.expand_shape %intermediate [[0, 1]] output_shape [2, 3] : tensor<6xf32> into tensor<2x3xf32>
+ iree_gpu.yield %slice : tensor<2x3xf32>
+ } : tensor<2x3xf32>
+ %read = vector.transfer_read %0[%c0, %c0], %cst0 {in_bounds = [true, true]} : tensor<2x3xf32>, vector<2x3xf32>
+ return %read : vector<2x3xf32>
+}
+
+// CHECK-LABEL: func @barrier_region_in_place
+// CHECK: %[[ALLOC:.+]] = memref.alloc()
+// CHECK: gpu.barrier
+// CHECK: %[[EXPAND:.+]] = memref.expand_shape %[[ALLOC]]
+// CHECK: gpu.barrier
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[EXPAND]]
+// CHECK: return %[[READ]]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
index 03208fe..01d0f42 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp
@@ -9,8 +9,11 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/Transforms/Transforms.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
using mlir::bufferization::AnalysisState;
using mlir::bufferization::BufferizableOpInterface;
@@ -22,6 +25,142 @@
namespace {
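+/// Returns the buffer for each operand in `operands`, passing non-tensor
+/// operands through unchanged.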
+static FailureOr<SmallVector<Value>>
+getBuffers(RewriterBase &rewriter, const MutableOperandRange &operands,
+ const BufferizationOptions &options) {
+ SmallVector<Value> result;
+ for (OpOperand &opOperand : operands) {
+ if (isa<TensorType>(opOperand.get().getType())) {
+ FailureOr<Value> resultBuffer =
+ getBuffer(rewriter, opOperand.get(), options);
+ if (failed(resultBuffer))
+ return failure();
+ result.push_back(*resultBuffer);
+ } else {
+ result.push_back(opOperand.get());
+ }
+ }
+ return result;
+}
+
+/// Bufferization of iree_gpu.barrier_region. Always bufferizes in place; the
+/// region is inlined and bracketed by barriers.
+struct BarrierRegionOpBufferizationInterface
+ : public BufferizableOpInterface::ExternalModel<
+ BarrierRegionOpBufferizationInterface, IREE::GPU::BarrierRegionOp> {
+ bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ // This op itself never needs to bufferize to a copy. It's possible
+ // that operations within its body will need to bufferize to a copy,
+ // but those copies should happen between the two barriers.
+ return false;
+ }
+
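+ // As above, any writes happen within the region and are attributed to the
+ // ops there rather than to the barrier_region op itself.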
+ bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ return false;
+ }
+
+ bool isWritable(Operation *op, Value value,
+ const AnalysisState &state) const {
+ return true;
+ }
+
+ // The op's operands alias with the corresponding block arguments of the
+ // region.
+ bufferization::AliasingValueList
+ getAliasingValues(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ SmallVector<bufferization::AliasingValue> alist;
+ auto barrierOp = cast<IREE::GPU::BarrierRegionOp>(op);
+ alist.push_back(
+ {barrierOp.getBody()->getArguments()[opOperand.getOperandNumber()],
+ BufferRelation::Equivalent, /*isDefinite=*/true});
+ return alist;
+ }
+
+ bufferization::AliasingOpOperandList
+ getAliasingOpOperands(Operation *op, Value value,
+ const AnalysisState &state) const {
+ auto barrierOp = cast<IREE::GPU::BarrierRegionOp>(op);
+ bufferization::AliasingOpOperandList result;
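+ // An OpResult aliases the corresponding yielded value; a block argument
+ // aliases the corresponding op operand.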
+ if (auto opResult = dyn_cast<OpResult>(value)) {
+ int64_t resultNum = opResult.getResultNumber();
+ auto yieldOp =
+ cast<IREE::GPU::YieldOp>(barrierOp.getBody()->getTerminator());
+ result.addAlias(bufferization::AliasingOpOperand(
+ &yieldOp->getOpOperand(resultNum), BufferRelation::Equivalent,
+ /*isDefinite=*/true));
+ } else if (auto blockArg = dyn_cast<BlockArgument>(value)) {
+ result.addAlias(bufferization::AliasingOpOperand(
+ &barrierOp->getOpOperand(blockArg.getArgNumber()),
+ BufferRelation::Equivalent,
+ /*isDefinite=*/true));
+ }
+ return result;
+ }
+
+ FailureOr<BaseMemRefType>
+ getBufferType(Operation *op, Value value, const BufferizationOptions &options,
+ SmallVector<Value> &invocationStack) const {
+ auto barrierOp = cast<IREE::GPU::BarrierRegionOp>(op);
+
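+ // A result takes the buffer type of the corresponding yielded value; a
+ // block argument takes the buffer type of the corresponding operand.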
+ FailureOr<BaseMemRefType> memrefType = failure();
+ if (auto opResult = dyn_cast<OpResult>(value)) {
+ int64_t resultNum = opResult.getResultNumber();
+ memrefType = bufferization::getBufferType(
+ barrierOp.getBody()->getTerminator()->getOperand(resultNum), options,
+ invocationStack);
+ } else if (auto blockArg = dyn_cast<BlockArgument>(value)) {
+ int64_t argNum = blockArg.getArgNumber();
+ memrefType = bufferization::getBufferType(barrierOp.getOperand(argNum),
+ options, invocationStack);
+ }
+ if (failed(memrefType))
+ return failure();
+ return memrefType;
+ }
+
+ LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+ const BufferizationOptions &options) const {
+ auto barrierOp = cast<IREE::GPU::BarrierRegionOp>(op);
+ auto terminator =
+ cast<IREE::GPU::YieldOp>(barrierOp.getBody()->getTerminator());
+
+ FailureOr<SmallVector<Value>> newOperands =
+ getBuffers(rewriter, barrierOp.getInputsMutable(), options);
+ FailureOr<SmallVector<Value>> newResults =
+ getBuffers(rewriter, terminator.getValuesMutable(), options);
+ if (failed(newOperands) || failed(newResults)) {
+ return failure();
+ }
+
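+ // Wrap the new buffers in to_tensor ops so they can substitute the
+ // region's tensor block arguments when the body is inlined; these resolve
+ // as the remaining ops in the body are bufferized.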
+ SmallVector<Value> tensorizedOperands;
+ for (auto [type, replacement] :
+ llvm::zip_equal(barrierOp.getOperandTypes(), *newOperands)) {
+ if (!isa<RankedTensorType>(type)) {
+ tensorizedOperands.push_back(replacement);
+ continue;
+ }
+ tensorizedOperands.push_back(rewriter
+ .create<bufferization::ToTensorOp>(
+ replacement.getLoc(), replacement)
+ .getResult());
+ }
+
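+ // Bracket the op with barriers and inline the region body between them.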
+ rewriter.setInsertionPoint(barrierOp);
+ rewriter.create<gpu::BarrierOp>(barrierOp.getLoc());
+ rewriter.setInsertionPointAfter(barrierOp);
+ auto afterBarrier = rewriter.create<gpu::BarrierOp>(barrierOp.getLoc());
+
+ rewriter.inlineBlockBefore(barrierOp.getBody(), afterBarrier,
+ tensorizedOperands);
+
+ bufferization::replaceOpWithBufferizedValues(rewriter, op, *newResults);
+ rewriter.eraseOp(terminator);
+ return success();
+ }
+};
+
/// Bufferization of iree_gpu.value_barrier. Always just bufferizes in place
/// and replaces with a barrier.
struct ValueBarrierOpBufferizationInterface
@@ -88,13 +227,71 @@
}
};
+/// Bufferization of iree_gpu.yield. Bufferized as part of its enclosing op,
+/// so this is for analysis only.
+struct YieldOpBufferizationInterface
+ : public BufferizableOpInterface::ExternalModel<
+ YieldOpBufferizationInterface, IREE::GPU::YieldOp> {
+ bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ return true;
+ }
+
+ bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ return false;
+ }
+
+ bufferization::AliasingValueList
+ getAliasingValues(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
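+ // Each yielded value aliases the corresponding result of the parent
+ // barrier_region op.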
+ assert(isa<IREE::GPU::BarrierRegionOp>(op->getParentOp()));
+ return {{op->getParentOp()->getResult(opOperand.getOperandNumber()),
+ BufferRelation::Equivalent}};
+ }
+
+ bool mustBufferizeInPlace(Operation *op, OpOperand &opOperand,
+ const AnalysisState &state) const {
+ // Yield operands always bufferize in place. Otherwise, an alloc + copy
+ // may be generated inside the block. We should avoid yielding allocations
+ // whenever possible.
+ return true;
+ }
+
+ LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
+ const BufferizationOptions &options) const {
+ auto yieldOp = cast<IREE::GPU::YieldOp>(op);
+
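+ // Replace tensor operands with their buffers; other operands pass through.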
+ SmallVector<Value> newResults;
+ for (const auto &it : llvm::enumerate(yieldOp.getValues())) {
+ Value value = it.value();
+ if (isa<TensorType>(value.getType())) {
+ FailureOr<Value> maybeBuffer = getBuffer(rewriter, value, options);
+ if (failed(maybeBuffer))
+ return failure();
+ newResults.push_back(*maybeBuffer);
+ } else {
+ newResults.push_back(value);
+ }
+ }
+
+ bufferization::replaceOpWithNewBufferizedOp<IREE::GPU::YieldOp>(
+ rewriter, op, newResults);
+ return success();
+ }
+};
+
} // namespace
void registerIREEGPUBufferizationInterfaces(DialectRegistry &registry) {
registry.addExtension(
+[](MLIRContext *context, IREE::GPU::IREEGPUDialect *dialect) {
+ IREE::GPU::BarrierRegionOp::attachInterface<
+ BarrierRegionOpBufferizationInterface>(*context);
IREE::GPU::ValueBarrierOp::attachInterface<
ValueBarrierOpBufferizationInterface>(*context);
+ IREE::GPU::YieldOp::attachInterface<YieldOpBufferizationInterface>(
+ *context);
});
}