[Codegen] Add missing barrier to GPUVectorAlloc (#16551)
We need another barrier before the write to shared memory to wait for
reads of the same memory from the previous iteration of the loop.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
index 913668d..b24a8e9 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
@@ -115,6 +115,14 @@
});
for (vector::ContractionOp contractOp : opsToPromote) {
OpBuilder builder(contractOp);
+
+ // HACK: Until proper barrier placement is handled later, we have to
+ // synchronize explicitly in this pass.
+
+ // Synchronize before the write to shared memory to avoid stepping over
+ // reads in the previous iteration of a loop.
+ builder.create<gpu::BarrierOp>(contractOp->getLoc());
+
// Promote both of the input operands, excluding the accumulator.
OpOperand &lhs = contractOp.getLhsMutable();
FailureOr<Value> lhsRet =
@@ -130,8 +138,7 @@
return signalPassFailure();
}
- // HACK: Until proper barrier placement is handled later we have to
- // synchronize here.
+ // Synchronize after the write to shared memory before we read from it.
builder.create<gpu::BarrierOp>(contractOp->getLoc());
Value lhsVec =
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
index f2c9873..5d005b8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
@@ -3,11 +3,11 @@
func.func @matmul_256x256x256(%lhs: tensor<16x256xf16>,
%rhs: tensor<256x16xf16>,
%out: tensor<16x16xf32>) -> tensor<16x16xf32> {
- %cst = arith.constant 0.000000e+00 : f16
+ %cst = arith.constant 0.000000e+00 : f16
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
- %c32 = arith.constant 32 : index
- %c256 = arith.constant 256 : index
- %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c256 = arith.constant 256 : index
+ %c0 = arith.constant 0 : index
%8 = scf.for %arg0 = %c0 to %c256 step %c32 iter_args(%arg1 = %cst_0) -> (vector<16x16xf32>) {
%10 = vector.transfer_read %lhs[%c0, %arg0], %cst {in_bounds = [true, true]} : tensor<16x256xf16>, vector<16x32xf16>
%11 = vector.transfer_read %rhs[%arg0, %c0], %cst {in_bounds = [true, true]} : tensor<256x16xf16>, vector<32x16xf16>
@@ -23,6 +23,7 @@
// CHECK: scf.for {{.*}} -> (vector<16x16xf32>) {
// CHECK-DAG: %[[A:.*]] = vector.transfer_read %{{.*}} : tensor<16x256xf16>, vector<16x32xf16>
// CHECK-DAG: %[[B:.*]] = vector.transfer_read %{{.*}} : tensor<256x16xf16>, vector<32x16xf16>
+// CHECK: gpu.barrier
// LHS copy.
// CHECK: %[[PA:.*]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x32xf16, #gpu.address_space<workgroup>>