[Codegen] Add missing barrier to GPUVectorAlloc (#16551)

We need another barrier before the write to shared memory so that we
wait for reads of the same memory issued in the previous iteration of
a loop.
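The hazard, in rough pseudo-IR of the promoted loop body (a simplified
sketch based on gpu_vector_alloc.mlir below, not the pass's exact
output):

    scf.for %i = ... {
      %a = vector.transfer_read ...  // read this iteration's input tiles
      gpu.barrier                    // NEW: reads of the shared tiles from the
                                     // previous iteration must finish before we
                                     // overwrite them below
      // ... write %a into the workgroup allocation ...
      gpu.barrier                    // existing: writes must be visible before
                                     // any thread reads the shared tiles
      vector.contract ...            // read the shared tiles
    }

Without the first barrier, a fast thread can run ahead into the next
iteration and overwrite the shared tile while a slower thread is still
reading the previous iteration's data.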
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
index 913668d..b24a8e9 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
@@ -115,6 +115,14 @@
     });
     for (vector::ContractionOp contractOp : opsToPromote) {
       OpBuilder builder(contractOp);
+
+      // HACK: Until proper barrier placement is handled later we have to
+      // synchronize explicitly in this pass.
+
+      // Synchronize before the write to shared memory to avoid stepping over
+      // reads in the previous iteration of a loop.
+      builder.create<gpu::BarrierOp>(contractOp->getLoc());
+
       // Promote both of the input operands, excluding the accumulator.
       OpOperand &lhs = contractOp.getLhsMutable();
       FailureOr<Value> lhsRet =
@@ -130,8 +138,7 @@
         return signalPassFailure();
       }
 
-      // HACK: Until proper barrier placement is handled later we have to
-      // synchronize here.
+      // Synchronize after the write to shared memory before we read from it.
       builder.create<gpu::BarrierOp>(contractOp->getLoc());
 
       Value lhsVec =
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
index f2c9873..5d005b8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir
@@ -3,11 +3,11 @@
 func.func @matmul_256x256x256(%lhs: tensor<16x256xf16>,
                               %rhs: tensor<256x16xf16>,
                               %out: tensor<16x16xf32>) -> tensor<16x16xf32> {
-  %cst = arith.constant 0.000000e+00 : f16 
+  %cst = arith.constant 0.000000e+00 : f16
   %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
-  %c32 = arith.constant 32 : index 
-  %c256 = arith.constant 256 : index 
-  %c0 = arith.constant 0 : index 
+  %c32 = arith.constant 32 : index
+  %c256 = arith.constant 256 : index
+  %c0 = arith.constant 0 : index
   %8 = scf.for %arg0 = %c0 to %c256 step %c32 iter_args(%arg1 = %cst_0) -> (vector<16x16xf32>) {
     %10 = vector.transfer_read %lhs[%c0, %arg0], %cst {in_bounds = [true, true]} : tensor<16x256xf16>, vector<16x32xf16>
     %11 = vector.transfer_read %rhs[%arg0, %c0], %cst {in_bounds = [true, true]} : tensor<256x16xf16>, vector<32x16xf16>
@@ -23,6 +23,7 @@
 //         CHECK:    scf.for {{.*}} -> (vector<16x16xf32>) {
 //     CHECK-DAG:      %[[A:.*]] = vector.transfer_read %{{.*}} : tensor<16x256xf16>, vector<16x32xf16>
 //     CHECK-DAG:      %[[B:.*]] = vector.transfer_read %{{.*}} : tensor<256x16xf16>, vector<32x16xf16>
+//         CHECK:      gpu.barrier
 
 // LHS copy.
 //         CHECK:      %[[PA:.*]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<16x32xf16, #gpu.address_space<workgroup>>