CanonicalizedSequenceOp: Do not hoist buffer transfers (#11580)

MLIR does not model parallel contexts yet and hoisting buffer transfers
may or may not be safe, depending on which semantics we decide on in the
future. Disable it for now to be safe. Transfers should be hoisted on
tensors when possible.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform.mlir
index 01d5f99..8b55d88 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform.mlir
@@ -36,15 +36,15 @@
 
 // Fusion occurred, no barrier before the loop
 //     CHECK-NOT: gpu.barrier
-//     CHECK:   vector.transfer_read {{.*}} vector<f32>
 // Local per-thread scf.for-based reduction.
 //         CHECK: scf.for
 //         CHECK:   vector.transfer_read {{.*}} vector<2xf32>
+//         CHECK:   vector.transfer_read {{.*}} vector<f32>
 //         CHECK:   vector.reduction <add>{{.*}} : vector<2xf32> into f32
 //         CHECK:   vector.broadcast {{.*}} : f32 to vector<f32>
 // No barrier within the loop
 //     CHECK-NOT:   gpu.barrier
-//         CHECK:   scf.yield {{.*}} : vector<f32>
+//         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
 // Distributed reduction: everyone loads then 5 xor + addf expected.
 //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
@@ -103,15 +103,15 @@
 
 // Fusion occurred, no barrier before the loop
 //     CHECK-NOT: gpu.barrier
-//     CHECK:   vector.transfer_read {{.*}} vector<f32>
 // Local per-thread scf.for-based reduction.
 //         CHECK: scf.for
 //         CHECK:   vector.transfer_read {{.*}} vector<2xf32>
+//         CHECK:   vector.transfer_read {{.*}} vector<f32>
 //         CHECK:   vector.reduction <add>{{.*}} : vector<2xf32> into f32
 //         CHECK:   vector.broadcast {{.*}} : f32 to vector<f32>
 // No barrier within the loop
 //     CHECK-NOT:   gpu.barrier
-//         CHECK:   scf.yield {{.*}} : vector<f32>
+//         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
 // Distributed reduction: everyone loads then 5 xor + addf expected.
 //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
@@ -169,17 +169,17 @@
 
 // Fusion occurred, no barrier before the loop
 //     CHECK-NOT: gpu.barrier
-//     CHECK:   vector.transfer_read {{.*}} vector<f32>
 // Local per-thread scf.for-based reduction.
 //         CHECK: scf.for
 //         CHECK:   vector.transfer_read {{.*}} vector<2xf32>
+//         CHECK:   vector.transfer_read {{.*}} vector<f32>
 //         CHECK:   arith.addf{{.*}} : vector<2xf32>
 //         CHECK:   arith.addf{{.*}} : vector<2xf32>
 //         CHECK:   vector.reduction <add>{{.*}} : vector<2xf32> into f32
 //         CHECK:   vector.broadcast {{.*}} : f32 to vector<f32>
 // No barrier within the loop
 //     CHECK-NOT:   gpu.barrier
-//         CHECK:   scf.yield {{.*}} : vector<f32>
+//         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
 // Distributed reduction: everyone loads then 5 xor + addf expected.
 //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
@@ -240,17 +240,17 @@
 
 // Fusion occurred, no barrier before the loop
 //     CHECK-NOT: gpu.barrier
-//     CHECK:   vector.transfer_read {{.*}} vector<f32>
 // Local per-thread scf.for-based reduction.
 //         CHECK: scf.for
 //         CHECK:   vector.transfer_read {{.*}} vector<2xf32>
+//         CHECK:   vector.transfer_read {{.*}} vector<f32>
 //         CHECK:   arith.addf{{.*}} : vector<2xf32>
 //         CHECK:   arith.addf{{.*}} : vector<2xf32>
 //         CHECK:   vector.reduction <add>{{.*}} : vector<2xf32> into f32
 //         CHECK:   vector.broadcast {{.*}} : f32 to vector<f32>
 // No barrier within the loop
 //     CHECK-NOT:   gpu.barrier
-//         CHECK:   scf.yield {{.*}} : vector<f32>
+//         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
 // Distributed reduction: everyone loads then 5 xor + addf expected.
 //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
@@ -306,15 +306,15 @@
 
 // Fusion occurred, no barrier before the loop
 //     CHECK-NOT: gpu.barrier
-//     CHECK:   vector.transfer_read {{.*}} vector<f32>
 // Local per-thread scf.for-based reduction.
 //         CHECK: scf.for
 //         CHECK:   vector.transfer_read
+//         CHECK:   vector.transfer_read {{.*}} vector<f32>
 //         CHECK:   vector.reduction <add>{{.*}} : vector<4xf32> into f32
 //         CHECK:   vector.broadcast {{.*}} : f32 to vector<f32>
 // No barrier within the loop
 //     CHECK-NOT:   gpu.barrier
-//         CHECK:   scf.yield {{.*}} : vector<f32>
+//         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
 // Distributed reduction: everyone loads then 5 xor + addf expected.
 //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
index dfbdad6..bc3a7fe 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
@@ -190,7 +190,10 @@
     return *this;
   }
   /// Enable hoisting of redundant vector transfer ops.
-  bool hoistRedundantVectorTransfers = true;
+  /// TODO: MLIR does not currently model parallel contexts. It can be unsafe
+  /// to hoist transfers from buffers in a multi-threaded environment, so this
+  /// should not be enabled by default.
+  bool hoistRedundantVectorTransfers = false;
   LinalgEnablingOptions &enableHoistRedundantVectorTransfers(bool val = true) {
     hoistRedundantVectorTransfers = val;
     return *this;
diff --git a/tests/transform_dialect/cuda/reduction_v3.mlir b/tests/transform_dialect/cuda/reduction_v3.mlir
index e627ef5..b17a8a8 100644
--- a/tests/transform_dialect/cuda/reduction_v3.mlir
+++ b/tests/transform_dialect/cuda/reduction_v3.mlir
@@ -48,8 +48,9 @@
   // Local per-thread scf.for-based reduction.
   //         CHECK: scf.for
   //         CHECK:   vector.transfer_read %{{.*}} : memref<f32, strided<[], offset: ?>>, vector<f32>
+  //         CHECK:   vector.transfer_read %{{.*}} vector<f32>
   //         CHECK:   arith.addf {{.*}} : f32
-  //         CHECK:   scf.yield %{{.*}} : vector<f32>
+  //         CHECK:   vector.transfer_write {{.*}} vector<f32>
 
   //         CHECK: %[[TIDY:.]] = gpu.thread_id  y
   // Distributed reduction: everyone loads then 5 xor + addf expected