[Codegen] Add option to disable copy vectorization (#18673)

Vectorization of linalg.copy introduces two vector.transfer ops that
immediately fold away which can cause unexpected results from LICM
resulting in unlinking copy destinations from surrounding loops. Since
vectorization of a tensor copy does not work anyway, this adds an option
to disable vectorization of copies on tensors and defer it until after
bufferization.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
index 8aee5ba..ec7ac03 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -333,6 +333,9 @@
   SmallVector<Operation *> candidates;
   funcOp.walk([&](Operation *op) {
     if (isa<linalg::LinalgOp>(op)) {
+      if (isa<linalg::CopyOp>(op) && !vectorizeCopies) {
+        return;
+      }
       candidates.push_back(op);
     } else if (vectorizePadding && enableVectorMasking &&
                isa<tensor::PadOp>(op)) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index a9a208b..f59409f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -286,6 +286,8 @@
       "Enable vector masking during vectorization.">,
     Option<"useConfiguredVectorSizes", "use-configured-vector-sizes", "bool",/*default=*/"true",
       "Control whether the op lowering config represents a set of masked vector sizes">,
+    Option<"vectorizeCopies", "vectorize-copies", "bool", /*default=*/"true",
+      "Enable vectorization of linalg.copy operations.">,
     Option<"vectorizePadding", "vectorize-padding", "bool", /*default=*/"false",
       "Rewrite all tensor.pad ops in the function to vector form.">,
     Option<"vectorizeGatherAccesses", "vectorize-gather-accesses", "bool", /*default=*/"false",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index c676abd..2d8f56e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -216,7 +216,8 @@
   addBufferizePasses(funcPassManager);
 }
 
-static void addGPUVectorizationPasses(OpPassManager &funcPassManager) {
+static void addGPUVectorizationPasses(OpPassManager &funcPassManager,
+                                      bool vectorizeCopies = true) {
   funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
   funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass());
   funcPassManager.addPass(
@@ -224,6 +225,7 @@
   // Vectorize.
   GenericVectorizationPassOptions options;
   options.vectorizePadding = true;
+  options.vectorizeCopies = vectorizeCopies;
   options.vectorizeGatherAccesses = true;
   options.enableCleanup = false;
   options.foldCastIntoContract = true;
@@ -410,7 +412,7 @@
 
   // Step 6. Lower special ops and vectorize.
   funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
-  addGPUVectorizationPasses(funcPassManager);
+  addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false);
   funcPassManager.addPass(createCleanupBufferAllocViewPass());
   funcPassManager.addPass(createGPUCombineValueBarriersPass());
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 9c29da2..e6b21a6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -877,3 +877,67 @@
 // CHECK:  vector.insert_strided_slice %[[C_70_4]], {{.*}}offsets = [7, 0, 0, 0, 0, 0]{{.*}} : vector<4xf32> into vector<8x1x2x1x1x4xf32>
 // CHECK:  vector.insert_strided_slice %[[C_71_4]], {{.*}}offsets = [7, 0, 1, 0, 0, 0]{{.*}} : vector<4xf32> into vector<8x1x2x1x1x4xf32>
 // CHECK:  vector.transfer_write
+
+// -----
+
+#layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+], flags = Indirect>
+
+#lowering_config = #iree_gpu.lowering_config<{
+  promote_operands = [0, 1],
+  reduction = [0, 0, 4],
+  thread = [1, 4, 0],
+  workgroup = [1, 128, 0]
+}>
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
+
+hal.executable public @main {
+  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
+    hal.executable.export public @small_m_matmul ordinal(0) layout(#layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @small_m_matmul() attributes {translation_info = #translation_info} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<4x1000xf32>>
+        %1 = hal.interface.binding.subspan layout(#layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1000x512xf32>>
+        %2 = hal.interface.binding.subspan layout(#layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<4x512xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 1000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1000xf32>> -> tensor<4x1000xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1000, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1000x512xf32>> -> tensor<1000x512xf32>
+        %5 = tensor.empty() : tensor<4x512xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x512xf32>) -> tensor<4x512xf32>
+        %7 = linalg.matmul {lowering_config = #lowering_config}
+          ins(%3, %4 : tensor<4x1000xf32>, tensor<1000x512xf32>)
+          outs(%6 : tensor<4x512xf32>) -> tensor<4x512xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 512], strides = [1, 1] : tensor<4x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x512xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @small_m_matmul
+//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+//   CHECK-DAG:   %[[LHS_ALLOC:.+]] = memref.alloc() : memref<1x6xf32, #gpu.address_space<workgroup>>
+//   CHECK-DAG:   %[[RHS_ALLOC:.+]] = memref.alloc() : memref<4x130xf32, #gpu.address_space<workgroup>>
+//       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1000 step %c4 {{.*}} -> (vector<1x4xf32>)
+//       CHECK:     gpu.barrier
+
+// TODO: The fact that this read gets hoisted out of the subsequent for loop
+// is a bug in LICM that does no verification that the loop has at least one
+// trip.
+//       CHECK:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<4xf32>
+//       CHECK:     scf.for %{{.*}} = %{{.*}} to %c1 step %c32
+//  CHECK-NEXT:       vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC]]
+//       CHECK:     gpu.barrier
+//   CHECK-DAG:     %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<4xf32>
+//   CHECK-DAG:     %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf32>
+//       CHECK:     vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]