- Move linalg.mmt4d to the tensor -> vector pipeline (#7307)

- Move linalg.mmt4d tiling and vectorization to the tensor -> vector pipeline.
- Add the vector transform patterns needed to get better performance (see the sketch below).
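
For context, here is a minimal sketch of how the two pieces fit together, using the helper names that appear in the diff below (`populateLinalgToVectorVectorizeMMT4dPatterns`, `populateVectorTransferCollapseInnerMostContiguousDimsPatterns`); exact signatures and includes in the IREE tree may differ.

```cpp
// Sketch only: mirrors the two new pattern applications from the diff below.
// Includes for the greedy rewrite driver and the vector transform patterns are
// omitted; names follow the diff, not necessarily the current IREE tree.
static LogicalResult vectorizeMmt4dAndCleanUpTransfers(FuncOp funcOp) {
  MLIRContext *context = funcOp.getContext();

  // 1. Rewrite linalg.mmt4d with an op-specific pattern set instead of the
  //    generic linalg vectorization path
  //    (LLVMCPUTileAndVectorizeLinalgTensorOps.cpp).
  {
    RewritePatternSet patterns(context);
    populateLinalgToVectorVectorizeMMT4dPatterns(context, patterns);
    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns))))
      return failure();
  }

  // 2. Clean up the resulting vector transfers: drop leading unit dims and
  //    collapse the innermost contiguous dims so reads/writes become simple
  //    1-D transfers (OptimizeVectorTransferPass.cpp).
  {
    RewritePatternSet patterns(context);
    mlir::vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
    mlir::vector::populateVectorTransferCollapseInnerMostContiguousDimsPatterns(
        patterns);
    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
  }
  return success();
}
```

The cleanup step matters for mmt4d because the inner tiles (e.g. 4x1x4) introduce unit and contiguous trailing dimensions in the transfers; collapsing them should let the backend emit plain 1-D vector loads and stores.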

Running `iree/test/microbenchmarks/linalg_mmt4d.mlir` on a Pixel 4, core 7:
Baseline:
```
--------------------------------------------------------------------------------------------
Benchmark                                                  Time             CPU   Iterations
--------------------------------------------------------------------------------------------
BM_matmul_384x384x512/process_time/real_time            6.64 ms         6.63 ms          106
BM_mmt4d_384x384x512_4x1x4/process_time/real_time       9.74 ms         9.72 ms           72
BM_mmt4d_384x384x512_8x1x8/process_time/real_time       7.10 ms         7.07 ms          100
```
This PR:
```
--------------------------------------------------------------------------------------------
Benchmark                                                  Time             CPU   Iterations
--------------------------------------------------------------------------------------------
BM_matmul_384x384x512/process_time/real_time            6.24 ms         6.22 ms          112
BM_mmt4d_384x384x512_4x1x4/process_time/real_time       4.41 ms         4.40 ms          159
BM_mmt4d_384x384x512_8x1x8/process_time/real_time       3.83 ms         3.82 ms          183
```
The mmt4d variants are roughly 2.2x (4x1x4) and 1.85x (8x1x8) faster, and both now beat the plain matmul baseline.
```diff
diff --git a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 78e006e..1de9c92 100644
--- a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -95,6 +95,8 @@
     OwningRewritePatternList patterns(&getContext());
     mlir::vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
     patterns.add<TransposeUnitDimToShapeCast>(&getContext());
+    mlir::vector::populateVectorTransferCollapseInnerMostContiguousDimsPatterns(
+        patterns);
     (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
     // Workaround, run loop invariant code motion before hoist redudant vector
     // transfer to workaround a bug upstream.
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 8d675a8..6b2ad8e 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -374,7 +374,7 @@
 
   return setOpConfigAndEntryPointFnTranslation(
       entryPointFn, mmt4dOp, tileSizes, nativeVectorSize,
-      IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization);
+      IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors);
 }
 
 /// Sets the lowering configuration for dispatch region for linalg_ext.fft
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
index 7eb2c56..cdba503 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
@@ -173,6 +173,16 @@
     return;
   }
 
+  // Op-specific vectorization (currently linalg.mmt4d).
+  {
+    RewritePatternSet vectorizeOpsPatterns(context);
+    populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPatterns);
+    if (failed(applyPatternsAndFoldGreedily(funcOp,
+                                            std::move(vectorizeOpsPatterns)))) {
+      return signalPassFailure();
+    }
+  }
+
   // Apply vectorization patterns.
   {
     OwningRewritePatternList vectorizationPatterns(&getContext());
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 2a67f8e..eca3067 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -25,6 +25,7 @@
             "materialize_launch_configuration.mlir",
             "matmul_vectorization.mlir",
             "synchronize_symbol_visibility.mlir",
+            "test_config_mmt4d.mlir",
             "tile_and_vectorize.mlir",
             "unfused_fma.mlir",
             "vector_contract_to_aarch64_asm.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 1aa33ea..137565d 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -20,6 +20,7 @@
     "materialize_launch_configuration.mlir"
     "matmul_vectorization.mlir"
     "synchronize_symbol_visibility.mlir"
+    "test_config_mmt4d.mlir"
     "tile_and_vectorize.mlir"
     "unfused_fma.mlir"
     "vector_contract_to_aarch64_asm.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir b/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
new file mode 100644
index 0000000..8045b29
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
@@ -0,0 +1,58 @@
+// RUN: iree-opt -split-input-file -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm", "embedded-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf"}>
+#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
+#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 96)>
+#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 128)>
+hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {
+    hal.interface public @io {
+      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+      hal.interface.binding public @s0b2_rw_external, set=0, binding=2, type="StorageBuffer", access="Read|Write"
+    }
+    hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+      hal.executable.entry_point public @mmt4d_384x384x512_4x1x4_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
+      builtin.module  {
+        func @mmt4d_384x384x512_4x1x4_dispatch_0() {
+          %c0 = arith.constant 0 : index
+          %c96 = arith.constant 96 : index
+          %c128 = arith.constant 128 : index
+          %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:96x384x4x1xf32>
+          %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:128x384x4x1xf32>
+          %2 = hal.interface.binding.subspan @io::@s0b2_rw_external[%c0] : !flow.dispatch.tensor<readwrite:96x128x4x4xf32>
+          %workgroup_size_x = hal.interface.workgroup.size[0] : index
+          %workgroup_size_y = hal.interface.workgroup.size[1] : index
+          %workgroup_id_x = hal.interface.workgroup.id[0] : index
+          %workgroup_count_x = hal.interface.workgroup.count[0] : index
+          %workgroup_id_y = hal.interface.workgroup.id[1] : index
+          %workgroup_count_y = hal.interface.workgroup.count[1] : index
+          %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
+          %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
+          scf.for %arg0 = %3 to %c96 step %4 {
+            %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+            %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+            scf.for %arg1 = %5 to %c128 step %6 {
+              %7 = affine.min #map3(%arg0)[%workgroup_size_y]
+              %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0], sizes = [%7, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:96x384x4x1xf32> -> tensor<?x384x4x1xf32>
+              %9 = affine.min #map4(%arg1)[%workgroup_size_x]
+              %10 = flow.dispatch.tensor.load %1, offsets = [%arg1, 0, 0, 0], sizes = [%9, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:128x384x4x1xf32> -> tensor<?x384x4x1xf32>
+              %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, 0, 0], sizes = [%7, %9, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:96x128x4x4xf32> -> tensor<?x?x4x4xf32>
+              %12 = linalg.mmt4d {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x384x4x1xf32>, tensor<?x384x4x1xf32>) outs(%11 : tensor<?x?x4x4xf32>) -> tensor<?x?x4x4xf32>
+              flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, 0, 0], sizes = [%7, %9, 4, 4], strides = [1, 1, 1, 1] : tensor<?x?x4x4xf32> -> !flow.dispatch.tensor<readwrite:96x128x4x4xf32>
+            }
+          }
+          return
+        }
+        hal.interface private @io {
+          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+          hal.interface.binding public @s0b2_rw_external, set=0, binding=2, type="StorageBuffer", access="Read|Write"
+        }
+      }
+    }
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[48, 32], [1, 1, 1, 4, 4, 1], [1, 1, 1, 4, 4, 1]{{\]}}, native_vector_size = [1, 1, 1, 4, 4, 1]
+//      CHECK: func @mmt4d_384x384x512_4x1x4_dispatch_0()
+//      CHECK:   linalg.mmt4d
+// CHECK-SAME:     lowering.config = #[[CONFIG]]
```