Use TileFuseAndVectorize pipeline for x86 by default. (#7666)

Also remove the now-unnecessary flags for the pass, since everything
should be lowered to vectors if possible.
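
In short, the pipeline is now picked per root op from the target triple
instead of from a flag. A condensed sketch of the selection logic added
to KernelDispatch.cpp below (getTargetTriple reads the target_triple
entry from the executable variant's target configuration; the actual
code uses a TypeSwitch):

    static DispatchLoweringPassPipeline getDispatchLoweringPassPipeline(
        FuncOp entryPointFn, Operation *op) {
      // Contraction ops on x86_64 targets take the new pipeline; other
      // targets and ops keep their previous defaults.
      if (isa<linalg::ContractionOpInterface>(op)) {
        Optional<std::string> triple = getTargetTriple(entryPointFn);
        if (triple && triple->find("x86_64") != std::string::npos)
          return DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
        return DispatchLoweringPassPipeline::CPUTensorToVectors;
      }
      if (isa<linalg::Mmt4DOp>(op))
        return DispatchLoweringPassPipeline::CPUTensorToVectors;
      return DispatchLoweringPassPipeline::CPUDefault;
    }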
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 9f794ac..09f6353 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -85,11 +85,38 @@
         "linalg.generic and linalg.indexed_generic workgroup tile size"),
     llvm::cl::init(64));
 
-static llvm::cl::opt<bool> clUseTileFuseAndVectorize(
-    "iree-llvmcpu-use-tile-fuse-and-vectorize",
-    llvm::cl::desc(
-        "THIS IS DEVELOPMENT ONLY FLAG. Uses tile, fuse and vectorize"),
-    llvm::cl::init(false));
+using IREE::Codegen::DispatchLoweringPassPipeline;
+
+static Optional<std::string> getTargetTriple(FuncOp entryPointFn) {
+  auto variantOp =
+      entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+  IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
+  if (!targetAttr) return llvm::None;
+  auto config = targetAttr.getConfiguration();
+  if (!config) return llvm::None;
+  auto triple = config.getAs<StringAttr>("target_triple");
+  if (!triple) return llvm::None;
+  return triple.getValue().str();
+}
+
+static DispatchLoweringPassPipeline getDispatchLoweringPassPipeline(
+    FuncOp entryPointFn, Operation *op) {
+  return TypeSwitch<Operation *, DispatchLoweringPassPipeline>(op)
+      .Case<linalg::ContractionOpInterface>([&](auto op) {
+        Optional<std::string> triple = getTargetTriple(entryPointFn);
+        if (triple && triple.getValue().find("x86_64") != std::string::npos) {
+          return DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
+        } else {
+          return DispatchLoweringPassPipeline::CPUTensorToVectors;
+        }
+      })
+      .Case<linalg::Mmt4DOp>([&](auto op) {
+        return DispatchLoweringPassPipeline::CPUTensorToVectors;
+      })
+      .Default([&](Operation *op) {
+        return DispatchLoweringPassPipeline::CPUDefault;
+      });
+}
 
 /// Looks for the `native_vector_size` attribute in the hal.executable.variant
 /// op.
@@ -230,9 +257,9 @@
   SmallVector<int64_t> workloadPerWorkgroup =
       getDefaultWorkloadPerWorkgroup(tiledLoops, nativeVectorSizeInElements);
 
-  setTranslationInfo(
-      entryPointFn, IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault,
-      workloadPerWorkgroup, /*workgroupSize =*/ArrayRef<int64_t>{});
+  setTranslationInfo(entryPointFn, DispatchLoweringPassPipeline::CPUDefault,
+                     workloadPerWorkgroup,
+                     /*workgroupSize =*/ArrayRef<int64_t>{});
   return success();
 }
 
@@ -309,10 +336,9 @@
   }
   setTranslationInfo(
       entryPointFn,
-      clUseTileFuseAndVectorize
-          ? IREE::Codegen::DispatchLoweringPassPipeline::CPUTileFuseAndVectorize
-          : IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors,
-      workloadPerWorkgroup, /*workgroupSize =*/ArrayRef<int64_t>{});
+      getDispatchLoweringPassPipeline(entryPointFn, contractionOp),
+      workloadPerWorkgroup,
+      /*workgroupSize =*/ArrayRef<int64_t>{});
 
   SmallVector<int64_t, 4> l1TileSizes, vectorTileSizes;
   if (isBatchMatmul) {
@@ -388,7 +414,7 @@
 
   return setOpConfigAndEntryPointFnTranslation(
       entryPointFn, mmt4dOp, tileSizes, nativeVectorSize,
-      IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors);
+      getDispatchLoweringPassPipeline(entryPointFn, mmt4dOp));
 }
 
 /// Sets the lowering configuration for dispatch region for linalg_ext.fft
@@ -425,7 +451,7 @@
   return setOpConfigAndEntryPointFnTranslation(
       entryPointFn, fftOp, tileSizes,
       /*nativeVectorSizes=*/ArrayRef<int64_t>{},
-      IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault);
+      getDispatchLoweringPassPipeline(entryPointFn, fftOp));
 }
 
 /// Finds the root operation in the given list of linalg operations and sets
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 72078cd..991be85 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -188,8 +188,7 @@
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUTileFuseAndVectorize:
-            addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
-                                           /*useTileAndVectorizeV2=*/true);
+            addTileFuseAndVectorizePassPipeline(nestedModulePM);
             break;
           default:
             llvm_unreachable("Unsupported pipeline on CPU target.");
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
index 282d6ce..f0f330a 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
@@ -54,19 +54,11 @@
 namespace {
 struct LLVMCPUTileFuseAndVectorizePass
     : public LLVMCPUTileFuseAndVectorizeBase<LLVMCPUTileFuseAndVectorizePass> {
-  LLVMCPUTileFuseAndVectorizePass(bool vectorize = true)
-      : lowerToVectors(vectorize) {}
-  LLVMCPUTileFuseAndVectorizePass(const LLVMCPUTileFuseAndVectorizePass &pass) {
-    lowerToVectors = pass.lowerToVectors;
-  }
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<linalg::LinalgDialect, memref::MemRefDialect,
                     vector::VectorDialect>();
   }
   void runOnOperation() override;
-
- private:
-  bool lowerToVectors;
 };
 
 LogicalResult applyTileAndFuseCanonicalizationPatterns(FuncOp funcOp) {
@@ -215,10 +207,6 @@
     });
   }
 
-  if (!lowerToVectors) {
-    return;
-  }
-
   {
     // Set vectorization marker globally
     OpBuilder builder(funcOp.getContext());
@@ -300,9 +288,8 @@
   }
 }
 
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass(
-    bool lowerToVectors) {
-  return std::make_unique<LLVMCPUTileFuseAndVectorizePass>(lowerToVectors);
+std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass() {
+  return std::make_unique<LLVMCPUTileFuseAndVectorizePass>();
 }
 
 }  // namespace iree_compiler
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index cb0a934..3ba3f6e 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -111,18 +111,12 @@
 }
 
 void addTensorToVectorsPassPipeline(OpPassManager &passManager,
-                                    bool lowerToVectors,
-                                    bool useTileAndVectorizeV2) {
+                                    bool lowerToVectors) {
   passManager.addPass(createCanonicalizerPass());
 
   // Tile and vectorize linalg ops on tensors.
-  if (useTileAndVectorizeV2) {
-    passManager.addNestedPass<FuncOp>(
-        createLLVMCPUTileFuseAndVectorizePass(lowerToVectors));
-  } else {
-    passManager.addNestedPass<FuncOp>(
-        createLLVMCPUTileAndVectorizePass(lowerToVectors));
-  }
+  passManager.addNestedPass<FuncOp>(
+      createLLVMCPUTileAndVectorizePass(lowerToVectors));
   passManager.addNestedPass<FuncOp>(createCSEPass());
   passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
 
@@ -136,6 +130,23 @@
   passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
 }
 
+void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager) {
+  passManager.addPass(createCanonicalizerPass());
+
+  // Tile, fuse, and vectorize linalg ops on tensors.
+  passManager.addNestedPass<FuncOp>(createLLVMCPUTileFuseAndVectorizePass());
+  passManager.addNestedPass<FuncOp>(createCSEPass());
+  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+
+  // Use stack allocation on CPU side.
+  addLinalgBufferizePasses(passManager, cpuAllocationFunction);
+  passManager.addNestedPass<FuncOp>(createCSEPass());
+  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+
+  passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
+  passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
+}
+
 void addCPUDefaultPassPipeline(OpPassManager &passManager) {
   passManager.addPass(createCanonicalizerPass());
   // Use stack allocation on CPU side.
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index a521311..f48dc87 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -6,10 +6,10 @@
     hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
     hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
   }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-arm_64", {
        data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
        native_vector_size = 16 : index,
-       target_triple = "x86_64-unknown-linux-gnu"
+       target_triple = "aarch64-unknown-unknown-eabi-elf"
     }> {
     hal.executable.entry_point @matmul_tensors attributes {
       interface = @io,
@@ -315,10 +315,10 @@
     hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
     hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
   }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-arm_64", {
        data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
        native_vector_size = 16 : index,
-       target_triple = "x86_64-unknown-linux-gnu"
+       target_triple = "aarch64-unknown-unknown-eabi-elf"
     }> {
     hal.executable.entry_point @batch_matmul_tensors attributes {
       interface = @io,
@@ -1296,3 +1296,59 @@
 //  CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-DAG:     %[[D0:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
 //      CHECK:     hal.return %[[D0]], %[[C1]], %[[C1]]
+
+// -----
+
+hal.executable private @matmul_x86  {
+  hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
+      "llvm",
+      "embedded-elf-x86_64", {
+          data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+          native_vector_size = 64 : index,
+          target_triple = "x86_64-unknown-unknown-eabi-elf"
+    }> {
+    hal.executable.entry_point public @matmul_x86 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_x86() {
+        %c128 = arith.constant 128 : index
+        %c384 = arith.constant 384 : index
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:384x512xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:512x128xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:384x128xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c384 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c128 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<?x512xf32>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x?xf32>
+            %11 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg1)[%workgroup_size_x]
+            %13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
+            %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32> 
+            %15 = linalg.matmul ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
+          }
+        }
+        return
+      }
+      hal.interface private @io {
+        hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+//  CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTileFuseAndVectorize", workload_per_wg = [64, 64]>
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 9940ebe..569656f 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -156,8 +156,9 @@
 /// Multi-level tiling and vectorization of linalg ops on tensors.
 std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileAndVectorizePass(
     bool lowerToVectors = true);
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass(
-    bool lowerToVectors = true);
+
+/// Multi-level tiling, fusing and vectorization of linalg ops on tensors.
+std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass();
 
 /// Vectorizes linalg ops executed in the same hal.interface.workgroup.
 std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
@@ -198,8 +199,11 @@
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize = {});
 void addTensorToVectorsPassPipeline(OpPassManager &passManager,
-                                    bool lowerToVectors = true,
-                                    bool useTileAndVectorizeV2 = false);
+                                    bool lowerToVectors = true);
+
+/// Populates the passes needed to multi-level tile, fuse, and vectorize
+/// linalg ops on tensors and lower them to vector operations.
+void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager);
 
 //----------------------------------------------------------------------------//
 // LLVMCPU Pass Pipelines for lowering to LLVM dialect.
diff --git a/iree/test/e2e/cpu_specific/BUILD b/iree/test/e2e/cpu_specific/BUILD
index a2de649..fc37d86 100644
--- a/iree/test/e2e/cpu_specific/BUILD
+++ b/iree/test/e2e/cpu_specific/BUILD
@@ -22,7 +22,6 @@
     ],
     compiler_flags = [
         "-iree-input-type=mhlo",
-        "--iree-llvmcpu-use-tile-fuse-and-vectorize",
     ],
     driver = "dylib",
     target_backend = "dylib-llvm-aot",
diff --git a/iree/test/e2e/cpu_specific/CMakeLists.txt b/iree/test/e2e/cpu_specific/CMakeLists.txt
index 25d2185..42ba467 100644
--- a/iree/test/e2e/cpu_specific/CMakeLists.txt
+++ b/iree/test/e2e/cpu_specific/CMakeLists.txt
@@ -21,7 +21,6 @@
     "dylib"
   COMPILER_FLAGS
     "-iree-input-type=mhlo"
-    "--iree-llvmcpu-use-tile-fuse-and-vectorize"
 )
 
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###