Undo use of upstream pass for first level tile + distribute. (#8885)

The use of the Codegen driver for first-level tile + distribute introduces a lot of code motion that interferes with the conversion to destination passing style (which in turn interferes with bufferization). Instead, what is probably needed is a more targeted pass that uses the upstream tile + fuse + distribute (+ interchange).
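
For reference, the reordered prefix of the two affected pipelines looks roughly
like the sketch below (pass names are taken from the hunks that follow; the
wrapper function addFirstLevelTileAndDistribute is a hypothetical helper used
only for illustration, not code from this change):

  // Hypothetical helper sketching the intended ordering: tile + distribute to
  // workgroups first, then convert to destination passing style before any
  // canonicalization/CSE can hoist the fill/init_tensor chain out of the
  // distributed loop.
  void addFirstLevelTileAndDistribute(OpPassManager &passManager) {
    passManager.addPass(createVerifyLinalgTransformLegalityPass());
    passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
    passManager.addNestedPass<func::FuncOp>(
        createTileAndDistributeToWorkgroupsPass());
    passManager.addNestedPass<func::FuncOp>(
        createConvertToDestinationPassingStylePass());
    passManager.addPass(createCanonicalizerPass());
    passManager.addPass(createCSEPass());
  }
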
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 9f574f4..b4b5ced 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -250,25 +250,15 @@
 
 void addDoubleTilingExpertPassPipeline(OpPassManager &passManager,
                                        bool lowerToAVX2) {
-  // Run preprocessing and verification before starting Linalg transforms.
-  passManager.addNestedPass<func::FuncOp>(
-      createConvertToDestinationPassingStylePass());
-  passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createVerifyLinalgTransformLegalityPass());
 
   // Do first level of tiling and distribution.
-  passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
-  {
-    LinalgFusePassOptions options;
-    options.tilingLevel =
-        static_cast<int64_t>(StrategyTilingLevel::WorkGroupTiles);
-    options.doIREEDistribution = true;
-    passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
-    passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    passManager.addNestedPass<func::FuncOp>(createCSEPass());
-    passManager.addNestedPass<func::FuncOp>(
-        createRewriteLinalgDestructiveUpdatesPass());
-  }
+  passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
+  passManager.addNestedPass<func::FuncOp>(createTileAndDistributeToWorkgroupsPass());
+  passManager.addNestedPass<func::FuncOp>(
+      createConvertToDestinationPassingStylePass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createCSEPass());
 
   // Run LinalgFusePass firstly in case that we have fill + matmul + generic
   // ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -327,25 +317,15 @@
 }
 
 void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
-  // Run preprocessing and verification before starting Linalg transforms.
-  passManager.addNestedPass<func::FuncOp>(
-      createConvertToDestinationPassingStylePass());
-  passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createVerifyLinalgTransformLegalityPass());
 
   // Do first level of tiling and distribution.
-  passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
-  {
-    LinalgFusePassOptions options;
-    options.tilingLevel =
-        static_cast<int64_t>(StrategyTilingLevel::WorkGroupTiles);
-    options.doIREEDistribution = true;
-    passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
-    passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    passManager.addNestedPass<func::FuncOp>(createCSEPass());
-    passManager.addNestedPass<func::FuncOp>(
-        createRewriteLinalgDestructiveUpdatesPass());
-  }
+  passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
+  passManager.addNestedPass<func::FuncOp>(createTileAndDistributeToWorkgroupsPass());
+  passManager.addNestedPass<func::FuncOp>(
+      createConvertToDestinationPassingStylePass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createCSEPass());
 
   // Run LinalgFusePass firstly in case that we have fill + conv + generic
   // ops. At this stage, we do not apply vectorization. The reduction dim won't
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index fd83660..446338e 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -29,6 +29,7 @@
             "materialize_aarch64_launch_configuration.mlir",
             "materialize_riscv_launch_configuration.mlir",
             "materialize_x86_64_launch_configuration.mlir",
+            "pipeline_tests.mlir",
             "synchronize_symbol_visibility.mlir",
             "test_config_mmt4d.mlir",
             "tile_fuse_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 6f6cec0..8035201 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -23,6 +23,7 @@
     "materialize_aarch64_launch_configuration.mlir"
     "materialize_riscv_launch_configuration.mlir"
     "materialize_x86_64_launch_configuration.mlir"
+    "pipeline_tests.mlir"
     "synchronize_symbol_visibility.mlir"
     "test_config_mmt4d.mlir"
     "tile_fuse_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
new file mode 100644
index 0000000..b89b2db
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -0,0 +1,59 @@
+// RUN: iree-opt -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target))' %s | FileCheck %s
+
+// Check that this dispatch compiles to vectors and that there are no allocas.
+// By proxy this checks that destination passing style kicked in correctly
+// and that no CSE was run between the first-level tile + fuse + distribute
+// and the conversion to destination passing style. Running CSE before the
+// conversion hoists the fill and the init_tensor out of the loop, causing
+// issues with the conversion.
+#map3 = affine_map<(d0) -> (d0)>
+#map4 = affine_map<(d0, d1) -> (d0)>
+#map5 = affine_map<(d0, d1) -> (d0, d1)>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+  cpu_features = "",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  native_vector_size = 16 : index,
+  target_triple = "x86_64-unknown-unknown-eabi-elf"}>
+#executable_layout5 = #hal.executable.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>]
+  >]>
+hal.executable private @check_no_cse {
+  hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+    hal.executable.entry_point public @check_no_cse ordinal(0) layout(#executable_layout5)
+    builtin.module {
+      func @check_no_cse() {
+        %cst = arith.constant 3.840000e+02 : f32
+        %cst_0 = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index
+        %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index
+        %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%2) alignment(64) : !flow.dispatch.tensor<readonly:7x384xf32>
+        %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%3) alignment(64) : !flow.dispatch.tensor<writeonly:7xf32>
+        %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:7x384xf32> -> tensor<7x384xf32>
+        %7 = linalg.init_tensor [7] : tensor<7xf32>
+        %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32>
+        %9 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) {
+        ^bb0(%arg0: f32, %arg1: f32):
+          %11 = arith.addf %arg1, %arg0 : f32
+          linalg.yield %11 : f32
+        } -> tensor<7xf32>
+        %10 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) {
+        ^bb0(%arg0: f32, %arg1: f32):
+          %11 = arith.divf %arg0, %cst : f32
+          linalg.yield %11 : f32
+        } -> tensor<7xf32>
+        flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:7xf32>
+        return
+      }
+    }
+  }
+}
+//      CHECK: func @check_no_cse()
+//  CHECK-NOT:    memref.alloc
+//      CHECK:    %[[FOR:.+]] = scf.for
+//      CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
+//      CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
+//      CHECK:    memref.store %[[RES]]