[LLVMCPU] Enable tileDispatchUsingForall as default (#18777)

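Flips the default of `--iree-llvmcpu-tile-dispatch-using-forall` to true so LLVMCPU pipelines tile and distribute dispatches to workgroups using `scf.forall`, and drops the per-pipeline `enableTileDispatchUsingForall` override from `addTileAndDistributePasses`. The `fill_i64`/`splat_i64` stream builtins now compute their workgroup counts with `flow.dispatch.workgroup_count_from_slice` plus `flow.dispatch.workload.ordinal`, grouped-convolution and pooling producers are temporarily excluded from dispatch fusion, and the affected LLVMGPU CHECK lines and ONNX test lists are updated. Passing `--iree-llvmcpu-tile-dispatch-using-forall=false` restores the previous distribution path.
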
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
index ebbe585..218b7f5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
@@ -202,13 +202,16 @@
   llvm::SmallDenseSet<int> droppedLoops;
   for (auto [index, lb, ub, step] :
        llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
-    if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {
+
+    std::optional<int64_t> lbVal = getConstantIntValue(lb);
+    std::optional<int64_t> ubVal = getConstantIntValue(ub);
+    std::optional<int64_t> stepVal = getConstantIntValue(step);
+
+    if (!(lbVal && ubVal && stepVal)) {
       continue;
     }
-    int64_t lbVal = getConstantIntValue(lb).value();
-    int64_t ubVal = getConstantIntValue(ub).value();
-    int64_t stepVal = getConstantIntValue(step).value();
-    if (CEILDIV(ubVal - lbVal, stepVal) == 1) {
+
+    if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
       droppedLoops.insert(index);
     }
   }
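For reference, a standalone sketch (plain C++, not IREE code) of the dropped-loop condition in the hunk above: a loop is dropped only when all three bounds fold to constants (what `getConstantIntValue` returns as `std::optional<int64_t>`) and the trip count rounds up to 1; a positive step is assumed, as in the original CEILDIV macro.

    #include <cassert>
    #include <cstdint>
    #include <optional>

    // A loop is a candidate for dropping only when lb, ub and step are all
    // statically known and the trip count ceil((ub - lb) / step) equals 1.
    static bool isUnitTripCount(std::optional<int64_t> lb,
                                std::optional<int64_t> ub,
                                std::optional<int64_t> step) {
      if (!(lb && ub && step))
        return false;  // any dynamic bound: keep the loop
      return (*ub - *lb + *step - 1) / *step == 1;  // CEILDIV(ub - lb, step)
    }

    int main() {
      assert(isUnitTripCount(0, 4, 4));              // one iteration: drop
      assert(!isUnitTripCount(0, 8, 4));             // two iterations: keep
      assert(!isUnitTripCount(std::nullopt, 8, 4));  // dynamic lb: keep
      return 0;
    }

The switch from `isa<Attribute>` to `getConstantIntValue` matters because a bound may also be an SSA value produced by a constant op; `getConstantIntValue` folds that case too, so such unit loops are now dropped as well.
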
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 0951fbb..71b3aec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -95,7 +95,7 @@
 static llvm::cl::opt<bool> clTileDispatchUsingForall(
     "iree-llvmcpu-tile-dispatch-using-forall",
     llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 // By default, IREE does not enable the Armv9-A streaming SVE mode in the
 // presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@
     llvm::cl::init(false));
 
 // TODO: Enable `TileDispatchUsingForall` for every pipeline.
-static void addTileAndDistributePasses(OpPassManager &funcPassManager,
-                                       bool enableTileDispatchUsingForall) {
-  if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+  if (clTileDispatchUsingForall) {
     funcPassManager.addPass(
         createTileAndDistributeToWorkgroupsUsingForallOpPass());
   } else {
@@ -346,8 +345,7 @@
 void addCPUBufferOpsTileAndVectorizePipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // Skip tiling reduction loops because this is expected to apply on copy ops
   // only.
@@ -384,8 +382,7 @@
 void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
                                       TilingConfig &tilingConfig,
                                       LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
   // Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@
 void addConvTileAndDecomposeExpertPassPipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
   // ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@
 void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
                                       TilingConfig &tilingConfig,
                                       LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   funcPassManager.addPass(createLLVMCPUTileAndFusePass(
       static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
@@ -577,8 +572,7 @@
 void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
                               TilingConfig &tilingConfig,
                               LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);
 
   // The below two passes are nop if pack/unpack is not specified in ukernels
   // attribute. By default, they are disabled.
@@ -621,8 +615,7 @@
 void addCPULinalgExtTileAndVectorizePipeline(
     OpPassManager &funcPassManager, TilingConfig &tilingConfig,
     LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
   funcPassManager.addPass(
       createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
   // TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@
 }
 
 void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
   addCPUBufferizePasses(funcPassManager);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 912acf3..2ebc854 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -290,7 +290,7 @@
 //      CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //      CHECK-DAG:   %[[C720:.+]] = arith.constant 720 : index
 //      CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//         CHECK:   scf.forall ({{.*}}) in (2, 4, 1, 5) {
+//         CHECK:   scf.forall ({{.*}}) in (2, 4, 5) {
 //          CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
 //          CHECK:     gpu.barrier
 //      CHECK-DAG:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -307,7 +307,7 @@
 //          CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
 //          CHECK:   %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
 //          CHECK:   vector.transfer_write %[[EXTRACT]], %[[B2]]
-//         CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+//         CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
index 96e527a..5d7d686 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
@@ -9,16 +9,17 @@
 
 stream.executable private @__builtin_fill_i64 {
   stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module {
     func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
       %1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
       return
     }
   }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
index 7d94e51..4d25d35 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
@@ -9,16 +9,17 @@
 
 stream.executable private @__builtin_splat_i64 {
   stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
     stream.return %x, %y, %z : index, index, index
   }
   builtin.module {
     func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
       %c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
       %1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
       return
     }
   }
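The two builtin changes above follow the same pattern: `flow.dispatch.workgroup_count_from_slice` derives the workgroup count from values used inside the dispatch, so the `%count` operand has to be tagged with `flow.dispatch.workload.ordinal` (here as `%count0`) to tie it back to the export's workload argument; the forall-based distribution path appears to require this form rather than `workgroup_count_from_dag_root`.
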
diff --git a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
index e866022..b38b1a5 100644
--- a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
@@ -547,6 +547,14 @@
     return false;
   }
 
+  // TODO: Enable grouped convolution and depthwise pooling fusion.
+  // Right now, these ops go through the default CPU pipeline and not
+  // through CONVTilingExpert.
+  if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
+          linalg::PoolingNdhwcSumOp>(producer)) {
+    return false;
+  }
+
   auto producerFusionOp =
       dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
   auto consumerFusionOp =
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
index a025431..f8ca790 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
@@ -392,13 +392,6 @@
     "onnx/node/generated/test_softsign_example",
     "onnx/node/generated/test_stft",
     "onnx/node/generated/test_stft_with_window",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0",
-    "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty",
-    "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5",
-    "onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5",
     "onnx/node/generated/test_training_dropout",
     "onnx/node/generated/test_training_dropout_default",
     "onnx/node/generated/test_training_dropout_default_mask",