[Codegen] Create the temp DispatchConfigOp with FromSliceOp. (#24037)

There are cases where pipeline tests do not have a dispatch_config op, and
they currently rely on ReconcileTranslationInfo to insert a stub
DispatchConfigOp. `{c1, c1, c1}` is NOT a solid fallback, so this
revision makes it return the result of FromSliceOp instead.

The revision also fixes a pipeline test for CPU, so that the resolve-hints
pass can map the block arguments to workload ordinal ops correctly.

Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 0eb98da..21c175e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -361,6 +361,10 @@
   let summary =
       "Reconcile information (like workgroup_size, subgroup_size) across "
       "`TranslationInfo` set on each function in the dispatch and merge them";
+  let dependentDialects = [
+    "IREE::Codegen::IREECodegenDialect",
+    "IREE::TensorExt::IREETensorExtDialect",
+  ];
   let options = [
     Option<"distributeAlong", "distribute-along",
            "::mlir::iree_compiler::IREE::Codegen::WorkgroupId",
diff --git a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
index c4b2352..8d758a6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
@@ -22,6 +22,7 @@
 #include "iree/compiler/Codegen/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
+#include "iree/compiler/Dialect/TensorExt/IR/TensorExtOps.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/Analysis/CallGraph.h"
 #include "mlir/Dialect/Affine/Utils.h"
@@ -807,6 +808,13 @@
       }
     }
 
+    // A public function with no translation info of its own and no existing
+    // dispatch_config is a non-entry-point helper.
+    if (!getTranslationInfo(rootFuncOp) &&
+        !configMap.contains(rootFuncOp.getName())) {
+      continue;
+    }
+
     // Reconcile workgroup sizes.
     FailureOr<SmallVector<int64_t>> reconciledWorkgroupSize =
         reconcileWorkgroupSize(translationInfos);
@@ -852,20 +860,29 @@
       configOp = configMap[rootFuncOp.getName()];
     }
 
-    // TODO(hanchung): We should signal a failure if it happens. Currently,
-    // there are some pipeline tests relying on this. It will be fixed once we
-    // have fully migrated the pipeline tests on modules.
+    // If no DispatchConfigOp exists, create one with a body holding a
+    // `workgroup_count_from_slice` placeholder.
     if (!configOp) {
-      // No dispatch_config exists. Create one with a stub body so that
-      // workgroup_size/subgroup_size can be propagated to the export.
-      Location loc = rootFuncOp.getLoc();
+      int64_t numWorkloads = 0;
+      rootFuncOp.walk([&](IREE::TensorExt::DispatchWorkloadOrdinalOp ord) {
+        numWorkloads =
+            std::max(numWorkloads, ord.getOrdinal().getSExtValue() + 1);
+      });
+      OpBuilder::InsertionGuard g(rewriter);
       rewriter.setInsertionPointAfter(rootFuncOp);
+      Location loc = rootFuncOp.getLoc();
       configOp = IREE::Codegen::DispatchConfigOp::create(
           rewriter, loc, FlatSymbolRefAttr::get(rootFuncOp.getNameAttr()));
-      Block *block = rewriter.createBlock(&configOp.getBody());
+      IndexType indexType = rewriter.getIndexType();
+      SmallVector<Type> argTypes(numWorkloads, indexType);
+      SmallVector<Location> argLocs(numWorkloads, loc);
+      Block *block = rewriter.createBlock(&configOp.getBody(), /*insertPt=*/{},
+                                          argTypes, argLocs);
       rewriter.setInsertionPointToStart(block);
-      auto c1 = arith::ConstantIndexOp::create(rewriter, loc, 1);
-      IREE::Codegen::YieldOp::create(rewriter, loc, ValueRange{c1, c1, c1});
+      auto fromSlice =
+          IREE::TensorExt::DispatchWorkgroupCountFromSliceOp::create(
+              rewriter, loc, block->getArguments());
+      IREE::Codegen::YieldOp::create(rewriter, loc, fromSlice.getResults());
     }
     configOp.setWorkgroupSizeAttr(rewriter.getDenseI64ArrayAttr(workgroupSize));
     if (subgroupSizeAttr) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index 220dfec..d4a2879 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -117,6 +117,7 @@
             "propagate_reshapes_by_expansion.mlir",
             "reconcile_translation_info.mlir",
             "reconcile_translation_info_linearize.mlir",
+            "reconcile_translation_info_pure.mlir",
             "reductions.mlir",
             "rematerialize_parallel_ops.mlir",
             "remove_dead_allocs.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index f73aa40..68ef693 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -112,6 +112,7 @@
     "propagate_reshapes_by_expansion.mlir"
     "reconcile_translation_info.mlir"
     "reconcile_translation_info_linearize.mlir"
+    "reconcile_translation_info_pure.mlir"
     "reductions.mlir"
     "rematerialize_parallel_ops.mlir"
     "remove_dead_allocs.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir
new file mode 100644
index 0000000..d7b0ed2
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir
@@ -0,0 +1,68 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-reconcile-translation-info)" %s | FileCheck %s
+
+// Different from other files, this file is purely testing the
+// `ReconcileTranslationInfoPass`. Ideally, each test file should test each pass
+// individually, but the current setup does not allow it.
+
+// Tests for the fallback stub `iree_codegen.dispatch_config` op that
+// `ReconcileTranslationInfoPass` creates when no `dispatch_config` exists for
+// an entry-point function (e.g. iree-opt pipeline tests that bypass
+// `CreateDispatchConfigPass`).
+
+// CHECK-LABEL: func.func @no_workload_ordinals
+//       CHECK: iree_codegen.dispatch_config @no_workload_ordinals workgroup_size = [64, 1, 1]
+//  CHECK-NEXT:   %[[X:.+]], %[[Y:.+]], %[[Z:.+]] = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+//  CHECK-NEXT:   iree_codegen.yield %[[X]], %[[Y]], %[[Z]]
+func.func @no_workload_ordinals() attributes {
+  translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [64]>
+} {
+  return
+}
+
+// -----
+
+// When the function uses workload ordinals, the stub block argument list is
+// sized to cover every referenced ordinal (max ordinal + 1), so that the
+// later `ResolveWorkgroupCountHintsPass` can map each ordinal to a stub
+// workload value.
+
+#pipeline_layout = #hal.pipeline.layout<constants = 4, bindings = []>
+
+// CHECK-LABEL: func.func @with_workload_ordinals
+//       CHECK: iree_codegen.dispatch_config @with_workload_ordinals workgroup_size = [128, 1, 1]
+//  CHECK-NEXT:   ^bb0(%[[A0:.+]]: index, %[[A1:.+]]: index, %[[A2:.+]]: index, %[[A3:.+]]: index):
+//  CHECK-NEXT:   %[[X:.+]], %[[Y:.+]], %[[Z:.+]] = iree_tensor_ext.dispatch.workgroup_count_from_slice(%[[A0]], %[[A1]], %[[A2]], %[[A3]])
+//  CHECK-NEXT:   iree_codegen.yield %[[X]], %[[Y]], %[[Z]]
+func.func @with_workload_ordinals() attributes {
+  translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [128]>
+} {
+  %p0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %p1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %p2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %p3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %x0 = arith.index_castui %p0 : i32 to index
+  %x1 = arith.index_castui %p1 : i32 to index
+  %x2 = arith.index_castui %p2 : i32 to index
+  %x3 = arith.index_castui %p3 : i32 to index
+  %a = iree_tensor_ext.dispatch.workload.ordinal %x0, 0 : index
+  %b = iree_tensor_ext.dispatch.workload.ordinal %x1, 1 : index
+  %c = iree_tensor_ext.dispatch.workload.ordinal %x2, 2 : index
+  %d = iree_tensor_ext.dispatch.workload.ordinal %x3, 3 : index
+  %ab = arith.addi %a, %b : index
+  %cd = arith.addi %c, %d : index
+  %abcd = arith.addi %ab, %cd : index
+  iree_codegen.workgroup_count_hint(%abcd, 1, 1)
+  return
+}
+
+// -----
+
+// A public function with no `translation_info` and no callees with
+// translation_info is treated as a non-entry-point helper. No stub
+// `dispatch_config` is created for it.
+
+// CHECK-LABEL: func.func @helper_without_translation_info
+//   CHECK-NOT: iree_codegen.dispatch_config @helper_without_translation_info
+func.func @helper_without_translation_info() {
+  return
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index aafa082..db208eb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -620,13 +620,22 @@
 // it direct writes the result into the destination buffer.
 
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<bindings = [
+#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [
   #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
   #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
   #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
   #hal.pipeline.binding<storage_buffer, Indirect>]>
-func.func @matmul_accumulate_from_readonly(%M: index, %N: index, %K: index) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+func.func @matmul_accumulate_from_readonly() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
   %c0 = arith.constant 0 : index
+  %m_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %n_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %k_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %m_idx = arith.index_castui %m_i32 : i32 to index
+  %n_idx = arith.index_castui %n_i32 : i32 to index
+  %k_idx = arith.index_castui %k_i32 : i32 to index
+  %M = iree_tensor_ext.dispatch.workload.ordinal %m_idx, 0 : index
+  %N = iree_tensor_ext.dispatch.workload.ordinal %n_idx, 1 : index
+  %K = iree_tensor_ext.dispatch.workload.ordinal %k_idx, 2 : index
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
   %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
   %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}