[Codegen] Create the temp DispatchConfigOp with FromSliceOp. (#24037)
Some pipeline tests do not have a dispatch_config op and currently rely on
ReconcileTranslationInfo to insert a stub DispatchConfigOp. Yielding
`{c1, c1, c1}` is not a solid fallback, so this revision makes the stub
yield the results of a FromSliceOp instead.
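
For illustration, the generated stub now looks roughly like the following
(a sketch reconstructed from the new test's CHECK lines; `@entry_point` and
the SSA names are placeholders, and the printed syntax is approximated):

  iree_codegen.dispatch_config @entry_point workgroup_size = [64, 1, 1] {
  ^bb0(%arg0: index):
    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice(%arg0)
    iree_codegen.yield %x, %y, %z
  }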
The revision also fixes a CPU pipeline test so that the
`ResolveWorkgroupCountHintsPass` can map the stub's block arguments to
workload ordinal ops correctly.
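
For example, after the fix each workload value in the test is tagged with
an ordinal (a sketch of the pattern; the names are placeholders):

  %m = iree_tensor_ext.dispatch.workload.ordinal %m_idx, 0 : index

so ordinal 0 corresponds to the first block argument of the stub
dispatch_config body.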
Signed-off-by: hanhanW <hanhan0912@gmail.com>

diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 0eb98da..21c175e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -361,6 +361,10 @@
let summary =
"Reconcile information (like workgroup_size, subgroup_size) across "
"`TranslationInfo` set on each function in the dispatch and merge them";
+ let dependentDialects = [
+ "IREE::Codegen::IREECodegenDialect",
+ "IREE::TensorExt::IREETensorExtDialect",
+ ];
let options = [
Option<"distributeAlong", "distribute-along",
"::mlir::iree_compiler::IREE::Codegen::WorkgroupId",
diff --git a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
index c4b2352..8d758a6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
@@ -22,6 +22,7 @@
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
+#include "iree/compiler/Dialect/TensorExt/IR/TensorExtOps.h"
#include "llvm/Support/Casting.h"
#include "mlir/Analysis/CallGraph.h"
#include "mlir/Dialect/Affine/Utils.h"
@@ -807,6 +808,13 @@
}
}
+ // A public function with no translation info of its own and no existing
+ // dispatch_config is a non-entry-point helper.
+ if (!getTranslationInfo(rootFuncOp) &&
+ !configMap.contains(rootFuncOp.getName())) {
+ continue;
+ }
+
// Reconcile workgroup sizes.
FailureOr<SmallVector<int64_t>> reconciledWorkgroupSize =
reconcileWorkgroupSize(translationInfos);
@@ -852,20 +860,29 @@
configOp = configMap[rootFuncOp.getName()];
}
- // TODO(hanchung): We should signal a failure if it happens. Currently,
- // there are some pipeline tests relying on this. It will be fixed once we
- // have fully migrated the pipeline tests on modules.
+ // If no DispatchConfigOp exists, create one with a body holding a
+ // `workgroup_count_from_slice` placeholder.
if (!configOp) {
- // No dispatch_config exists. Create one with a stub body so that
- // workgroup_size/subgroup_size can be propagated to the export.
- Location loc = rootFuncOp.getLoc();
+    // Size the stub's block-argument list to cover every referenced
+    // workload ordinal (max ordinal + 1).
+    int64_t numWorkloads = 0;
+    rootFuncOp.walk([&](IREE::TensorExt::DispatchWorkloadOrdinalOp ord) {
+      numWorkloads =
+          std::max(numWorkloads, ord.getOrdinal().getSExtValue() + 1);
+    });
+ OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPointAfter(rootFuncOp);
+ Location loc = rootFuncOp.getLoc();
configOp = IREE::Codegen::DispatchConfigOp::create(
rewriter, loc, FlatSymbolRefAttr::get(rootFuncOp.getNameAttr()));
- Block *block = rewriter.createBlock(&configOp.getBody());
+ IndexType indexType = rewriter.getIndexType();
+ SmallVector<Type> argTypes(numWorkloads, indexType);
+ SmallVector<Location> argLocs(numWorkloads, loc);
+ Block *block = rewriter.createBlock(&configOp.getBody(), /*insertPt=*/{},
+ argTypes, argLocs);
rewriter.setInsertionPointToStart(block);
- auto c1 = arith::ConstantIndexOp::create(rewriter, loc, 1);
- IREE::Codegen::YieldOp::create(rewriter, loc, ValueRange{c1, c1, c1});
+ auto fromSlice =
+ IREE::TensorExt::DispatchWorkgroupCountFromSliceOp::create(
+ rewriter, loc, block->getArguments());
+ IREE::Codegen::YieldOp::create(rewriter, loc, fromSlice.getResults());
}
configOp.setWorkgroupSizeAttr(rewriter.getDenseI64ArrayAttr(workgroupSize));
if (subgroupSizeAttr) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index 220dfec..d4a2879 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -117,6 +117,7 @@
"propagate_reshapes_by_expansion.mlir",
"reconcile_translation_info.mlir",
"reconcile_translation_info_linearize.mlir",
+ "reconcile_translation_info_pure.mlir",
"reductions.mlir",
"rematerialize_parallel_ops.mlir",
"remove_dead_allocs.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index f73aa40..68ef693 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -112,6 +112,7 @@
"propagate_reshapes_by_expansion.mlir"
"reconcile_translation_info.mlir"
"reconcile_translation_info_linearize.mlir"
+ "reconcile_translation_info_pure.mlir"
"reductions.mlir"
"rematerialize_parallel_ops.mlir"
"remove_dead_allocs.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir
new file mode 100644
index 0000000..d7b0ed2
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info_pure.mlir
@@ -0,0 +1,68 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-reconcile-translation-info)" %s | FileCheck %s
+
+// Unlike the other test files, this file exercises only the
+// `ReconcileTranslationInfoPass`. Ideally every test file would target a
+// single pass, but the current setup does not allow it.
+
+// Tests for the fallback stub `iree_codegen.dispatch_config` op that
+// `ReconcileTranslationInfoPass` creates when no `dispatch_config` exists for
+// an entry-point function (e.g. iree-opt pipeline tests that bypass
+// `CreateDispatchConfigPass`).
+
+// CHECK-LABEL: func.func @no_workload_ordinals
+// CHECK: iree_codegen.dispatch_config @no_workload_ordinals workgroup_size = [64, 1, 1]
+// CHECK-NEXT: %[[X:.+]], %[[Y:.+]], %[[Z:.+]] = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+// CHECK-NEXT: iree_codegen.yield %[[X]], %[[Y]], %[[Z]]
+func.func @no_workload_ordinals() attributes {
+ translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [64]>
+} {
+ return
+}
+
+// -----
+
+// When the function uses workload ordinals, the stub block argument list is
+// sized to cover every referenced ordinal (max ordinal + 1), so that the
+// later `ResolveWorkgroupCountHintsPass` can map each ordinal to a stub
+// workload value.
+
+#pipeline_layout = #hal.pipeline.layout<constants = 4, bindings = []>
+
+// CHECK-LABEL: func.func @with_workload_ordinals
+// CHECK: iree_codegen.dispatch_config @with_workload_ordinals workgroup_size = [128, 1, 1]
+// CHECK-NEXT: ^bb0(%[[A0:.+]]: index, %[[A1:.+]]: index, %[[A2:.+]]: index, %[[A3:.+]]: index):
+// CHECK-NEXT: %[[X:.+]], %[[Y:.+]], %[[Z:.+]] = iree_tensor_ext.dispatch.workgroup_count_from_slice(%[[A0]], %[[A1]], %[[A2]], %[[A3]])
+// CHECK-NEXT: iree_codegen.yield %[[X]], %[[Y]], %[[Z]]
+func.func @with_workload_ordinals() attributes {
+ translation_info = #iree_codegen.translation_info<pipeline = None workgroup_size = [128]>
+} {
+ %p0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %p1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+ %p2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+ %p3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+ %x0 = arith.index_castui %p0 : i32 to index
+ %x1 = arith.index_castui %p1 : i32 to index
+ %x2 = arith.index_castui %p2 : i32 to index
+ %x3 = arith.index_castui %p3 : i32 to index
+ %a = iree_tensor_ext.dispatch.workload.ordinal %x0, 0 : index
+ %b = iree_tensor_ext.dispatch.workload.ordinal %x1, 1 : index
+ %c = iree_tensor_ext.dispatch.workload.ordinal %x2, 2 : index
+ %d = iree_tensor_ext.dispatch.workload.ordinal %x3, 3 : index
+ %ab = arith.addi %a, %b : index
+ %cd = arith.addi %c, %d : index
+ %abcd = arith.addi %ab, %cd : index
+ iree_codegen.workgroup_count_hint(%abcd, 1, 1)
+ return
+}
+
+// -----
+
+// A public function with no `translation_info` and no callees with
+// translation_info is treated as a non-entry-point helper. No stub
+// `dispatch_config` is created for it.
+
+// CHECK-LABEL: func.func @helper_without_translation_info
+// CHECK-NOT: iree_codegen.dispatch_config @helper_without_translation_info
+func.func @helper_without_translation_info() {
+ return
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index aafa082..db208eb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -620,13 +620,22 @@
// it direct writes the result into the destination buffer.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<bindings = [
+#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [
#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
#hal.pipeline.binding<storage_buffer, Indirect>]>
-func.func @matmul_accumulate_from_readonly(%M: index, %N: index, %K: index) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+func.func @matmul_accumulate_from_readonly() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%c0 = arith.constant 0 : index
+ %m_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %n_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+ %k_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+ %m_idx = arith.index_castui %m_i32 : i32 to index
+ %n_idx = arith.index_castui %n_i32 : i32 to index
+ %k_idx = arith.index_castui %k_i32 : i32 to index
+ %M = iree_tensor_ext.dispatch.workload.ordinal %m_idx, 0 : index
+ %N = iree_tensor_ext.dispatch.workload.ordinal %n_idx, 1 : index
+ %K = iree_tensor_ext.dispatch.workload.ordinal %k_idx, 2 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}