[LLVMCPU] Enable tileDispatchUsingForall as default (#18777)
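
Flips the default of `iree-llvmcpu-tile-dispatch-using-forall` to true so the
LLVMCPU backend tiles and distributes dispatches to workgroups with
scf.forall, and drops the per-pipeline `enableTileDispatchUsingForall`
parameter from addTileAndDistributePasses now that every pipeline reads the
flag. Supporting changes in this patch:

- TileDispatchUsingForall.cpp: detect unit trip-count loops by querying the
  mixed bounds and steps with getConstantIntValue instead of requiring them
  to be attributes, so constant SSA-valued bounds are handled as well. The
  updated ROCDL pipeline_tile_and_fuse.mlir CHECK lines reflect this: a unit
  workgroup dimension and its workgroup_mapping<z:1> entry are dropped.
- Stream builtins fill_i64 / splat_i64: compute the workgroup count with
  flow.dispatch.workgroup_count_from_slice and tie the dynamic size to a
  flow.dispatch.workload.ordinal.
- FormDispatchRegions.cpp: temporarily stop fusing grouped convolution
  (Conv2DNgchwFgchw / Conv2DNgchwGfchw) and PoolingNdhwcSum producers (see
  the TODO in the diff).
- onnx_ops_cpu_llvm_sync.json: drop the seven tfidfvectorizer entries from
  the CPU test-suite configuration.

The old lowering remains reachable through the flag. A hypothetical
invocation, assuming the usual IREE plumbing that exposes this llvm::cl
option on the compiler command line:

  iree-compile --iree-llvmcpu-tile-dispatch-using-forall=false ...
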
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
index ebbe585..218b7f5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
@@ -202,13 +202,16 @@
llvm::SmallDenseSet<int> droppedLoops;
for (auto [index, lb, ub, step] :
llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
- if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {
+
+ std::optional<int64_t> lbVal = getConstantIntValue(lb);
+ std::optional<int64_t> ubVal = getConstantIntValue(ub);
+ std::optional<int64_t> stepVal = getConstantIntValue(step);
+
+ if (!(lbVal && ubVal && stepVal)) {
continue;
}
- int64_t lbVal = getConstantIntValue(lb).value();
- int64_t ubVal = getConstantIntValue(ub).value();
- int64_t stepVal = getConstantIntValue(step).value();
- if (CEILDIV(ubVal - lbVal, stepVal) == 1) {
+
+ if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
droppedLoops.insert(index);
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 0951fbb..71b3aec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -95,7 +95,7 @@
static llvm::cl::opt<bool> clTileDispatchUsingForall(
"iree-llvmcpu-tile-dispatch-using-forall",
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
- llvm::cl::init(false));
+ llvm::cl::init(true));
// By default, IREE does not enable the Armv9-A streaming SVE mode in the
// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@
llvm::cl::init(false));
// TODO: Enable `TileDispatchUsingForall` for every pipeline.
-static void addTileAndDistributePasses(OpPassManager &funcPassManager,
- bool enableTileDispatchUsingForall) {
- if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+ if (clTileDispatchUsingForall) {
funcPassManager.addPass(
createTileAndDistributeToWorkgroupsUsingForallOpPass());
} else {
@@ -346,8 +345,7 @@
void addCPUBufferOpsTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/true);
+ addTileAndDistributePasses(funcPassManager);
// Skip tiling reduction loops because this is expected to apply on copy ops
// only.
@@ -384,8 +382,7 @@
void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/true);
+ addTileAndDistributePasses(funcPassManager);
SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
// Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@
void addConvTileAndDecomposeExpertPassPipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/true);
+ addTileAndDistributePasses(funcPassManager);
// Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
// ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@
void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/true);
+ addTileAndDistributePasses(funcPassManager);
funcPassManager.addPass(createLLVMCPUTileAndFusePass(
static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
@@ -577,8 +572,7 @@
void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/true);
+ addTileAndDistributePasses(funcPassManager);
// The below two passes are nop if pack/unpack is not specified in ukernels
// attribute. By default, they are disabled.
@@ -621,8 +615,7 @@
void addCPULinalgExtTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/false);
+ addTileAndDistributePasses(funcPassManager);
funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
// TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@
}
void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
- addTileAndDistributePasses(funcPassManager,
- /*enableTileDispatchUsingForall=*/false);
+ addTileAndDistributePasses(funcPassManager);
addCPUBufferizePasses(funcPassManager);
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 912acf3..2ebc854 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -290,7 +290,7 @@
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
-// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) {
+// CHECK: scf.forall ({{.*}}) in (2, 4, 5) {
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
// CHECK: gpu.barrier
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -307,7 +307,7 @@
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
-// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
// -----
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
index 96e527a..5d7d686 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
@@ -9,16 +9,17 @@
stream.executable private @__builtin_fill_i64 {
stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+ %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
- %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
- %0 = tensor.empty(%count) : tensor<?xi64>
+ %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+ %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+ %0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
- flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+ flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
index 7d94e51..4d25d35 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
@@ -9,16 +9,17 @@
stream.executable private @__builtin_splat_i64 {
stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+ %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
- %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
- %0 = tensor.empty(%count) : tensor<?xi64>
+ %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+ %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+ %0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
- flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+ flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
diff --git a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
index e866022..b38b1a5 100644
--- a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp
@@ -547,6 +547,14 @@
return false;
}
+  // TODO: Enable grouped convolution and depthwise pooling fusion.
+  // Right now, these ops go through the default CPU pipeline and not through
+  // CONVTilingExpert.
+ if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
+ linalg::PoolingNdhwcSumOp>(producer)) {
+ return false;
+ }
+
auto producerFusionOp =
dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
auto consumerFusionOp =
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
index a025431..f8ca790 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
@@ -392,13 +392,6 @@
"onnx/node/generated/test_softsign_example",
"onnx/node/generated/test_stft",
"onnx/node/generated/test_stft_with_window",
- "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
- "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
- "onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
- "onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0",
- "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty",
- "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5",
- "onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5",
"onnx/node/generated/test_training_dropout",
"onnx/node/generated/test_training_dropout_default",
"onnx/node/generated/test_training_dropout_default_mask",