Undo use of upstream pass for first level tile + distribute. (#8885)
Using the Codegen driver for the first level of tile + distribute introduces a lot of code motion that interferes with the conversion to destination passing style (which in turn interferes with bufferization). Instead, what is likely needed is a more targeted pass that uses the upstream tile + fuse + distribute (+ interchange) patterns.
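To make the failure mode concrete, here is a hand-written sketch of the IR shape the new ordering preserves (tile sizes, shapes, and names are illustrative, not actual pass output; %input, %output, %zero, and the loop constants are assumed to be defined elsewhere):

  scf.for %iv = %c0 to %c8 step %c4 iter_args(%out = %output) -> (tensor<8xf32>) {
    // The destination stays loop-local: init_tensor + fill are created in
    // the body, so the conversion to destination passing style can tie the
    // tiled reduction to a slice of the output.
    %init = linalg.init_tensor [4] : tensor<4xf32>
    %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
    %slice = tensor.extract_slice %input[%iv, 0] [4, 16] [1, 1] : tensor<8x16xf32> to tensor<4x16xf32>
    %red = linalg.generic {
        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
        iterator_types = ["parallel", "reduction"]}
        ins(%slice : tensor<4x16xf32>) outs(%fill : tensor<4xf32>) {
      ^bb0(%a: f32, %b: f32):
        %s = arith.addf %b, %a : f32
        linalg.yield %s : f32
    } -> tensor<4xf32>
    %ins = tensor.insert_slice %red into %out[%iv] [4] [1] : tensor<4xf32> into tensor<8xf32>
    scf.yield %ins : tensor<8xf32>
  }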
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 9f574f4..b4b5ced 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -250,25 +250,15 @@
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager,
bool lowerToAVX2) {
- // Run preprocessing and verification before starting Linalg transforms.
- passManager.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass());
- passManager.addPass(createCanonicalizerPass());
passManager.addPass(createVerifyLinalgTransformLegalityPass());
// Do first level of tiling and distribution.
- passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
- {
- LinalgFusePassOptions options;
- options.tilingLevel =
- static_cast<int64_t>(StrategyTilingLevel::WorkGroupTiles);
- options.doIREEDistribution = true;
- passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
- passManager.addNestedPass<func::FuncOp>(
- createRewriteLinalgDestructiveUpdatesPass());
- }
+ passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
+ passManager.addNestedPass<func::FuncOp>(createTileAndDistributeToWorkgroupsPass());
+ passManager.addPass(createCanonicalizerPass());
+ passManager.addPass(createCSEPass());
+ passManager.addNestedPass<func::FuncOp>(
+ createConvertToDestinationPassingStylePass());
// Run LinalgFusePass firstly in case that we have fill + matmul + generic
// ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -327,25 +317,15 @@
}
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
- // Run preprocessing and verification before starting Linalg transforms.
- passManager.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass());
- passManager.addPass(createCanonicalizerPass());
passManager.addPass(createVerifyLinalgTransformLegalityPass());
// Do first level of tiling and distribution.
- passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
- {
- LinalgFusePassOptions options;
- options.tilingLevel =
- static_cast<int64_t>(StrategyTilingLevel::WorkGroupTiles);
- options.doIREEDistribution = true;
- passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
- passManager.addNestedPass<func::FuncOp>(
- createRewriteLinalgDestructiveUpdatesPass());
- }
+ passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
+ passManager.addNestedPass<func::FuncOp>(createTileAndDistributeToWorkgroupsPass());
+ passManager.addPass(createCanonicalizerPass());
+ passManager.addPass(createCSEPass());
+ passManager.addNestedPass<func::FuncOp>(
+ createConvertToDestinationPassingStylePass());
// Run LinalgFusePass firstly in case that we have fill + conv + generic
// ops. At this stage, we do not apply vectorization. The reduction dim won't
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index fd83660..446338e 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -29,6 +29,7 @@
"materialize_aarch64_launch_configuration.mlir",
"materialize_riscv_launch_configuration.mlir",
"materialize_x86_64_launch_configuration.mlir",
+ "pipeline_tests.mlir",
"synchronize_symbol_visibility.mlir",
"test_config_mmt4d.mlir",
"tile_fuse_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 6f6cec0..8035201 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -23,6 +23,7 @@
"materialize_aarch64_launch_configuration.mlir"
"materialize_riscv_launch_configuration.mlir"
"materialize_x86_64_launch_configuration.mlir"
+ "pipeline_tests.mlir"
"synchronize_symbol_visibility.mlir"
"test_config_mmt4d.mlir"
"tile_fuse_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
new file mode 100644
index 0000000..b89b2db
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -0,0 +1,59 @@
+// RUN: iree-opt -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target))' %s | FileCheck %s
+
+// Check that this dispatch compiles to vectors and that there are no allocas.
+// By proxy, this checks that the conversion to destination passing style
+// kicked in correctly and that no CSE ran between the first-level
+// tile + fuse + distribute and that conversion. Running CSE earlier
+// hoists the fill and the init_tensor out of the loop, which breaks
+// the conversion.
+#map3 = affine_map<(d0) -> (d0)>
+#map4 = affine_map<(d0, d1) -> (d0)>
+#map5 = affine_map<(d0, d1) -> (d0, d1)>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+ cpu_features = "",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"}>
+#executable_layout5 = #hal.executable.layout<push_constants = 2, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>]
+ >]>
+hal.executable private @check_no_cse {
+ hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+ hal.executable.entry_point public @check_no_cse ordinal(0) layout(#executable_layout5)
+ builtin.module {
+ func @check_no_cse() {
+ %cst = arith.constant 3.840000e+02 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index
+ %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%2) alignment(64) : !flow.dispatch.tensor<readonly:7x384xf32>
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%3) alignment(64) : !flow.dispatch.tensor<writeonly:7xf32>
+ %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:7x384xf32> -> tensor<7x384xf32>
+ %7 = linalg.init_tensor [7] : tensor<7xf32>
+ %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32>
+ %9 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %11 = arith.addf %arg1, %arg0 : f32
+ linalg.yield %11 : f32
+ } -> tensor<7xf32>
+ %10 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %11 = arith.divf %arg0, %cst : f32
+ linalg.yield %11 : f32
+ } -> tensor<7xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:7xf32>
+ return
+ }
+ }
+ }
+}
+// CHECK: func @check_no_cse()
+// CHECK-NOT: memref.alloc
+// CHECK: %[[FOR:.+]] = scf.for
+// CHECK: %[[DIVF:.+]] = arith.divf %[[FOR]]
+// CHECK: %[[RES:.+]] = vector.extract %[[DIVF]]
+// CHECK: memref.store %[[RES]]
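For contrast, a sketch of the shape the test guards against (again hand-written, not actual pass output; same assumed definitions as the sketch above): if CSE runs between the first-level tile + fuse + distribute and the conversion to destination passing style, the fill and the init_tensor end up outside the loop:

  // Hoisted out of the loop by the early CSE (illustrative):
  %init = linalg.init_tensor [4] : tensor<4xf32>
  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
  %res = scf.for %iv = %c0 to %c8 step %c4 iter_args(%out = %output) -> (tensor<8xf32>) {
    %slice = tensor.extract_slice %input[%iv, 0] [4, 16] [1, 1] : tensor<8x16xf32> to tensor<4x16xf32>
    // The reduction's destination is now the loop-invariant %fill rather
    // than a value derived from %out, so the destination-passing-style
    // conversion cannot rewrite the loop to update the output in place;
    // bufferization then materializes the memref.alloc that the CHECK-NOT
    // above guards against.
    %red = linalg.generic {
        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
        iterator_types = ["parallel", "reduction"]}
        ins(%slice : tensor<4x16xf32>) outs(%fill : tensor<4xf32>) {
      ^bb0(%a: f32, %b: f32):
        %s = arith.addf %b, %a : f32
        linalg.yield %s : f32
    } -> tensor<4xf32>
    %ins = tensor.insert_slice %red into %out[%iv] [4] [1] : tensor<4xf32> into tensor<8xf32>
    scf.yield %ins : tensor<8xf32>
  }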