Use TileFuseAndVectorize pipeline for x86 by default. (#7666)
Also remove the now-unnecessary flags for the pass, since everything should be
lowered to vectors if possible.
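
In rough terms, the new default selection added in KernelDispatch.cpp behaves as
sketched below (a minimal C++ sketch only; the helper name pickContractionPipeline
is made up for illustration, and the real logic lives in
getDispatchLoweringPassPipeline in the diff, using types from IREE::Codegen and
llvm::Optional):

    // Contraction ops on x86_64 targets now default to the tile-fuse-and-vectorize
    // pipeline; all other targets keep the existing tensor-to-vectors pipeline.
    static DispatchLoweringPassPipeline pickContractionPipeline(
        Optional<std::string> targetTriple) {
      if (targetTriple && targetTriple->find("x86_64") != std::string::npos)
        return DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
      return DispatchLoweringPassPipeline::CPUTensorToVectors;
    }
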
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 9f794ac..09f6353 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -85,11 +85,38 @@
"linalg.generic and linalg.indexed_generic workgroup tile size"),
llvm::cl::init(64));
-static llvm::cl::opt<bool> clUseTileFuseAndVectorize(
- "iree-llvmcpu-use-tile-fuse-and-vectorize",
- llvm::cl::desc(
- "THIS IS DEVELOPMENT ONLY FLAG. Uses tile, fuse and vectorize"),
- llvm::cl::init(false));
+using IREE::Codegen::DispatchLoweringPassPipeline;
+
+static Optional<std::string> getTargetTriple(FuncOp entryPointFn) {
+ auto variantOp =
+ entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+ IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
+ if (!targetAttr) return llvm::None;
+ auto config = targetAttr.getConfiguration();
+ if (!config) return llvm::None;
+ auto triple = config.getAs<StringAttr>("target_triple");
+ if (!triple) return llvm::None;
+ return triple.getValue().str();
+}
+
+static DispatchLoweringPassPipeline getDispatchLoweringPassPipeline(
+ FuncOp entryPointFn, Operation *op) {
+ return TypeSwitch<Operation *, DispatchLoweringPassPipeline>(op)
+ .Case<linalg::ContractionOpInterface>([&](auto op) {
+ Optional<std::string> triple = getTargetTriple(entryPointFn);
+ if (triple && triple.getValue().find("x86_64") != std::string::npos) {
+ return DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
+ } else {
+ return DispatchLoweringPassPipeline::CPUTensorToVectors;
+ }
+ })
+ .Case<linalg::Mmt4DOp>([&](auto op) {
+ return DispatchLoweringPassPipeline::CPUTensorToVectors;
+ })
+ .Default([&](Operation *op) {
+ return DispatchLoweringPassPipeline::CPUDefault;
+ });
+}
/// Looks for the `native_vector_size` attribute in the hal.executable.variant
/// op.
@@ -230,9 +257,9 @@
SmallVector<int64_t> workloadPerWorkgroup =
getDefaultWorkloadPerWorkgroup(tiledLoops, nativeVectorSizeInElements);
- setTranslationInfo(
- entryPointFn, IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault,
- workloadPerWorkgroup, /*workgroupSize =*/ArrayRef<int64_t>{});
+ setTranslationInfo(entryPointFn, DispatchLoweringPassPipeline::CPUDefault,
+ workloadPerWorkgroup,
+ /*workgroupSize =*/ArrayRef<int64_t>{});
return success();
}
@@ -309,10 +336,9 @@
}
setTranslationInfo(
entryPointFn,
- clUseTileFuseAndVectorize
- ? IREE::Codegen::DispatchLoweringPassPipeline::CPUTileFuseAndVectorize
- : IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors,
- workloadPerWorkgroup, /*workgroupSize =*/ArrayRef<int64_t>{});
+ getDispatchLoweringPassPipeline(entryPointFn, contractionOp),
+ workloadPerWorkgroup,
+ /*workgroupSize =*/ArrayRef<int64_t>{});
SmallVector<int64_t, 4> l1TileSizes, vectorTileSizes;
if (isBatchMatmul) {
@@ -388,7 +414,7 @@
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, mmt4dOp, tileSizes, nativeVectorSize,
- IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors);
+ getDispatchLoweringPassPipeline(entryPointFn, mmt4dOp));
}
/// Sets the lowering configuration for dispatch region for linalg_ext.fft
@@ -425,7 +451,7 @@
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, fftOp, tileSizes,
/*nativeVectorSizes=*/ArrayRef<int64_t>{},
- IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault);
+ getDispatchLoweringPassPipeline(entryPointFn, fftOp));
}
/// Finds the root operation in the given list of linalg operations and sets
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 72078cd..991be85 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -188,8 +188,7 @@
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUTileFuseAndVectorize:
- addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
- /*useTileAndVectorizeV2=*/true);
+ addTileFuseAndVectorizePassPipeline(nestedModulePM);
break;
default:
llvm_unreachable("Unsupported pipeline on CPU target.");
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
index 282d6ce..f0f330a 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
@@ -54,19 +54,11 @@
namespace {
struct LLVMCPUTileFuseAndVectorizePass
: public LLVMCPUTileFuseAndVectorizeBase<LLVMCPUTileFuseAndVectorizePass> {
- LLVMCPUTileFuseAndVectorizePass(bool vectorize = true)
- : lowerToVectors(vectorize) {}
- LLVMCPUTileFuseAndVectorizePass(const LLVMCPUTileFuseAndVectorizePass &pass) {
- lowerToVectors = pass.lowerToVectors;
- }
  void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<linalg::LinalgDialect, memref::MemRefDialect,
vector::VectorDialect>();
}
void runOnOperation() override;
-
- private:
- bool lowerToVectors;
};
LogicalResult applyTileAndFuseCanonicalizationPatterns(FuncOp funcOp) {
@@ -215,10 +207,6 @@
});
}
- if (!lowerToVectors) {
- return;
- }
-
{
// Set vectorization marker globally
OpBuilder builder(funcOp.getContext());
@@ -300,9 +288,8 @@
}
}
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass(
- bool lowerToVectors) {
- return std::make_unique<LLVMCPUTileFuseAndVectorizePass>(lowerToVectors);
+std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass() {
+ return std::make_unique<LLVMCPUTileFuseAndVectorizePass>();
}
} // namespace iree_compiler
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index cb0a934..3ba3f6e 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -111,18 +111,12 @@
}
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
- bool lowerToVectors,
- bool useTileAndVectorizeV2) {
+ bool lowerToVectors) {
passManager.addPass(createCanonicalizerPass());
// Tile and vectorize linalg ops on tensors.
- if (useTileAndVectorizeV2) {
- passManager.addNestedPass<FuncOp>(
- createLLVMCPUTileFuseAndVectorizePass(lowerToVectors));
- } else {
- passManager.addNestedPass<FuncOp>(
- createLLVMCPUTileAndVectorizePass(lowerToVectors));
- }
+ passManager.addNestedPass<FuncOp>(
+ createLLVMCPUTileAndVectorizePass(lowerToVectors));
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
@@ -136,6 +130,23 @@
passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
}
+void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager) {
+ passManager.addPass(createCanonicalizerPass());
+
+ // Tile and vectorize linalg ops on tensors.
+ passManager.addNestedPass<FuncOp>(createLLVMCPUTileFuseAndVectorizePass());
+ passManager.addNestedPass<FuncOp>(createCSEPass());
+ passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+
+ // Use stack allocation on CPU side.
+ addLinalgBufferizePasses(passManager, cpuAllocationFunction);
+ passManager.addNestedPass<FuncOp>(createCSEPass());
+ passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+
+ passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
+ passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
+}
+
void addCPUDefaultPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
// Use stack allocation on CPU side.
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index a521311..f48dc87 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -6,10 +6,10 @@
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
- hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-arm_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
}> {
hal.executable.entry_point @matmul_tensors attributes {
interface = @io,
@@ -315,10 +315,10 @@
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
- hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-arm_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
}> {
hal.executable.entry_point @batch_matmul_tensors attributes {
interface = @io,
@@ -1296,3 +1296,59 @@
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[D0:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
// CHECK: hal.return %[[D0]], %[[C1]], %[[C1]]
+
+// -----
+
+hal.executable private @matmul_x86 {
+ hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
+ "llvm",
+ "embedded-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 64 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"
+ }> {
+ hal.executable.entry_point public @matmul_x86 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_x86() {
+ %c128 = arith.constant 128 : index
+ %c384 = arith.constant 384 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:384x512xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:512x128xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:384x128xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c384 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c128 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<?x512xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg1)[%workgroup_size_x]
+ %13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
+ %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %15 = linalg.matmul ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
+ }
+ }
+ return
+ }
+ hal.interface private @io {
+ hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTileFuseAndVectorize", workload_per_wg = [64, 64]>
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 9940ebe..569656f 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -156,8 +156,9 @@
/// Multi-level tiling and vectorization of linalg ops on tensors.
std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileAndVectorizePass(
bool lowerToVectors = true);
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass(
- bool lowerToVectors = true);
+
+/// Multi-level tiling, fusion, and vectorization of linalg ops on tensors.
+std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileFuseAndVectorizePass();
/// Vectorizes linalg ops executed in the same hal.interface.workgroup.
std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
@@ -198,8 +199,11 @@
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize = {});
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
- bool lowerToVectors = true,
- bool useTileAndVectorizeV2 = false);
+ bool lowerToVectors = true);
+
+/// Populates the passes for multi-level tiling, fusing, and vectorizing of
+/// linalg ops on tensors, lowering them to vector operations.
+void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager);
//----------------------------------------------------------------------------//
// LLVMCPU Pass Pipelines for lowering to LLVM dialect.
diff --git a/iree/test/e2e/cpu_specific/BUILD b/iree/test/e2e/cpu_specific/BUILD
index a2de649..fc37d86 100644
--- a/iree/test/e2e/cpu_specific/BUILD
+++ b/iree/test/e2e/cpu_specific/BUILD
@@ -22,7 +22,6 @@
],
compiler_flags = [
"-iree-input-type=mhlo",
- "--iree-llvmcpu-use-tile-fuse-and-vectorize",
],
driver = "dylib",
target_backend = "dylib-llvm-aot",
diff --git a/iree/test/e2e/cpu_specific/CMakeLists.txt b/iree/test/e2e/cpu_specific/CMakeLists.txt
index 25d2185..42ba467 100644
--- a/iree/test/e2e/cpu_specific/CMakeLists.txt
+++ b/iree/test/e2e/cpu_specific/CMakeLists.txt
@@ -21,7 +21,6 @@
"dylib"
COMPILER_FLAGS
"-iree-input-type=mhlo"
- "--iree-llvmcpu-use-tile-fuse-and-vectorize"
)
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###