[LLVMCPU] Add pass to enable Armv9 Streaming SVE mode (#13558)
This patch enables the Armv9 Scalable Matrix Extension (SME) Streaming SVE (SSVE) mode [1] by scheduling the upstream MLIR ArmSME 'enable-arm-streaming' pass (in its locally-streaming mode) in selected LLVMCPU lowering pipelines.
SSVE is enabled in the LLVM backend at the function boundary by specifying one of the following attributes [2]:
* 'aarch64_pstate_sm_enabled' - calls to functions with this attribute are
  wrapped with 'smstart sm' / 'smstop sm' [3]; this changes the function ABI.
* 'aarch64_pstate_sm_body' - 'smstart sm' / 'smstop sm' are emitted in the
  function prologue / epilogue of functions marked with this attribute; this
  is internal to the function and doesn't change the ABI (see the sketch
  below).
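A minimal sketch (not part of this patch) of how each attribute can be attached in MLIR via the LLVM dialect 'passthrough' mechanism [4]; the function names are hypothetical:

```mlir
// Hypothetical functions showing the two SSVE attributes as LLVM dialect
// passthrough attributes; only 'aarch64_pstate_sm_body' is used by this patch.
llvm.func @callee_abi_change() attributes {passthrough = ["aarch64_pstate_sm_enabled"]} {
  llvm.return
}
llvm.func @body_only() attributes {passthrough = ["aarch64_pstate_sm_body"]} {
  llvm.return
}
```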
The pass attaches the 'aarch64_pstate_sm_body' attribute to functions via the passthrough mechanism [4].
This attribute is used because it keeps PSTATE.SM changes internal to the function, and the goal here is to enable SSVE for dispatch functions that are called by the IREE runtime. The AAPCS64 [5] states that it is the caller's responsibility to ensure PSTATE.SM has a valid value on entry to a callee; the 'aarch64_pstate_sm_enabled' attribute would change the function ABI, making the caller (the IREE runtime) responsible for managing PSTATE.SM around entry and exit. At present, the runtime doesn't know enough about a dispatch to emit these instructions.
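At the MLIR level, the covered pipelines mark dispatch functions with the 'arm_locally_streaming' unit attribute, which is what the lit tests below check and what later lowers to 'aarch64_pstate_sm_body':

```mlir
// A trivial dispatch after one of the covered pipelines runs (see
// pipeline_tests.mlir below): the function is marked locally streaming.
func.func @dispatch() attributes {arm_locally_streaming} {
  return
}
```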
The pass is enabled for AArch64 when both an SVE feature ('+sve' or '+sve2') and SME ('+sme') are present, for the following lowering configurations (a target that satisfies this feature check is sketched after the list):
* CPUBufferOpsTileAndVectorize
* CPUDoubleTilingPeelingExpert
* CPUConvTileAndDecomposeExpert
These configurations were chosen simply because they're used in one of our pipelines.
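The feature check keys off the target's 'cpu_features' string via hasAnySVEFeature / hasSMEFeature (see Utils.cpp below). A trimmed sketch of a qualifying executable target, adapted from the new tests (data layout and native vector size omitted):

```mlir
// Both an SVE feature and '+sme' are required; dropping either from
// 'cpu_features' leaves SSVE disabled for the dispatch.
#aarch64_sme_target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
  cpu_features = "+sve,+sme",
  target_triple = "aarch64-unknown-unknown-eabi-elf"
}>
```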
[1] https://developer.arm.com/documentation/ddi0616/aa
[2] https://llvm.org/docs/AArch64SME.html
[3] https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SMSTART--Enables-access-to-Streaming-SVE-mode-and-SME-architectural-state--an-alias-of-MSR--immediate--
[4] https://mlir.llvm.org/docs/Dialects/LLVM/#attribute-pass-through
[5] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#671pstatesm-interfaces
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index 2899ee9..6a9157c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -86,6 +86,7 @@
"@llvm-project//mlir:ArithTransforms",
"@llvm-project//mlir:ArmNeon2dToIntr",
"@llvm-project//mlir:ArmNeonDialect",
+ "@llvm-project//mlir:ArmSMETransforms",
"@llvm-project//mlir:BufferizationDialect",
"@llvm-project//mlir:ComplexToLLVM",
"@llvm-project//mlir:ComplexToStandard",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4c2bea1..0345814 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -63,6 +63,7 @@
MLIRArithTransforms
MLIRArmNeon2dToIntr
MLIRArmNeonDialect
+ MLIRArmSMETransforms
MLIRBufferizationDialect
MLIRComplexToLLVM
MLIRComplexToStandard
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 77236ea..ba5f638 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -191,6 +191,8 @@
isX86(target) || isRISCV(target) ||
(isAArch64(target) && hasAnySVEFeature(target));
bool enableMicrokernels = hasMicrokernels(target);
+ bool enableAArch64SSVE = isAArch64(target) && hasAnySVEFeature(target) &&
+ hasSMEFeature(target);
if (!testLoweringConfiguration) {
switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
@@ -200,7 +202,8 @@
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUBufferOpsTileAndVectorize:
addCPUBufferOpsTileAndVectorizePipeline(executableLoweringPipeline,
- enableVectorMasking);
+ enableVectorMasking,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUDoubleTilingExpert:
@@ -219,12 +222,14 @@
addMultiTilingExpertPassPipeline(
executableLoweringPipeline,
static_cast<int>(TilingLevel::NumTileLevels),
- /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2);
+ /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUConvTileAndDecomposeExpert:
addConvTileAndDecomposeExpertPassPipeline(
- executableLoweringPipeline, enableVectorMasking);
+ executableLoweringPipeline, enableVectorMasking,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert:
addMmt4dTilingExpertPassPipeline(executableLoweringPipeline,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
index 9331ae5..467bcef 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
@@ -120,7 +120,8 @@
/// pipeline is only used for dispatches that just copy data from input
/// interfaces to output interface.
void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
- bool enableVectorMasking);
+ bool enableVectorMasking,
+ bool enableAArch64SSVE = false);
/// Populates the passes to lower ops through data tiling transformations.
void addCPUDataTilingPipeline(OpPassManager &passManager);
@@ -131,7 +132,8 @@
void addCPUDefaultPassPipeline(OpPassManager &passManager);
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
- bool enableVectorMasking);
+ bool enableVectorMasking,
+ bool enableAArch64SSVE = false);
void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
bool enableVectorMasking);
@@ -144,7 +146,8 @@
void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
int64_t numLevels, bool enablePeeling,
bool enableVectorMasking,
- bool lowerToAVX2);
+ bool lowerToAVX2,
+ bool enableAArch64SSVE = false);
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
bool lowerToVectors = true);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index f9aceab..6c44db8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -21,6 +21,7 @@
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
@@ -319,7 +320,8 @@
//===---------------------------------------------------------------------===//
void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
- bool enableVectorMasking) {
+ bool enableVectorMasking,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
// Skip tiling reduction loops because this is expected to apply on copy ops
@@ -348,6 +350,11 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
@@ -424,7 +431,8 @@
void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
int64_t numLevels, bool enablePeeling,
bool enableVectorMasking,
- bool lowerToAVX2) {
+ bool lowerToAVX2,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -488,10 +496,16 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
- bool enableVectorMasking) {
+ bool enableVectorMasking,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -545,6 +559,11 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
index 6d2171e..ffe9358 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
@@ -98,5 +98,9 @@
return hasFeature(targetAttr, "+sve") || hasFeature(targetAttr, "+sve2");
}
+bool hasSMEFeature(IREE::HAL::ExecutableTargetAttr targetAttr) {
+ return hasFeature(targetAttr, "+sme");
+}
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
index 61aebf0..45cb10a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
@@ -48,6 +48,9 @@
/// features.
bool hasAnySVEFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
+/// Returns true if the 'targetAttr' contains '+sme' in its cpu features.
+bool hasSMEFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 7e0400b..588d886 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -602,3 +602,103 @@
// CHECK-SAME: ins(%[[SUBVIEW_INPUT0]], %[[SUBVIEW_INPUT1]]
// CHECK-SAME: outs(%[[SUBVIEW_OUTPUT]]
+// -----
+
+// Check Armv9 Streaming SVE mode is enabled for the following pipelines:
+//
+// * CPUBufferOpsTileAndVectorize
+// * CPUDoubleTilingPeelingExpert
+// * CPUConvTileAndDecomposeExpert
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
+ cpu_features = "+sve,+sme",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
+}>
+
+hal.executable private @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+hal.executable private @aarch64_ssve__cpu_double_tiling_peeling_expert {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_double_tiling_peeling_expert
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+hal.executable private @aarch64_ssve__cpu_conv_tile_and_decompose_expert {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_conv_tile_and_decompose_expert
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+// Check Armv9 Streaming SVE mode is not enabled when the required +sve
+// feature is missing, even though +sme is specified.
+
+#executable_target_embedded_elf_arm_64_no_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
+ cpu_features = "+sme",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
+}>
+
+hal.executable private @aarch64_ssve_sve_disabled {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_no_sve {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve_sve_disabled
+// CHECK-NOT: func.func @dispatch() attributes {arm_locally_streaming}