[LLVMCPU] Add pass to enable Armv9 Streaming SVE mode (#13558)
This patch enables the Armv9 Scalable Matrix Extension (SME) Streaming SVE (SSVE) mode [1] by scheduling the upstream MLIR ArmSME 'enable-arm-streaming' pass (in its locally-streaming mode) in selected LLVMCPU lowering pipelines.
SSVE is enabled in the LLVM backend at the function boundary by specifying one of the following attributes [2]:
* 'aarch64_pstate_sm_enabled' - calls to functions with this attribute are
  wrapped with 'smstart sm' / 'smstop sm' [3]; this changes the function ABI.
* 'aarch64_pstate_sm_body' - 'smstart sm' / 'smstop sm' are emitted in the
  function prologue / epilogue of functions marked with this attribute; this
  is internal to the function and doesn't change the ABI (see the sketch
  below).
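A minimal sketch (not part of this patch) of how each attribute can be attached in MLIR via the LLVM dialect 'passthrough' mechanism [4]; the function names are hypothetical:

```mlir
// Hypothetical functions showing the two SSVE attributes as LLVM dialect
// passthrough attributes; only 'aarch64_pstate_sm_body' is used by this patch.
llvm.func @callee_abi_change() attributes {passthrough = ["aarch64_pstate_sm_enabled"]} {
  llvm.return
}
llvm.func @body_only() attributes {passthrough = ["aarch64_pstate_sm_body"]} {
  llvm.return
}
```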
The pass attaches the 'aarch64_pstate_sm_body' attribute to functions via the passthrough mechanism [4].
This attribute is used because it keeps PSTATE.SM changes internal to the function, and the goal here is to enable SSVE for dispatch functions that are called by the IREE runtime. The AAPCS64 [5] states that it is the caller's responsibility to ensure PSTATE.SM has a valid value on entry to a callee; the 'aarch64_pstate_sm_enabled' attribute would change the function ABI, making the caller (the IREE runtime) responsible for managing PSTATE.SM around entry and exit. At present, the runtime doesn't know enough about a dispatch to emit these instructions.
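At the MLIR level, the covered pipelines mark dispatch functions with the 'arm_locally_streaming' unit attribute, which is what the lit tests below check and what later lowers to 'aarch64_pstate_sm_body':

```mlir
// A trivial dispatch after one of the covered pipelines runs (see
// pipeline_tests.mlir below): the function is marked locally streaming.
func.func @dispatch() attributes {arm_locally_streaming} {
  return
}
```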
The pass is enabled for AArch64 when both an SVE feature ('+sve' or '+sve2') and SME ('+sme') are present, for the following lowering configurations (a target that satisfies this feature check is sketched after the list):
* CPUBufferOpsTileAndVectorize
* CPUDoubleTilingPeelingExpert
* CPUConvTileAndDecomposeExpert
These configurations were chosen simply because they're used in one of our pipelines.
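The feature check keys off the target's 'cpu_features' string via hasAnySVEFeature / hasSMEFeature (see Utils.cpp below). A trimmed sketch of a qualifying executable target, adapted from the new tests (data layout and native vector size omitted):

```mlir
// Both an SVE feature and '+sme' are required; dropping either from
// 'cpu_features' leaves SSVE disabled for the dispatch.
#aarch64_sme_target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
  cpu_features = "+sve,+sme",
  target_triple = "aarch64-unknown-unknown-eabi-elf"
}>
```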
[1] https://developer.arm.com/documentation/ddi0616/aa
[2] https://llvm.org/docs/AArch64SME.html
[3] https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SMSTART--Enables-access-to-Streaming-SVE-mode-and-SME-architectural-state--an-alias-of-MSR--immediate--
[4] https://mlir.llvm.org/docs/Dialects/LLVM/#attribute-pass-through
[5] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#671pstatesm-interfaces
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index 2899ee9..6a9157c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -86,6 +86,7 @@
"@llvm-project//mlir:ArithTransforms",
"@llvm-project//mlir:ArmNeon2dToIntr",
"@llvm-project//mlir:ArmNeonDialect",
+ "@llvm-project//mlir:ArmSMETransforms",
"@llvm-project//mlir:BufferizationDialect",
"@llvm-project//mlir:ComplexToLLVM",
"@llvm-project//mlir:ComplexToStandard",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4c2bea1..0345814 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -63,6 +63,7 @@
MLIRArithTransforms
MLIRArmNeon2dToIntr
MLIRArmNeonDialect
+ MLIRArmSMETransforms
MLIRBufferizationDialect
MLIRComplexToLLVM
MLIRComplexToStandard
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 77236ea..ba5f638 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -191,6 +191,8 @@
isX86(target) || isRISCV(target) ||
(isAArch64(target) && hasAnySVEFeature(target));
bool enableMicrokernels = hasMicrokernels(target);
+ bool enableAArch64SSVE = isAArch64(target) && hasAnySVEFeature(target) &&
+ hasSMEFeature(target);
if (!testLoweringConfiguration) {
switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
@@ -200,7 +202,8 @@
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUBufferOpsTileAndVectorize:
addCPUBufferOpsTileAndVectorizePipeline(executableLoweringPipeline,
- enableVectorMasking);
+ enableVectorMasking,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUDoubleTilingExpert:
@@ -219,12 +222,14 @@
addMultiTilingExpertPassPipeline(
executableLoweringPipeline,
static_cast<int>(TilingLevel::NumTileLevels),
- /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2);
+ /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUConvTileAndDecomposeExpert:
addConvTileAndDecomposeExpertPassPipeline(
- executableLoweringPipeline, enableVectorMasking);
+ executableLoweringPipeline, enableVectorMasking,
+ enableAArch64SSVE);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert:
addMmt4dTilingExpertPassPipeline(executableLoweringPipeline,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
index 9331ae5..467bcef 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
@@ -120,7 +120,8 @@
/// pipeline is only used for dispatches that just copy data from input
/// interfaces to output interface.
void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
- bool enableVectorMasking);
+ bool enableVectorMasking,
+ bool enableAArch64SSVE = false);
/// Populates the passes to lower ops through data tiling transformations.
void addCPUDataTilingPipeline(OpPassManager &passManager);
@@ -131,7 +132,8 @@
void addCPUDefaultPassPipeline(OpPassManager &passManager);
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
- bool enableVectorMasking);
+ bool enableVectorMasking,
+ bool enableAArch64SSVE = false);
void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
bool enableVectorMasking);
@@ -144,7 +146,8 @@
void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
int64_t numLevels, bool enablePeeling,
bool enableVectorMasking,
- bool lowerToAVX2);
+ bool lowerToAVX2,
+ bool enableAArch64SSVE = false);
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
bool lowerToVectors = true);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index f9aceab..6c44db8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -21,6 +21,7 @@
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
@@ -319,7 +320,8 @@
//===---------------------------------------------------------------------===//
void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
- bool enableVectorMasking) {
+ bool enableVectorMasking,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
// Skip tiling reduction loops because this is expected to apply on copy ops
@@ -348,6 +350,11 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
@@ -424,7 +431,8 @@
void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
int64_t numLevels, bool enablePeeling,
bool enableVectorMasking,
- bool lowerToAVX2) {
+ bool lowerToAVX2,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -488,10 +496,16 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
- bool enableVectorMasking) {
+ bool enableVectorMasking,
+ bool enableAArch64SSVE) {
addTileAndDistributePasses(passManager);
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -545,6 +559,11 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUVectorLoweringPass(options));
}
+
+ if (enableAArch64SSVE)
+ nestedModulePM.addNestedPass<func::FuncOp>(
+ mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreaming::Locally));
}
void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
index 6d2171e..ffe9358 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
@@ -98,5 +98,9 @@
return hasFeature(targetAttr, "+sve") || hasFeature(targetAttr, "+sve2");
}
+bool hasSMEFeature(IREE::HAL::ExecutableTargetAttr targetAttr) {
+ return hasFeature(targetAttr, "+sme");
+}
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
index 61aebf0..45cb10a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
@@ -48,6 +48,9 @@
/// features.
bool hasAnySVEFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
+/// Returns true if the 'targetAttr' contains '+sme' in its cpu features.
+bool hasSMEFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 7e0400b..588d886 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -602,3 +602,103 @@
// CHECK-SAME: ins(%[[SUBVIEW_INPUT0]], %[[SUBVIEW_INPUT1]]
// CHECK-SAME: outs(%[[SUBVIEW_OUTPUT]]
+// -----
+
+// Check Armv9 Streaming SVE mode is enabled for the following pipelines:
+//
+// * CPUBufferOpsTileAndVectorize
+// * CPUDoubleTilingPeelingExpert
+// * CPUConvTileAndDecomposeExpert
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
+ cpu_features = "+sve,+sme",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
+}>
+
+hal.executable private @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+hal.executable private @aarch64_ssve__cpu_double_tiling_peeling_expert {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_double_tiling_peeling_expert
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+hal.executable private @aarch64_ssve__cpu_conv_tile_and_decompose_expert {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve__cpu_conv_tile_and_decompose_expert
+// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+
+// Check Armv9 Streaming SVE mode is not enabled when the required +sve
+// feature is missing, even though +sme is specified.
+
+#executable_target_embedded_elf_arm_64_no_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
+ cpu_features = "+sme",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-unknown-unknown-eabi-elf"
+}>
+
+hal.executable private @aarch64_ssve_sve_disabled {
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_no_sve {
+ hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+ translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
+ } {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ hal.return %arg1, %arg2, %arg2 : index, index, index
+ }
+ builtin.module {
+ func.func @dispatch() { return }
+ }
+ }
+}
+
+// CHECK-LABEL: @aarch64_ssve_sve_disabled
+// CHECK-NOT: func.func @dispatch() attributes {arm_locally_streaming}