[GPU] Adding support for opt pass plugins during AMDGPU executable serialization (#18347)
This commit adds the --iree-hal-target-pass-plugins flag That allows to
add plugins during executable code generation and serialization.
This is interesting for adding instrumentation via external passes
(e.g., https://github.com/CRobeck/instrument-amdgpu-kernels)
I am creating this for two reasons. First, to see if there is interest
for this. Second, to get help on an error I have. Currently, I am having
some issues with my tests where there is a segfault during dlopen. If
anyone has some clue what may be happening, that would be awesome.
---------
Signed-off-by: Jose M Monsalve Diaz <jmonsalv@amd.com>
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index 770a84b..0a2fcc3 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/Verifier.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@@ -61,6 +62,11 @@
std::string enableROCMUkernels = "none";
bool legacySync = true;
+ /// List of LLVM opt pass pluggins to be loaded during GPU code
+ /// generation. The pluggins are paths to dynamic libraries that
+ /// are added to the LLVM pass manager.
+ SmallVector<std::string> passPlugins;
+
void bindOptions(OptionsBinder &binder) {
using namespace llvm;
static cl::OptionCategory category("HIP HAL Target");
@@ -95,6 +101,13 @@
binder.opt<bool>("iree-hip-legacy-sync", legacySync, cl::cat(category),
cl::desc("Enables 'legacy-sync' mode, which is required "
"for inline execution."));
+ binder.list<std::string>(
+ "iree-hip-pass-plugin-path", passPlugins,
+ cl::desc("LLVM pass plugins are out of tree libraries that implement "
+ "LLVM opt passes. The library paths passed in this flag are "
+ "to be passed to the target backend compiler during HIP "
+ "executable serialization"),
+ cl::ZeroOrMore, cl::cat(category));
}
LogicalResult verify(mlir::Builder &builder) const {
@@ -272,7 +285,8 @@
// ones). Inspired by code section in
// https://github.com/iree-org/iree/blob/main/compiler/plugins/target/CUDA/CUDATarget.cpp
static void optimizeModule(llvm::Module &module,
- llvm::TargetMachine &targetMachine) {
+ llvm::TargetMachine &targetMachine,
+ ArrayRef<std::string> passPlugins) {
llvm::LoopAnalysisManager lam;
llvm::FunctionAnalysisManager fam;
llvm::CGSCCAnalysisManager cgam;
@@ -296,6 +310,18 @@
pb.registerLoopAnalyses(lam);
pb.crossRegisterProxies(lam, fam, cgam, mam);
+ for (const std::string &pluginFileName : passPlugins) {
+ llvm::Expected<llvm::PassPlugin> pp =
+ llvm::PassPlugin::Load(pluginFileName);
+ if (pp) {
+ pp->registerPassBuilderCallbacks(pb);
+ } else {
+ std::string error = "unable to load plugin " + pluginFileName + ": " +
+ llvm::toString(pp.takeError());
+ llvm::report_fatal_error(error.c_str());
+ }
+ }
+
llvm::OptimizationLevel ol = llvm::OptimizationLevel::O2;
mpm.addPass(llvm::VerifierPass());
@@ -522,7 +548,7 @@
}
// Run LLVM optimization passes.
- optimizeModule(*llvmModule, *targetMachine);
+ optimizeModule(*llvmModule, *targetMachine, options.passPlugins);
if (!serOptions.dumpIntermediatesPath.empty()) {
dumpModuleToPath(serOptions.dumpIntermediatesPath,
serOptions.dumpBaseName, variantOp.getName(),
diff --git a/compiler/plugins/target/ROCM/test/opt_pass_plugin/CMakeLists.txt b/compiler/plugins/target/ROCM/test/opt_pass_plugin/CMakeLists.txt
new file mode 100644
index 0000000..42b54af
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/opt_pass_plugin/CMakeLists.txt
@@ -0,0 +1,28 @@
+iree_cc_library(
+ NAME
+ GPUHello
+ SRCS
+ "GPUHello.cpp"
+ DEPS
+ iree::compiler::API::Impl
+ SHARED
+)
+
+# NOTE: this is only required because we want this sample to run on all
+# platforms without needing to change the library name (libfoo.so/foo.dll).
+set_target_properties(iree_compiler_plugins_target_ROCM_test_opt_pass_plugin_GPUHello
+ PROPERTIES
+ WINDOWS_EXPORT_ALL_SYMBOLS ON
+ PREFIX "lib"
+ OUTPUT_NAME "GPUHello"
+)
+
+iree_lit_test_suite(
+ NAME
+ lit
+ SRCS
+ "gpu_hello.mlir"
+ TOOLS
+ FileCheck
+ iree-opt
+)
diff --git a/compiler/plugins/target/ROCM/test/opt_pass_plugin/GPUHello.cpp b/compiler/plugins/target/ROCM/test/opt_pass_plugin/GPUHello.cpp
new file mode 100644
index 0000000..6433530
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/opt_pass_plugin/GPUHello.cpp
@@ -0,0 +1,82 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace {
+
+struct GpuHello final : llvm::PassInfoMixin<GpuHello> {
+ llvm::PreservedAnalyses run(llvm::Module &module,
+ llvm::ModuleAnalysisManager &) {
+ bool modifiedCodeGen = runOnModule(module);
+ if (!modifiedCodeGen)
+ return llvm::PreservedAnalyses::none();
+
+ return llvm::PreservedAnalyses::all();
+ }
+
+ bool runOnModule(llvm::Module &module);
+ // We set `isRequired` to true to keep this pass from being skipped
+ // if it has the optnone LLVM attribute.
+ static bool isRequired() { return true; }
+};
+
+bool GpuHello::runOnModule(llvm::Module &module) {
+ for (llvm::Function &function : module) {
+ if (function.isIntrinsic() || function.isDeclaration())
+ continue;
+
+ if (function.getCallingConv() != llvm::CallingConv::AMDGPU_KERNEL &&
+ function.getCallingConv() != llvm::CallingConv::PTX_Kernel)
+ continue;
+
+ for (llvm::BasicBlock &basicBlock : function) {
+ for (llvm::Instruction &inst : basicBlock) {
+ llvm::DILocation *debugLocation = inst.getDebugLoc();
+ std::string sourceInfo;
+ if (!debugLocation) {
+ sourceInfo = function.getName().str();
+ } else {
+ sourceInfo = llvm::formatv("{0}\t{1}:{2}:{3}", function.getName(),
+ debugLocation->getFilename(),
+ debugLocation->getLine(),
+ debugLocation->getColumn())
+ .str();
+ }
+
+ llvm::errs() << "Hello From First Instruction of GPU Kernel: "
+ << sourceInfo << "\n";
+ return false;
+ }
+ }
+ }
+ return false;
+}
+
+} // end anonymous namespace
+
+llvm::PassPluginLibraryInfo getPassPluginInfo() {
+ const auto callback = [](llvm::PassBuilder &pb) {
+ pb.registerOptimizerLastEPCallback([&](llvm::ModulePassManager &mpm, auto) {
+ mpm.addPass(GpuHello());
+ return true;
+ });
+ };
+ return {LLVM_PLUGIN_API_VERSION, "gpu-hello", LLVM_VERSION_STRING, callback};
+};
+
+extern "C" LLVM_ATTRIBUTE_WEAK
+ LLVM_ATTRIBUTE_VISIBILITY_DEFAULT ::llvm::PassPluginLibraryInfo
+ llvmGetPassPluginInfo() {
+ return getPassPluginInfo();
+}
diff --git a/compiler/plugins/target/ROCM/test/opt_pass_plugin/gpu_hello.mlir b/compiler/plugins/target/ROCM/test/opt_pass_plugin/gpu_hello.mlir
new file mode 100644
index 0000000..9409ddd
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/opt_pass_plugin/gpu_hello.mlir
@@ -0,0 +1,39 @@
+// RUN: iree-opt --split-input-file --iree-hal-transformation-pipeline --iree-gpu-test-target=gfx90a \
+// RUN: --iree-hip-pass-plugin-path=$IREE_BINARY_DIR/lib/libGPUHello$IREE_DYLIB_EXT %s 2>&1 | FileCheck %s
+
+module attributes {
+ hal.device.targets = [
+ #hal.device.target<"hip", [
+ #hal.executable.target<"rocm", "rocm-hsaco-fb">
+ ]> : !hal.device
+ ]
+} {
+
+stream.executable public @add_dispatch_0 {
+ stream.executable.export @add_dispatch_0 workgroups(%arg0 : index) -> (index, index, index) {
+ %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+ stream.return %x, %y, %z : index, index, index
+ }
+ builtin.module {
+ func.func @add_dispatch_0(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding, %arg2_binding: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+ %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
+ %arg2 = stream.binding.subspan %arg2_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+ %0 = tensor.empty() : tensor<16xf32>
+ %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+ %2 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+ %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%1, %2 : tensor<16xf32>, tensor<16xf32>) outs(%0 : tensor<16xf32>) {
+ ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
+ %4 = arith.addf %arg3, %arg4 : f32
+ linalg.yield %4 : f32
+ } -> tensor<16xf32>
+ flow.dispatch.tensor.store %3, %arg2, offsets=[0], sizes=[16], strides=[1] : tensor<16xf32> -> !flow.dispatch.tensor<writeonly:tensor<16xf32>>
+ return
+ }
+ }
+}
+
+}
+
+// CHECK: Hello From First Instruction of GPU Kernel: add_dispatch_0