// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "ROCMTargetUtils.h"
#include <cstdint>
#include "compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.h"
#include "compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/Passes.h"
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h"
#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
#include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h"
#include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
#include "iree/compiler/PluginAPI/Client.h"
#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
#include "iree/compiler/Utils/FlatbufferUtils.h"
#include "iree/compiler/Utils/ToolUtils.h"
#include "iree/schemas/amdgpu_executable_def_builder.h"
#include "iree/schemas/hip_executable_def_builder.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"
namespace mlir::iree_compiler::IREE::HAL {
namespace {
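// Container formats an executable can be serialized into; selected via
// --iree-rocm-container-type or inferred from the target ABI attribute.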
enum class ContainerType {
// Automatically detect the container type from the target ABI attribute.
Auto,
// HIP ExecutableDef flatbuffer.
HIP,
// AMDGPU ExecutableDef flatbuffer.
AMDGPU,
// Raw HSACO image (ELF).
HSACO,
};
// TODO(#18792): rename flags back to iree-rocm- as they are not HIP-specific.
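// Flags controlling ROCm/HIP target selection, bitcode linking, and
// executable serialization for this backend.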
struct ROCMOptions {
std::string target = "";
std::string targetFeatures = "";
ContainerType containerType = ContainerType::Auto;
std::string bitcodeDirectory = getDefaultBitcodeDirectory();
int wavesPerEu = 0;
std::string enableROCMUkernels = "none";
std::string encodingLayoutResolver = GPU::kNoEncodingLayoutResolverName;
bool slpVectorization = true;
bool globalISel = false;
bool specializeDispatches = false;
bool enableTensorUKernels = false;
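// Binds the command-line flags below. Example (illustrative) combination
// targeting an MI300-class GPU with all microkernels enabled:
//   --iree-hip-target=gfx942 --iree-hip-target-features=+sramecc,+xnack
//   --iree-hip-enable-ukernels=all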
void bindOptions(OptionsBinder &binder) {
using namespace llvm;
static cl::OptionCategory category("HIP HAL Target");
binder.opt<std::string>(
"iree-hip-target", target, cl::cat(category),
cl::desc(
// clang-format off
"HIP target as expected by LLVM AMDGPU backend; e.g., "
"'gfx90a'/'gfx942' for targeting MI250/MI300 GPUs. "
"Additionally this also supports architecture code names like "
"'cdna3'/'rdna3' or some product names like 'mi300x'/'rtx7900xtx' "
"for a better experience. See "
"https://iree.dev/guides/deployment-configurations/gpu-rocm/ "
"for more details."
// clang-format on
));
binder.opt<std::string>(
"iree-hip-target-features", targetFeatures, cl::cat(category),
cl::desc("HIP target features as expected by LLVM AMDGPU backend; "
"e.g., '+sramecc,+xnack'."));
binder.opt<ContainerType>(
"iree-rocm-container-type", containerType,
llvm::cl::desc("Serialized executable container type."),
llvm::cl::cat(category),
llvm::cl::values(clEnumValN(ContainerType::Auto, "auto",
"Automatically detect the container type "
"from the target ABI attribute."),
clEnumValN(ContainerType::HIP, "hip",
"HIP ExecutableDef flatbuffer."),
clEnumValN(ContainerType::AMDGPU, "amdgpu",
"AMDGPU ExecutableDef flatbuffer."),
clEnumValN(ContainerType::HSACO, "hsaco",
"Raw HSACO image (ELF).")));
binder.opt<std::string>("iree-hip-bc-dir", bitcodeDirectory,
cl::cat(category),
cl::desc("Directory of HIP Bitcode."));
binder.opt<int>("iree-hip-waves-per-eu", wavesPerEu, cl::cat(category),
cl::desc("Optimization hint specifying minimum "
"number of waves per execution unit."));
binder.opt<std::string>(
"iree-hip-enable-ukernels", enableROCMUkernels, cl::cat(category),
cl::desc("Enables microkernels in the HIP compiler backend. May be "
"`default`, `none`, `all`, or a comma-separated list of "
"specific unprefixed microkernels to enable, e.g. `mmt4d`."));
binder.opt<std::string>(
"iree-hip-encoding-layout-resolver", encodingLayoutResolver,
cl::cat(category),
cl::desc("Selects the way that encodings will be "
"resolved. Options are: `none` (resolve to "
"identity layout), `pad` (additional padding "
"on allocations to maximize cache bandwidth), "
"and `data-tiling` (enable data tiled layouts)"));
binder.opt<bool>("iree-hip-llvm-slp-vec", slpVectorization,
cl::cat(category),
cl::desc("Enable slp vectorization in llvm opt."));
binder.opt<bool>("iree-hip-llvm-global-isel", globalISel, cl::cat(category),
cl::desc("Enable global instruction selection in llvm."));
binder.opt<bool>(
"iree-hip-specialize-dispatches", specializeDispatches,
cl::cat(category),
cl::desc(
"Enable runtime specialization of dynamically shaped dispatches."));
binder.opt<bool>("iree-hip-enable-tensor-ukernels", enableTensorUKernels,
cl::cat(category),
cl::desc("Enable MLIR-based ukernels."));
}
LogicalResult verify(mlir::Builder &builder) const {
if (target.empty()) {
return emitError(builder.getUnknownLoc())
<< "HIP target not set; did you forget to pass "
"'--iree-hip-target'?";
}
if (GPU::normalizeHIPTarget(target).empty()) {
return emitError(builder.getUnknownLoc(), "Unknown HIP target '")
<< target << "'";
}
SmallVector<StringRef> features;
llvm::SplitString(targetFeatures, features, ",");
for (StringRef f : features) {
if (!(f.starts_with("+") || f.starts_with("-"))) {
return emitError(builder.getUnknownLoc(),
"HIP target feature must be prefixed with '+' or "
"'-'; but seen '")
<< f << "'";
}
StringRef feature = f.substr(1);
if (feature != "sramecc" && feature != "xnack") {
// We only support setting these two features explicitly. Features
// like wavefrontsize are controlled and tuned by the compiler.
return emitError(builder.getUnknownLoc(),
"HIP target feature can only be 'sramecc' or "
"'xnack'; got '")
<< feature << "'";
}
}
return success();
}
private:
static std::string getDefaultBitcodeDirectory() {
return mlir::iree_compiler::findPlatformLibDirectory("rocm");
}
};
// Returns the ABI or an empty string if unspecified.
static StringRef getABI(IREE::HAL::ExecutableTargetAttr targetAttr) {
if (targetAttr) {
if (auto config = targetAttr.getConfiguration()) {
auto abiAttr = config.getAs<StringAttr>("abi");
return abiAttr ? abiAttr.getValue() : "";
}
}
return "";
}
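// Prints |module| (optionally preceded by |header|) and writes it to |path|
// using the standard dump file naming based on |baseName| and |suffix|.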
static void dumpModuleToPath(StringRef path, StringRef baseName,
StringRef suffix, StringRef extension,
llvm::Module &module, StringRef header = {}) {
llvm::SmallVector<char, 0> data;
llvm::raw_svector_ostream ostream(data);
ostream << header;
module.print(ostream, nullptr);
dumpDataToPath(path, baseName, suffix, extension,
StringRef(data.data(), data.size()));
}
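// Runs the target backend's codegen passes to produce an object file
// (machine code) for |module|.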
static std::string translateModuleToObj(llvm::Module &module,
llvm::TargetMachine &targetMachine) {
std::string targetObj;
{
llvm::raw_string_ostream stream(targetObj);
llvm::buffer_ostream pstream(stream);
llvm::legacy::PassManager codegenPasses;
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
llvm::CodeGenFileType::ObjectFile);
codegenPasses.run(module);
}
return targetObj;
}
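// Runs the target backend's codegen passes to produce textual assembly for
// |module|; used when dumping intermediates.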
static std::string translateModuleToISA(llvm::Module &module,
llvm::TargetMachine &targetMachine) {
std::string targetISA;
{
llvm::raw_string_ostream stream(targetISA);
llvm::buffer_ostream pstream(stream);
llvm::legacy::PassManager codegenPasses;
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
llvm::CodeGenFileType::AssemblyFile);
codegenPasses.run(module);
}
return targetISA;
}
} // namespace
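// HAL target backend that lowers executables through the LLVM AMDGPU
// backend to HSACO and wraps the result in the requested container format.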
class ROCMTargetBackend final : public TargetBackend {
public:
ROCMTargetBackend(const ROCMOptions &options) : options(options) {}
std::string getLegacyDefaultDeviceID() const override { return "hip"; }
void getDefaultExecutableTargets(
MLIRContext *context, StringRef deviceID, DictionaryAttr deviceConfigAttr,
SmallVectorImpl<IREE::HAL::ExecutableTargetAttr> &executableTargetAttrs)
const override {
if (auto target = getExecutableTarget(deviceID, context)) {
executableTargetAttrs.push_back(target);
}
}
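// Builds the #hal.executable.target attribute for |deviceID| ("amdgpu" or
// "hip"), attaching GPU target details, the encoding resolver, any default
// tuning spec, and ukernel settings to its configuration dictionary.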
IREE::HAL::ExecutableTargetAttr
getExecutableTarget(StringRef deviceID, MLIRContext *context) const {
Builder b(context);
SmallVector<NamedAttribute, 4> configItems;
auto addConfig = [&](StringRef name, Attribute value) {
configItems.emplace_back(name, value);
};
if (failed(options.verify(b))) {
return nullptr;
}
addConfig("abi", b.getStringAttr(deviceID));
std::string format;
if (deviceID == "amdgpu") {
format = options.target;
} else {
format = "rocm-hsaco-fb"; // legacy HIP
}
if (auto target = GPU::getHIPTargetDetails(
options.target, options.targetFeatures, context)) {
addConfigGPUTarget(context, target, configItems);
if (options.encodingLayoutResolver !=
GPU::kNoEncodingLayoutResolverName) {
if (Attribute encoding = GPU::getHIPTargetEncodingLayoutAttr(
target, options.encodingLayoutResolver)) {
addConfig(IREE::Encoding::kEncodingResolverAttrName, encoding);
}
}
// Look for a default tuning spec.
auto rocmDialect = context->getOrLoadDialect<IREE::ROCM::ROCMDialect>();
// First check for a spec based on the sku.
std::optional<std::string> maybeSpecName = std::nullopt;
if (IREE::GPU::TargetChipAttr chip = target.getChip()) {
if (StringAttr sku = chip.getSku()) {
std::string specName =
llvm::formatv("iree_default_tuning_spec_{}.mlir", sku.strref());
if (rocmDialect->hasBuiltin(specName)) {
maybeSpecName = specName;
}
}
}
// Then, if none was found, look for one based on the target arch.
if (!maybeSpecName) {
std::string specName =
llvm::formatv("iree_default_tuning_spec_{}.mlir", target.getArch());
if (rocmDialect->hasBuiltin(specName)) {
maybeSpecName = specName;
}
}
if (maybeSpecName) {
addConfig("iree_codegen.default_tuning_spec",
IREE::ROCM::BuiltinTuningModuleAttr::get(
context, maybeSpecName.value()));
}
}
addConfig("ukernels", b.getStringAttr(options.enableROCMUkernels));
if (options.wavesPerEu > 0) {
addConfigWavesPerEu(b.getContext(), options.wavesPerEu, configItems);
}
if (options.enableTensorUKernels) {
addConfig(kUKernelProviderName,
IREE::Codegen::SymbolicUKernelProviderAttr::get(context));
}
return b.getAttr<IREE::HAL::ExecutableTargetAttr>(
b.getStringAttr("rocm"), b.getStringAttr(format),
b.getDictionaryAttr(configItems));
}
void getDependentDialects(DialectRegistry &registry) const override {
mlir::registerBuiltinDialectTranslation(registry);
mlir::registerLLVMDialectTranslation(registry);
mlir::registerROCDLDialectTranslation(registry);
registry.insert<IREE::Codegen::IREECodegenDialect>();
registry.insert<IREE::VectorExt::IREEVectorExtDialect>();
registry.insert<IREE::GPU::IREEGPUDialect>();
registry.insert<IREE::ROCM::ROCMDialect>();
// Configuration may load and manipulate transform dialect libraries.
registerTransformDialectTranslationDependentDialects(registry);
}
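// Applies builtin PDL patterns (dispatch specialization and/or tensor
// ukernels) when enabled, then runs the common LLVMGPU configuration
// pipeline, tuning spec materialization, and lowering strategy selection.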
void
buildConfigurationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr,
OpPassManager &passManager) override {
if (options.specializeDispatches) {
if (auto attr = getGPUTargetAttr(targetAttr.getContext(), targetAttr)) {
ROCM::ApplyBuiltinPDLPatternsPassOptions options;
options.enableSpecialization = true;
if (IREE::GPU::TargetChipAttr chip = attr.getChip()) {
if (StringAttr sku = chip.getSku()) {
options.targets.push_back(sku.str());
}
}
options.targets.push_back(attr.getArch().str());
OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
FunctionLikeNest(modulePassManager).addPass([&]() {
return ROCM::createApplyBuiltinPDLPatternsPass(options);
});
}
}
if (options.enableTensorUKernels) {
if (auto attr = getGPUTargetAttr(targetAttr.getContext(), targetAttr)) {
ROCM::ApplyBuiltinPDLPatternsPassOptions options;
options.enableTensorUKernels = true;
if (IREE::GPU::TargetChipAttr chip = attr.getChip()) {
if (StringAttr sku = chip.getSku()) {
options.targets.push_back(sku.str());
}
}
options.targets.push_back(attr.getArch().str());
OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
FunctionLikeNest(modulePassManager).addPass([&]() {
return ROCM::createApplyBuiltinPDLPatternsPass(options);
});
}
}
passManager.addPass(createSpecializeExportsPass());
buildLLVMGPUCodegenCommonConfigurationPassPipeline(passManager);
OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
if (options.enableTensorUKernels) {
modulePassManager.addPass(
IREE::ROCM::createApplyBuiltinPDLPatternsDriverPass());
}
modulePassManager.addPass(createMaterializeTuningSpecsPass());
modulePassManager.addPass(createMaterializeUserConfigsPass());
modulePassManager.addPass(createLLVMGPUSelectLoweringStrategyPass());
}
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr,
OpPassManager &passManager) override {
buildLLVMGPUCodegenPassPipeline(passManager, /*useROCM=*/true);
}
void buildLinkingPassPipeline(OpPassManager &passManager) override {
buildLLVMGPULinkingPassPipeline(passManager, "rocm");
}
// Performs optimizations on |module| (including LTO-style whole-program
// ones). Inspired by the corresponding code in
// https://github.com/iree-org/iree/blob/main/compiler/plugins/target/CUDA/CUDATarget.cpp
static void optimizeModule(llvm::Module &module,
llvm::TargetMachine &targetMachine,
bool slpVectorization,
std::string &outPassesString) {
llvm::LoopAnalysisManager lam;
llvm::FunctionAnalysisManager fam;
llvm::CGSCCAnalysisManager cgam;
llvm::ModuleAnalysisManager mam;
fam.registerPass([&] { return targetMachine.getTargetIRAnalysis(); });
llvm::PipelineTuningOptions pto;
pto.SLPVectorization = slpVectorization;
llvm::PassInstrumentationCallbacks pic;
llvm::StandardInstrumentations si(module.getContext(), false);
si.registerCallbacks(pic, &mam);
llvm::PassBuilder pb(&targetMachine, pto, std::nullopt, &pic);
llvm::ModulePassManager mpm;
pb.registerModuleAnalyses(mam);
pb.registerCGSCCAnalyses(cgam);
pb.registerFunctionAnalyses(fam);
pb.registerLoopAnalyses(lam);
pb.crossRegisterProxies(lam, fam, cgam, mam);
llvm::OptimizationLevel ol = llvm::OptimizationLevel::O2;
mpm.addPass(llvm::VerifierPass());
mpm.addPass(pb.buildPerModuleDefaultPipeline(ol));
mpm.addPass(llvm::VerifierPass());
llvm::raw_string_ostream os(outPassesString);
mpm.printPipeline(os, [&pic](StringRef className) {
auto passName = pic.getPassNameForClassName(className);
return passName.empty() ? className : passName;
});
mpm.run(module, mam);
}
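// Verifies that no unresolved external functions with live users remain in
// the final bitcode, which would indicate a missing bitcode library or
// ukernel during linking.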
LogicalResult
validateFinalizedModule(IREE::HAL::ExecutableVariantOp variantOp,
llvm::Module &module) {
for (llvm::Function &func : module.functions()) {
if (func.isDeclaration() && !func.isIntrinsic() && !func.use_empty()) {
llvm::User *liveUser = *func.user_begin();
return variantOp.emitError()
<< "found an unresolved external function '" << func.getName()
<< "' in the final bitcode. A remaining live user is\n"
<< llvm::formatv("{}", *liveUser);
}
}
return success();
}
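// Serializes a single executable variant: translates the inner module to
// LLVM IR, links device bitcode and user objects, runs LLVM optimizations,
// emits HSACO, and embeds it in the requested container flatbuffer.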
LogicalResult
serializeExecutable(const SerializationOptions &serializationOptions,
IREE::HAL::ExecutableVariantOp variantOp,
OpBuilder &executableBuilder) override {
ModuleOp innerModuleOp = variantOp.getInnerModule();
auto targetAttr = variantOp.getTargetAttr();
StringRef targetArch = options.target;
StringRef targetFeatures = options.targetFeatures;
if (auto attr = getGPUTargetAttr(variantOp.getContext(), targetAttr)) {
targetArch = attr.getArch();
targetFeatures = attr.getFeatures();
}
// We name our files after the executable name so that they are easy to
// track during compilation (logs/artifacts/etc), as outputs (final
// intermediate code/binary files), and at runtime (loaded
// libraries/symbols/etc).
const std::string libraryName =
variantOp->getParentOfType<IREE::HAL::ExecutableOp>().getName().str();
// Collect all the entry point names.
auto exportOps = llvm::to_vector_of<IREE::HAL::ExecutableExportOp>(
variantOp.getExportOps());
std::optional<uint32_t> subgroupSize;
for (IREE::HAL::ExecutableExportOp exportOp : exportOps) {
// TODO: put this either on the variant or propagate as a function
// attribute instead - today this *must* be consistent across all exports
// and it shouldn't need to be.
if (auto setSubgroupSize = exportOp.getSubgroupSizeAsUInt()) {
if (setSubgroupSize.value() != 32 && setSubgroupSize.value() != 64) {
return variantOp.emitError()
<< "invalid subgroup size " << setSubgroupSize.value();
}
if (subgroupSize.has_value() &&
setSubgroupSize.value() != subgroupSize.value()) {
return variantOp.emitError()
<< "multiple exports with different subgroup sizes; this is a "
"limitation of the IREE compilation process and should be "
"fixed";
}
subgroupSize = setSubgroupSize.value();
}
}
std::string targetHSACO;
if (variantOp.isExternal()) {
if (!variantOp.getObjects().has_value()) {
return variantOp.emitOpError()
<< "no objects defined for external variant";
}
if (variantOp.getObjects()->getValue().size() != 1) {
// For now we assume there will be exactly one object file.
// In the future we will want to perform a linking step here and ideally
// support _also_ linking in the codegen results.
return variantOp.emitOpError() << "only one object reference is "
"supported for external variants";
}
// Read the HSACO from the object file.
auto objectAttr = llvm::cast<IREE::HAL::ExecutableObjectAttr>(
variantOp.getObjects()->getValue().front());
if (auto data = objectAttr.loadData()) {
targetHSACO = data.value();
} else {
return variantOp.emitOpError()
<< "object file could not be loaded: " << objectAttr;
}
} else {
auto maybeChipset = amdgpu::Chipset::parse(targetArch);
if (failed(maybeChipset)) {
return variantOp.emitOpError()
<< "could not parse AMDGPU chipset name '" << targetArch << "'";
}
amdgpu::Chipset chipset = *maybeChipset;
// Perform the translation in a separate context to avoid any
// multi-threading issues.
llvm::LLVMContext context;
std::unique_ptr<llvm::Module> llvmModule =
mlir::translateModuleToLLVMIR(innerModuleOp, context, libraryName);
if (!llvmModule) {
return variantOp.emitError() << "failed to translate the MLIR LLVM "
"dialect to the native llvm::Module";
}
for (auto func : innerModuleOp.getOps<LLVM::LLVMFuncOp>()) {
llvm::Function *llvmFunc = llvmModule->getFunction(func.getName());
if (llvmFunc->isDeclaration())
continue;
// Override flags as given by target func attrs.
if (auto funcAttrs =
func->getAttrOfType<DictionaryAttr>("llvm_func_attrs")) {
for (NamedAttribute funcAttr : funcAttrs) {
auto value = dyn_cast<StringAttr>(funcAttr.getValue());
if (!value) {
return variantOp->emitError()
<< "llvm_func_attrs attribute must be a dictionary of "
"strings. Attribute "
<< funcAttr.getName() << " is not a StringAttr.";
}
llvmFunc->addFnAttr(funcAttr.getName(), value.getValue());
}
}
}
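// Configure the LLVM AMDGPU target machine: triple, architecture,
// wavefront size, FP options, and instruction selection mode.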
std::unique_ptr<llvm::TargetMachine> targetMachine;
bool isWave64 = true;
{
llvm::Triple triple("amdgcn-amd-amdhsa");
std::string error;
const llvm::Target *target =
llvm::TargetRegistry::lookupTarget("", triple, error);
if (!target) {
return variantOp.emitError() << "cannot initialize target triple";
}
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
// Be extra cautious while GlobalISel is less tested, and prevent silent
// fallbacks away from it.
//
// When GlobalISelAbort is enabled, any GlobalISel failure, whether due to
// an unimplemented case or incorrect IR, immediately aborts compilation.
// This disables the fallback path in AMDGPUPassConfig::addInstSelector
// (and two legacy passes) that could otherwise paper over unimplemented
// cases or errors in GlobalISel, yielding a successful compilation that
// one would wrongly assume was produced with GlobalISel.
opt.EnableGlobalISel = options.globalISel;
opt.GlobalISelAbort = options.globalISel
? llvm::GlobalISelAbortMode::Enable
: llvm::GlobalISelAbortMode::Disable;
SmallVector<std::string> features;
if (chipset.majorVersion >= 10 && chipset.majorVersion <= 12) {
switch (subgroupSize.value_or(64)) {
case 32:
isWave64 = false;
features.emplace_back("+wavefrontsize32");
break;
default:
case 64:
features.emplace_back("+wavefrontsize64");
break;
}
}
// Mixed-precision FMA instructions have complicated semantics on
// gfx9+ GPUs and can lead to numeric issues as seen in
// https://github.com/iree-org/iree/issues/18746, so we disable this
// feature.
if (targetArch.starts_with("gfx9")) {
features.emplace_back("-fma-mix-insts");
}
if (!targetFeatures.empty()) {
features.emplace_back(targetFeatures.str());
}
std::string featureStr = llvm::join(features, ",");
targetMachine.reset(target->createTargetMachine(
triple, targetArch, featureStr, opt, llvm::Reloc::Model::PIC_,
std::nullopt, llvm::CodeGenOptLevel::Aggressive));
if (!targetMachine) {
return variantOp.emitError() << "cannot initialize target machine";
}
}
llvmModule->setDataLayout(targetMachine->createDataLayout());
// Code object version * 100 (500 selects code object v5).
constexpr uint32_t abiVersion = 500;
// Let the backend know what code object version we're compiling for. This
// insulates us from changes to the default code object version that our
// CI or users may not be prepared for.
llvmModule->addModuleFlag(llvm::Module::Error,
"amdhsa_code_object_version", abiVersion);
for (llvm::Function &f : llvmModule->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
// Link user-provided modules.
llvm::Linker linker(*llvmModule);
if (failed(linkCmdlineBitcodeFiles(
variantOp.getLoc(), linker, llvm::Linker::OverrideFromSrc,
*targetMachine, llvmModule->getContext()))) {
return failure();
}
// Link bitcode (*.bc) object attrs specified by the input program.
// Note that this happens after the command-line files so that the command
// line ones override the symbols coming from the embedded files.
auto specializationCallback = [&](llvm::Module &userModule) {
// TODO: inject __nvvm_reflect-style functions/globals for
// bitcode specialization based on the targetMachine and configuration.
// These could use any information we have on the IREE side as well as
// the TargetMachine.
};
unsigned linkerFlags =
llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc;
if (failed(linkBitcodeObjects(variantOp.getLoc(), linker, linkerFlags,
*targetMachine, variantOp.getObjectsAttr(),
llvmModule->getContext(),
specializationCallback))) {
return mlir::emitError(variantOp.getLoc())
<< "failed linking in user objects for target architecture '"
<< targetArch.str() << "'";
}
// Link module to HIP device library.
if (options.bitcodeDirectory.empty()) {
return variantOp.emitError()
<< "cannot find ROCM bitcode files. Check your installation "
"consistency and in the worst case, set "
"--iree-hip-bc-dir= to a path on your system.";
}
if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
targetArch,
options.bitcodeDirectory))) {
return failure();
}
// Sets HIP platform globals based on the target architecture.
if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset,
isWave64, abiVersion))) {
return failure();
}
if (!serializationOptions.dumpIntermediatesPath.empty()) {
dumpModuleToPath(serializationOptions.dumpIntermediatesPath,
serializationOptions.dumpBaseName, variantOp.getName(),
".linked.ll", *llvmModule);
}
// For example 'gfx942'
StringRef targetCPU = targetMachine->getTargetCPU();
// For example 'amdgcn-amd-amdhsa'
std::string targetTriple = targetMachine->getTargetTriple().str();
// Run LLVM optimization passes.
std::string passesString;
optimizeModule(*llvmModule, *targetMachine, options.slpVectorization,
passesString);
if (!serializationOptions.dumpIntermediatesPath.empty()) {
// For additional context on the '-mcpu' flag, see PR comments, for example:
// https://github.com/iree-org/iree/pull/20716#issuecomment-2851650421
std::string header =
llvm::formatv(R"TXT(
; To reproduce the .optimized.ll from the .linked.ll, run:
;   opt -S -mtriple={} -mcpu={} --passes='{}'
; The flag '-S' is to emit textual LLVM IR.
; The behavior of some passes depends on '-mtriple' and '-mcpu'.
)TXT",
targetTriple, targetCPU, passesString);
dumpModuleToPath(serializationOptions.dumpIntermediatesPath,
serializationOptions.dumpBaseName, variantOp.getName(),
".optimized.ll", *llvmModule, header);
}
if (failed(validateFinalizedModule(variantOp, *llvmModule))) {
return failure();
}
// Dump the assembly output.
if (!serializationOptions.dumpIntermediatesPath.empty()) {
auto moduleCopy = llvm::CloneModule(*llvmModule);
if (!moduleCopy) {
llvm::errs() << "Error: cloning LLVM IR failed\n";
return failure();
}
std::string targetISA =
translateModuleToISA(*moduleCopy.get(), *targetMachine);
dumpDataToPath(serializationOptions.dumpIntermediatesPath,
serializationOptions.dumpBaseName, variantOp.getName(),
".rocmasm", targetISA);
}
// Serialize hsaco kernel into the binary that we will embed in the
// final FlatBuffer.
std::string targetObj = translateModuleToObj(*llvmModule, *targetMachine);
targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName);
if (targetHSACO.empty())
return failure();
}
if (!serializationOptions.dumpBinariesPath.empty()) {
dumpDataToPath(serializationOptions.dumpBinariesPath,
serializationOptions.dumpBaseName, variantOp.getName(),
".hsaco", targetHSACO);
}
// Determine container type from the target ABI attribute.
ContainerType containerType = options.containerType;
if (containerType == ContainerType::Auto) {
if (getABI(targetAttr) == "amdgpu") {
containerType = ContainerType::AMDGPU;
} else {
containerType = ContainerType::HIP;
}
}
// Wrap the HSACO ELF binary in the requested container type (if any).
FailureOr<DenseIntElementsAttr> binaryContainer;
switch (containerType) {
case ContainerType::Auto: {
// Auto is resolved to a concrete type above, so this is unreachable;
// fall through to the failure check below.
assert(false && "auto container type must have resolved earlier");
break;
}
case ContainerType::AMDGPU: {
binaryContainer = serializeAMDGPUBinaryContainer(
serializationOptions, variantOp, exportOps, targetHSACO);
break;
}
case ContainerType::HIP: {
binaryContainer = serializeHIPBinaryContainer(
serializationOptions, variantOp, exportOps, targetHSACO);
break;
}
case ContainerType::HSACO: {
SmallVector<uint8_t> image;
image.resize(targetHSACO.size());
std::memcpy(image.data(), targetHSACO.data(), image.size());
binaryContainer = DenseIntElementsAttr::get(
VectorType::get({static_cast<int64_t>(targetHSACO.size())},
executableBuilder.getI8Type()),
image);
break;
}
}
if (failed(binaryContainer) || !binaryContainer.value()) {
return failure();
}
// Add the binary data to the target executable.
executableBuilder.create<iree_compiler::IREE::HAL::ExecutableBinaryOp>(
variantOp.getLoc(), variantOp.getSymName(),
variantOp.getTarget().getFormat(), binaryContainer.value());
return success();
}
protected:
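// Wraps the HSACO image in an iree_hal_amdgpu_ExecutableDef flatbuffer with
// per-export metadata (symbol names, workgroup sizes, constant counts, and
// binding flags) for the AMDGPU container type.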
FailureOr<DenseIntElementsAttr> serializeAMDGPUBinaryContainer(
const SerializationOptions &serializationOptions,
IREE::HAL::ExecutableVariantOp variantOp,
ArrayRef<IREE::HAL::ExecutableExportOp> exportOps,
StringRef hsacoModule) {
iree_compiler::FlatbufferBuilder builder;
iree_hal_amdgpu_ExecutableDef_start_as_root(builder);
// Attach embedded source file contents.
auto sourceFilesRef = createSourceFilesVec(
serializationOptions.debugLevel, variantOp.getSourcesAttr(), builder);
// Only a single module today.
SmallVector<iree_hal_amdgpu_ModuleDef_ref_t> moduleRefs;
{
auto hsacoImageRef = flatbuffers_string_create(
builder, hsacoModule.data(), hsacoModule.size());
moduleRefs.push_back(
iree_hal_amdgpu_ModuleDef_create(builder, hsacoImageRef));
}
auto modulesRef = builder.createOffsetVecDestructive(moduleRefs);
// Generate optional per-export debug information.
// May be empty if no debug information was requested.
auto exportDebugInfos =
createExportDefs(serializationOptions.debugLevel, exportOps, builder);
SmallVector<iree_hal_amdgpu_ExportDef_ref_t> exportRefs;
exportRefs.resize(exportOps.size(), 0);
for (auto exportOp : exportOps) {
auto ordinalAttr = exportOp.getOrdinalAttr();
if (!ordinalAttr) {
return mlir::emitError(exportOp.getLoc())
<< "could not compile rocm binary: export op is missing ordinal";
}
int64_t ordinal = ordinalAttr.getInt();
// Symbol names include a `.kd` suffix as that's what HSA expects.
auto symbolNameKd = (exportOp.getName() + ".kd").str();
auto symbolNameRef = builder.createString(symbolNameKd);
iree_hal_amdgpu_Dims_t workgroupSize = {0};
if (auto workgroupSizeAttr = exportOp.getWorkgroupSize()) {
auto workgroupSizeDims = workgroupSizeAttr->getValue();
workgroupSize.x = cast<IntegerAttr>(workgroupSizeDims[0]).getInt();
workgroupSize.y = cast<IntegerAttr>(workgroupSizeDims[1]).getInt();
workgroupSize.z = cast<IntegerAttr>(workgroupSizeDims[2]).getInt();
}
auto layoutAttr = exportOp.getLayoutAttr();
uint32_t constantCount = static_cast<uint32_t>(layoutAttr.getConstants());
SmallVector<iree_hal_amdgpu_BindingBits_enum_t> bindingFlags;
for (auto bindingAttr : layoutAttr.getBindings()) {
iree_hal_amdgpu_BindingBits_enum_t flags = 0;
if (allEnumBitsSet(bindingAttr.getFlags(),
IREE::HAL::DescriptorFlags::ReadOnly)) {
flags |= iree_hal_amdgpu_BindingBits_READ_ONLY;
}
if (allEnumBitsSet(bindingAttr.getFlags(),
IREE::HAL::DescriptorFlags::Indirect)) {
flags |= iree_hal_amdgpu_BindingBits_INDIRECT;
}
bindingFlags.push_back(flags);
}
auto bindingFlagsRef = iree_hal_amdgpu_BindingBits_vec_create(
builder, bindingFlags.data(), bindingFlags.size());
iree_hal_amdgpu_ExportDef_start(builder);
iree_hal_amdgpu_ExportDef_symbol_name_add(builder, symbolNameRef);
iree_hal_amdgpu_ExportDef_workgroup_size_add(builder, &workgroupSize);
iree_hal_amdgpu_ExportDef_constant_count_add(builder, constantCount);
iree_hal_amdgpu_ExportDef_binding_flags_add(builder, bindingFlagsRef);
iree_hal_amdgpu_ExportDef_debug_info_add(builder,
exportDebugInfos[ordinal]);
exportRefs[ordinal] = iree_hal_amdgpu_ExportDef_end(builder);
}
auto exportsRef = builder.createOffsetVecDestructive(exportRefs);
iree_hal_amdgpu_ExecutableDef_exports_add(builder, exportsRef);
iree_hal_amdgpu_ExecutableDef_modules_add(builder, modulesRef);
iree_hal_amdgpu_ExecutableDef_source_files_add(builder, sourceFilesRef);
iree_hal_amdgpu_ExecutableDef_end_as_root(builder);
return builder.getBufferAttr(variantOp.getContext());
}
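// Same as serializeAMDGPUBinaryContainer but targeting the HIP runtime
// schema (iree_hal_hip_ExecutableDef): kernel names without the '.kd'
// suffix, block dims, and per-export binding flags.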
FailureOr<DenseIntElementsAttr>
serializeHIPBinaryContainer(const SerializationOptions &serializationOptions,
IREE::HAL::ExecutableVariantOp variantOp,
ArrayRef<IREE::HAL::ExecutableExportOp> exportOps,
StringRef hsacoModule) {
iree_compiler::FlatbufferBuilder builder;
iree_hal_hip_ExecutableDef_start_as_root(builder);
// Attach embedded source file contents.
auto sourceFilesRef = createSourceFilesVec(
serializationOptions.debugLevel, variantOp.getSourcesAttr(), builder);
// Only a single module today.
SmallVector<iree_hal_hip_ModuleDef_ref_t> moduleRefs;
{
auto hsacoImageRef = flatbuffers_string_create(
builder, hsacoModule.data(), hsacoModule.size());
moduleRefs.push_back(
iree_hal_hip_ModuleDef_create(builder, hsacoImageRef));
}
auto modulesRef = builder.createOffsetVecDestructive(moduleRefs);
// Generate optional per-export debug information.
// May be empty if no debug information was requested.
auto exportDebugInfos =
createExportDefs(serializationOptions.debugLevel, exportOps, builder);
SmallVector<iree_hal_hip_ExportDef_ref_t> exportRefs;
exportRefs.resize(exportOps.size(), 0);
for (auto exportOp : exportOps) {
auto ordinalAttr = exportOp.getOrdinalAttr();
if (!ordinalAttr) {
return mlir::emitError(exportOp.getLoc())
<< "could not compile rocm binary: export op is missing ordinal";
}
int64_t ordinal = ordinalAttr.getInt();
auto kernelNameRef = builder.createString(exportOp.getName());
iree_hal_hip_BlockDims_t blockDims = {0};
if (auto workgroupSizeAttr = exportOp.getWorkgroupSize()) {
auto workgroupSize = workgroupSizeAttr->getValue();
blockDims.x = cast<IntegerAttr>(workgroupSize[0]).getInt();
blockDims.y = cast<IntegerAttr>(workgroupSize[1]).getInt();
blockDims.z = cast<IntegerAttr>(workgroupSize[2]).getInt();
}
auto layoutAttr = exportOp.getLayoutAttr();
uint32_t constantCount = static_cast<uint32_t>(layoutAttr.getConstants());
SmallVector<iree_hal_hip_BindingBits_enum_t> bindingFlags;
for (auto bindingAttr : layoutAttr.getBindings()) {
iree_hal_hip_BindingBits_enum_t flags = 0;
if (allEnumBitsSet(bindingAttr.getFlags(),
IREE::HAL::DescriptorFlags::ReadOnly)) {
flags |= iree_hal_hip_BindingBits_READ_ONLY;
}
if (allEnumBitsSet(bindingAttr.getFlags(),
IREE::HAL::DescriptorFlags::Indirect)) {
flags |= iree_hal_hip_BindingBits_INDIRECT;
}
bindingFlags.push_back(flags);
}
auto bindingFlagsRef = iree_hal_hip_BindingBits_vec_create(
builder, bindingFlags.data(), bindingFlags.size());
iree_hal_hip_ExportDef_start(builder);
iree_hal_hip_ExportDef_module_ordinal_add(builder, 0); // always 0 today
iree_hal_hip_ExportDef_kernel_name_add(builder, kernelNameRef);
iree_hal_hip_ExportDef_block_dims_add(builder, &blockDims);
iree_hal_hip_ExportDef_constant_count_add(builder, constantCount);
iree_hal_hip_ExportDef_binding_flags_add(builder, bindingFlagsRef);
iree_hal_hip_ExportDef_debug_info_add(builder, exportDebugInfos[ordinal]);
exportRefs[ordinal] = iree_hal_hip_ExportDef_end(builder);
}
auto exportsRef = builder.createOffsetVecDestructive(exportRefs);
iree_hal_hip_ExecutableDef_exports_add(builder, exportsRef);
iree_hal_hip_ExecutableDef_modules_add(builder, modulesRef);
iree_hal_hip_ExecutableDef_source_files_add(builder, sourceFilesRef);
iree_hal_hip_ExecutableDef_end_as_root(builder);
return builder.getBufferAttr(variantOp.getContext());
}
private:
const ROCMOptions &options;
};
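// Device target producing #hal.device.target<"amdgpu", ...> with the
// default "rocm" executable targets.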
class AMDGPUTargetDevice final : public TargetDevice {
public:
AMDGPUTargetDevice(const ROCMOptions &options) : options(options) {}
IREE::HAL::DeviceTargetAttr
getDefaultDeviceTarget(MLIRContext *context,
const TargetRegistry &targetRegistry) const override {
Builder b(context);
auto deviceConfigAttr = b.getDictionaryAttr({});
auto executableConfigAttr = b.getDictionaryAttr({});
// If we had multiple target environments we would generate one target attr
// per environment, with each setting its own environment attribute.
SmallVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs;
targetRegistry.getTargetBackend("rocm")->getDefaultExecutableTargets(
context, "amdgpu", executableConfigAttr, executableTargetAttrs);
return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("amdgpu"),
deviceConfigAttr,
executableTargetAttrs);
}
private:
const ROCMOptions &options;
};
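// Device target producing #hal.device.target<"hip", ...> with the default
// "rocm" executable targets.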
class HIPTargetDevice final : public TargetDevice {
public:
HIPTargetDevice(const ROCMOptions &options) : options(options) {}
IREE::HAL::DeviceTargetAttr
getDefaultDeviceTarget(MLIRContext *context,
const TargetRegistry &targetRegistry) const override {
Builder b(context);
auto deviceConfigAttr = b.getDictionaryAttr({});
auto executableConfigAttr = b.getDictionaryAttr({});
// If we had multiple target environments we would generate one target attr
// per environment, with each setting its own environment attribute.
SmallVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs;
targetRegistry.getTargetBackend("rocm")->getDefaultExecutableTargets(
context, "hip", executableConfigAttr, executableTargetAttrs);
return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("hip"),
deviceConfigAttr,
executableTargetAttrs);
}
private:
const ROCMOptions &options;
};
namespace {
struct ROCMSession final
: PluginSession<ROCMSession, ROCMOptions,
PluginActivationPolicy::DefaultActivated> {
static void registerPasses() { IREE::ROCM::registerROCMTargetPasses(); }
void onRegisterDialects(DialectRegistry &registry) {
registry.insert<IREE::ROCM::ROCMDialect>();
}
void populateHALTargetDevices(IREE::HAL::TargetDeviceList &targets) {
// #hal.device.target<"amdgpu", ...
targets.add("amdgpu", [&]() {
return std::make_shared<AMDGPUTargetDevice>(options);
});
// #hal.device.target<"hip", ...
targets.add("hip",
[&]() { return std::make_shared<HIPTargetDevice>(options); });
}
void populateHALTargetBackends(IREE::HAL::TargetBackendList &targets) {
// #hal.executable.target<"rocm", ...
targets.add("rocm", [&]() {
LLVMInitializeAMDGPUTarget();
LLVMInitializeAMDGPUTargetMC();
LLVMInitializeAMDGPUTargetInfo();
LLVMInitializeAMDGPUAsmParser();
LLVMInitializeAMDGPUAsmPrinter();
return std::make_shared<ROCMTargetBackend>(options);
});
}
};
// Iterate over ukernel bitcode embedded-data files, and insert them into the
// EmbeddedDataDirectory singleton.
static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() {
EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) {
const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create();
for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) {
dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size});
}
});
}
} // namespace
} // namespace mlir::iree_compiler::IREE::HAL
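// Plugin entry point: registers the ROCm HAL target plugin and exposes the
// embedded AMDGPU ukernel bitcode through the global EmbeddedDataDirectory.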
extern "C" bool iree_register_compiler_plugin_hal_target_rocm(
mlir::iree_compiler::PluginRegistrar *registrar) {
registrar->registerPlugin<mlir::iree_compiler::IREE::HAL::ROCMSession>(
"hal_target_rocm");
mlir::iree_compiler::IREE::HAL::
addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory();
return true;
}
IREE_DEFINE_COMPILER_OPTION_FLAGS(mlir::iree_compiler::IREE::HAL::ROCMOptions);