| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "ROCMTargetUtils.h" |
| |
| #include <cstdint> |
| |
| #include "compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.h" |
| #include "compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/Passes.h" |
| #include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h" |
| #include "iree/compiler/Codegen/Common/Passes.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h" |
| #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" |
| #include "iree/compiler/Codegen/LLVMGPU/Passes.h" |
| #include "iree/compiler/Codegen/Utils/GPUUtils.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" |
| #include "iree/compiler/Dialect/HAL/IR/HALOps.h" |
| #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h" |
| #include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h" |
| #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" |
| #include "iree/compiler/PluginAPI/Client.h" |
| #include "iree/compiler/Utils/EmbeddedDataDirectory.h" |
| #include "iree/compiler/Utils/FlatbufferUtils.h" |
| #include "iree/compiler/Utils/ToolUtils.h" |
| #include "iree/schemas/amdgpu_executable_def_builder.h" |
| #include "iree/schemas/hip_executable_def_builder.h" |
| #include "llvm/ADT/StringExtras.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Bitcode/BitcodeWriter.h" |
| #include "llvm/IR/LegacyPassManager.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/MC/TargetRegistry.h" |
| #include "llvm/Passes/PassBuilder.h" |
| #include "llvm/Passes/StandardInstrumentations.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include "llvm/Support/TargetSelect.h" |
| #include "llvm/Target/TargetMachine.h" |
| #include "llvm/Transforms/Utils/Cloning.h" |
| #include "mlir/Dialect/AMDGPU/Utils/Chipset.h" |
| #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| #include "mlir/IR/Attributes.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Support/LogicalResult.h" |
| #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" |
| #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" |
| #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" |
| #include "mlir/Target/LLVMIR/Export.h" |
| |
| namespace mlir::iree_compiler::IREE::HAL { |
| |
| namespace { |
| |
| enum class ContainerType { |
| // Automatically detect the container type from the target ABI attribute. |
| Auto, |
| // HIP ExecutableDef flatbuffer. |
| HIP, |
| // AMDGPU ExecutableDef flatbuffer. |
| AMDGPU, |
| // Raw HSACO image (ELF). |
| HSACO, |
| }; |
| |
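| // Options controlling the ROCM/HIP HAL target backend. An illustrative
| // invocation combining flags declared below (values are examples only):
| //   --iree-hip-target=gfx942 --iree-hip-target-features=+xnack,+sramecc
| //   --iree-hip-encoding-layout-resolver=data-tiling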
| // TODO(#18792): rename flags back to iree-rocm- as they are not HIP-specific. |
| struct ROCMOptions { |
| std::string target = ""; |
| std::string targetFeatures = ""; |
| ContainerType containerType = ContainerType::Auto; |
| std::string bitcodeDirectory = getDefaultBitcodeDirectory(); |
| int wavesPerEu = 0; |
| std::string enableROCMUkernels = "none"; |
| std::string encodingLayoutResolver = GPU::kNoEncodingLayoutResolverName; |
| bool slpVectorization = true; |
| bool globalISel = false; |
| |
| bool specializeDispatches = false; |
| bool enableTensorUKernels = false; |
| |
| void bindOptions(OptionsBinder &binder) { |
| using namespace llvm; |
| static cl::OptionCategory category("HIP HAL Target"); |
| |
| binder.opt<std::string>( |
| "iree-hip-target", target, cl::cat(category), |
| cl::desc( |
| // clang-format off |
| "HIP target as expected by LLVM AMDGPU backend; e.g., " |
| "'gfx90a'/'gfx942' for targeting MI250/MI300 GPUs. " |
| "Additionally this also supports architecture code names like " |
| "'cdna3'/'rdna3' or some product names like 'mi300x'/'rtx7900xtx' " |
| "for a better experience. See " |
| "https://iree.dev/guides/deployment-configurations/gpu-rocm/ " |
| "for more details." |
| // clang-format on |
| )); |
| |
| binder.opt<std::string>( |
| "iree-hip-target-features", targetFeatures, cl::cat(category), |
| cl::desc("HIP target features as expected by LLVM AMDGPU backend; " |
| "e.g., '+sramecc,+xnack'.")); |
| |
| binder.opt<ContainerType>( |
| "iree-rocm-container-type", containerType, |
| llvm::cl::desc("Serialized executable container type."), |
| llvm::cl::cat(category), |
| llvm::cl::values(clEnumValN(ContainerType::Auto, "auto", |
| "Automatically detect the container type " |
| "from the target ABI attribute."), |
| clEnumValN(ContainerType::HIP, "hip", |
| "HIP ExecutableDef flatbuffer."), |
| clEnumValN(ContainerType::AMDGPU, "amdgpu", |
| "AMDGPU ExecutableDef flatbuffer."), |
| clEnumValN(ContainerType::HSACO, "hsaco", |
| "Raw HSACO image (ELF)."))); |
| |
| binder.opt<std::string>("iree-hip-bc-dir", bitcodeDirectory, |
| cl::cat(category), |
| cl::desc("Directory of HIP Bitcode.")); |
| |
| binder.opt<int>("iree-hip-waves-per-eu", wavesPerEu, cl::cat(category), |
| cl::desc("Optimization hint specifying minimum " |
| "number of waves per execution unit.")); |
| |
| binder.opt<std::string>( |
| "iree-hip-enable-ukernels", enableROCMUkernels, cl::cat(category), |
| cl::desc("Enables microkernels in the HIP compiler backend. May be " |
| "`default`, `none`, `all`, or a comma-separated list of " |
| "specific unprefixed microkernels to enable, e.g. `mmt4d`.")); |
| binder.opt<std::string>( |
| "iree-hip-encoding-layout-resolver", encodingLayoutResolver, |
| cl::cat(category), |
| cl::desc("Selects the way that encodings will be " |
| "resolved. Options are: `none` (resolve to " |
| "identity layout), `pad` (additional padding " |
| "on allocations to maximize cache bandwidth), " |
| "and `data-tiling` (enable data tiled layouts)")); |
| |
| binder.opt<bool>("iree-hip-llvm-slp-vec", slpVectorization, |
| cl::cat(category), |
| cl::desc("Enable slp vectorization in llvm opt.")); |
| binder.opt<bool>("iree-hip-llvm-global-isel", globalISel, cl::cat(category), |
| cl::desc("Enable global instruction selection in llvm.")); |
| |
| binder.opt<bool>( |
| "iree-hip-specialize-dispatches", specializeDispatches, |
| cl::cat(category), |
| cl::desc( |
| "Enable runtime specialization of dynamically shaped dispatches.")); |
| binder.opt<bool>("iree-hip-enable-tensor-ukernels", enableTensorUKernels, |
| cl::cat(category), |
| cl::desc("Enable MLIR-based ukernels.")); |
| } |
| |
| LogicalResult verify(mlir::Builder &builder) const { |
| if (target.empty()) { |
| return emitError(builder.getUnknownLoc()) |
| << "HIP target not set; did you forget to pass " |
| "'--iree-hip-target'?"; |
| } |
| if (GPU::normalizeHIPTarget(target).empty()) { |
| return emitError(builder.getUnknownLoc(), "Unknown HIP target '") |
| << target << "'"; |
| } |
| SmallVector<StringRef> features; |
| llvm::SplitString(targetFeatures, features, ","); |
| for (StringRef f : features) { |
| if (!(f.starts_with("+") || f.starts_with("-"))) { |
| return emitError(builder.getUnknownLoc(), |
| "HIP target feature must be prefixed with '+' or " |
| "'-'; but seen '") |
| << f << "'"; |
| } |
| StringRef feature = f.substr(1); |
| if (feature != "sramecc" && feature != "xnack") { |
| // We only support setting these two features explicitly. Features
| // like wavefrontsize are controlled and tuned by the compiler.
| return emitError(builder.getUnknownLoc(), |
| "HIP target feature can only be 'sramecc' or " |
| "'xnack'; but seen '") |
| << feature << "'"; |
| } |
| } |
| return success(); |
| } |
| |
| private: |
| static std::string getDefaultBitcodeDirectory() { |
| return mlir::iree_compiler::findPlatformLibDirectory("rocm"); |
| } |
| }; |
| |
| // Returns the ABI or an empty string if unspecified. |
| static StringRef getABI(IREE::HAL::ExecutableTargetAttr targetAttr) { |
| if (targetAttr) { |
| if (auto config = targetAttr.getConfiguration()) { |
| auto abiAttr = config.getAs<StringAttr>("abi");
| return abiAttr ? abiAttr.getValue() : ""; |
| } |
| } |
| return ""; |
| } |
| |
| static void dumpModuleToPath(StringRef path, StringRef baseName, |
| StringRef suffix, StringRef extension, |
| llvm::Module &module, StringRef header = {}) { |
| llvm::SmallVector<char, 0> data; |
| llvm::raw_svector_ostream ostream(data); |
| ostream << header; |
| module.print(ostream, nullptr); |
| dumpDataToPath(path, baseName, suffix, extension, |
| StringRef(data.data(), data.size())); |
| } |
| |
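| // Runs the LLVM AMDGPU codegen pipeline over |module| with |targetMachine|
| // and returns the resulting object file bytes as a string.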
| static std::string translateModuleToObj(llvm::Module &module, |
| llvm::TargetMachine &targetMachine) { |
| std::string targetObj; |
| { |
| llvm::raw_string_ostream stream(targetObj); |
| llvm::buffer_ostream pstream(stream); |
| llvm::legacy::PassManager codegenPasses; |
| targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, |
| llvm::CodeGenFileType::ObjectFile); |
| codegenPasses.run(module); |
| } |
| return targetObj; |
| } |
| |
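| // Runs the LLVM AMDGPU codegen pipeline over |module| with |targetMachine|
| // and returns the textual ISA (assembly), used when dumping .rocmasm
| // intermediates.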
| static std::string translateModuleToISA(llvm::Module &module, |
| llvm::TargetMachine &targetMachine) { |
| std::string targetISA; |
| { |
| llvm::raw_string_ostream stream(targetISA); |
| llvm::buffer_ostream pstream(stream); |
| llvm::legacy::PassManager codegenPasses; |
| targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, |
| llvm::CodeGenFileType::AssemblyFile); |
| codegenPasses.run(module); |
| } |
| return targetISA; |
| } |
| |
| } // namespace |
| |
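| // HAL target backend that compiles executable variants through the LLVM
| // AMDGPU backend into HSACO images and serializes them into HIP or AMDGPU
| // ExecutableDef flatbuffers (or emits the raw HSACO image, per
| // --iree-rocm-container-type).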
| class ROCMTargetBackend final : public TargetBackend { |
| public: |
| ROCMTargetBackend(const ROCMOptions &options) : options(options) {} |
| |
| std::string getLegacyDefaultDeviceID() const override { return "hip"; } |
| |
| void getDefaultExecutableTargets( |
| MLIRContext *context, StringRef deviceID, DictionaryAttr deviceConfigAttr, |
| SmallVectorImpl<IREE::HAL::ExecutableTargetAttr> &executableTargetAttrs) |
| const override { |
| if (auto target = getExecutableTarget(deviceID, context)) { |
| executableTargetAttrs.push_back(target); |
| } |
| } |
| |
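| // Builds the #hal.executable.target attribute for |deviceID| ("hip" or
| // "amdgpu"), populating its configuration with the GPU target details,
| // encoding resolver, default tuning spec, and ukernel settings.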
| IREE::HAL::ExecutableTargetAttr |
| getExecutableTarget(StringRef deviceID, MLIRContext *context) const { |
| Builder b(context); |
| SmallVector<NamedAttribute, 4> configItems; |
| auto addConfig = [&](StringRef name, Attribute value) { |
| configItems.emplace_back(name, value); |
| }; |
| |
| if (failed(options.verify(b))) { |
| return nullptr; |
| } |
| |
| addConfig("abi", b.getStringAttr(deviceID)); |
| std::string format; |
| if (deviceID == "amdgpu") { |
| format = options.target; |
| } else { |
| format = "rocm-hsaco-fb"; // legacy HIP |
| } |
| |
| if (auto target = GPU::getHIPTargetDetails( |
| options.target, options.targetFeatures, context)) { |
| addConfigGPUTarget(context, target, configItems); |
| if (options.encodingLayoutResolver != |
| GPU::kNoEncodingLayoutResolverName) { |
| if (Attribute encoding = GPU::getHIPTargetEncodingLayoutAttr( |
| target, options.encodingLayoutResolver)) { |
| addConfig(IREE::Encoding::kEncodingResolverAttrName, encoding); |
| } |
| } |
| |
| // Look for a default tuning spec. |
| auto rocmDialect = context->getOrLoadDialect<IREE::ROCM::ROCMDialect>(); |
| // First check for a spec based on the sku. |
| std::optional<std::string> maybeSpecName = std::nullopt; |
| if (IREE::GPU::TargetChipAttr chip = target.getChip()) { |
| if (StringAttr sku = chip.getSku()) { |
| std::string specName = |
| llvm::formatv("iree_default_tuning_spec_{}.mlir", sku.strref()); |
| if (rocmDialect->hasBuiltin(specName)) {
| maybeSpecName = specName; |
| } |
| } |
| } |
| |
| // Then, if none was found, look for one based on the target arch. |
| if (!maybeSpecName) { |
| std::string specName = |
| llvm::formatv("iree_default_tuning_spec_{}.mlir", target.getArch()); |
| if (rocmDialect->hasBuiltin(specName)) { |
| maybeSpecName = specName; |
| } |
| } |
| |
| if (maybeSpecName) { |
| addConfig("iree_codegen.default_tuning_spec", |
| IREE::ROCM::BuiltinTuningModuleAttr::get( |
| context, maybeSpecName.value())); |
| } |
| } |
| |
| addConfig("ukernels", b.getStringAttr(options.enableROCMUkernels)); |
| if (options.enableROCMUkernels != "none") { |
| addConfig("iree_codegen.ukernel_provider", |
| IREE::ROCM::UKernelProviderAttr::get(context)); |
| } |
| if (options.wavesPerEu > 0) { |
| addConfigWavesPerEu(b.getContext(), options.wavesPerEu, configItems); |
| } |
| |
| if (options.enableTensorUKernels) { |
| addConfig(kUKernelProviderName, |
| IREE::Codegen::SymbolicUKernelProviderAttr::get(context)); |
| } |
| |
| return b.getAttr<IREE::HAL::ExecutableTargetAttr>( |
| b.getStringAttr("rocm"), b.getStringAttr(format), |
| b.getDictionaryAttr(configItems)); |
| } |
| |
| void getDependentDialects(DialectRegistry ®istry) const override { |
| mlir::registerBuiltinDialectTranslation(registry); |
| mlir::registerLLVMDialectTranslation(registry); |
| mlir::registerROCDLDialectTranslation(registry); |
| registry.insert<IREE::Codegen::IREECodegenDialect>(); |
| registry.insert<IREE::VectorExt::IREEVectorExtDialect>(); |
| registry.insert<IREE::GPU::IREEGPUDialect>(); |
| registry.insert<IREE::ROCM::ROCMDialect>(); |
| // Configuration may load and manipulate transform dialect libraries. |
| registerTransformDialectTranslationDependentDialects(registry); |
| } |
| |
| void |
| buildConfigurationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr, |
| OpPassManager &passManager) override { |
| if (options.specializeDispatches) { |
| if (auto attr = getGPUTargetAttr(targetAttr.getContext(), targetAttr)) { |
| ROCM::ApplyBuiltinPDLPatternsPassOptions patternOptions;
| patternOptions.enableSpecialization = true;
| if (IREE::GPU::TargetChipAttr chip = attr.getChip()) {
| if (StringAttr sku = chip.getSku()) {
| patternOptions.targets.push_back(sku.str());
| }
| }
| patternOptions.targets.push_back(attr.getArch().str());
| OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
| FunctionLikeNest(modulePassManager).addPass([&]() {
| return ROCM::createApplyBuiltinPDLPatternsPass(patternOptions);
| });
| } |
| } |
| if (options.enableTensorUKernels) { |
| if (auto attr = getGPUTargetAttr(targetAttr.getContext(), targetAttr)) { |
| ROCM::ApplyBuiltinPDLPatternsPassOptions patternOptions;
| patternOptions.enableTensorUKernels = true;
| if (IREE::GPU::TargetChipAttr chip = attr.getChip()) {
| if (StringAttr sku = chip.getSku()) {
| patternOptions.targets.push_back(sku.str());
| }
| }
| patternOptions.targets.push_back(attr.getArch().str());
| OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
| FunctionLikeNest(modulePassManager).addPass([&]() {
| return ROCM::createApplyBuiltinPDLPatternsPass(patternOptions);
| });
| } |
| } |
| passManager.addPass(createSpecializeExportsPass()); |
| buildLLVMGPUCodegenCommonConfigurationPassPipeline(passManager); |
| OpPassManager &modulePassManager = passManager.nest<ModuleOp>(); |
| if (options.enableTensorUKernels) { |
| modulePassManager.addPass( |
| IREE::ROCM::createApplyBuiltinPDLPatternsDriverPass()); |
| } |
| modulePassManager.addPass(createMaterializeTuningSpecsPass()); |
| modulePassManager.addPass(createMaterializeUserConfigsPass()); |
| modulePassManager.addPass(createLLVMGPUSelectLoweringStrategyPass()); |
| } |
| |
| void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr, |
| OpPassManager &passManager) override { |
| buildLLVMGPUCodegenPassPipeline(passManager, /*useROCM=*/true);
| } |
| |
| void buildLinkingPassPipeline(OpPassManager &passManager) override { |
| buildLLVMGPULinkingPassPipeline(passManager, "rocm"); |
| } |
| |
| // Performs optimizations on |module| (including LTO-style whole-program |
| // ones). Inspired by the corresponding code in
| // https://github.com/iree-org/iree/blob/main/compiler/plugins/target/CUDA/CUDATarget.cpp |
| static void optimizeModule(llvm::Module &module, |
| llvm::TargetMachine &targetMachine, |
| bool slpVectorization, |
| std::string &outPassesString) { |
| llvm::LoopAnalysisManager lam; |
| llvm::FunctionAnalysisManager fam; |
| llvm::CGSCCAnalysisManager cgam; |
| llvm::ModuleAnalysisManager mam; |
| |
| fam.registerPass([&] { return targetMachine.getTargetIRAnalysis(); }); |
| |
| llvm::PipelineTuningOptions pto; |
| pto.SLPVectorization = slpVectorization; |
| |
| llvm::PassInstrumentationCallbacks pic; |
| |
| llvm::StandardInstrumentations si(module.getContext(),
| /*DebugLogging=*/false);
| si.registerCallbacks(pic, &mam); |
| |
| llvm::PassBuilder pb(&targetMachine, pto, std::nullopt, &pic); |
| llvm::ModulePassManager mpm; |
| pb.registerModuleAnalyses(mam); |
| pb.registerCGSCCAnalyses(cgam); |
| pb.registerFunctionAnalyses(fam); |
| pb.registerLoopAnalyses(lam); |
| pb.crossRegisterProxies(lam, fam, cgam, mam); |
| |
| llvm::OptimizationLevel ol = llvm::OptimizationLevel::O2; |
| |
| mpm.addPass(llvm::VerifierPass()); |
| mpm.addPass(pb.buildPerModuleDefaultPipeline(ol)); |
| mpm.addPass(llvm::VerifierPass()); |
| llvm::raw_string_ostream os(outPassesString); |
| mpm.printPipeline(os, [&pic](StringRef className) { |
| auto passName = pic.getPassNameForClassName(className); |
| return passName.empty() ? className : passName; |
| }); |
| mpm.run(module, mam); |
| } |
| |
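| // Checks that the finalized module has no live references to unresolved
| // external functions; a remaining declaration with users typically means a
| // bitcode library or ukernel failed to link.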
| LogicalResult |
| validateFinalizedModule(IREE::HAL::ExecutableVariantOp variantOp, |
| llvm::Module &module) { |
| for (llvm::Function &func : module.functions()) { |
| if (func.isDeclaration() && !func.isIntrinsic() && !func.use_empty()) { |
| llvm::User *liveUser = *func.user_begin(); |
| return variantOp.emitError() |
| << "found an unresolved external function '" << func.getName() |
| << "' in the final bitcode. A remaining live user is\n" |
| << llvm::formatv("{}", *liveUser); |
| } |
| } |
| return success(); |
| } |
| |
| LogicalResult |
| serializeExecutable(const SerializationOptions &serializationOptions, |
| IREE::HAL::ExecutableVariantOp variantOp, |
| OpBuilder &executableBuilder) override { |
| ModuleOp innerModuleOp = variantOp.getInnerModule(); |
| auto targetAttr = variantOp.getTargetAttr(); |
| StringRef targetArch = options.target; |
| StringRef targetFeatures = options.targetFeatures; |
| if (auto attr = getGPUTargetAttr(variantOp.getContext(), targetAttr)) { |
| targetArch = attr.getArch(); |
| targetFeatures = attr.getFeatures(); |
| } |
| |
| // We name our files after the executable name so that they are easy to
| // track during compilation (logs/artifacts/etc), as outputs (final
| // intermediate code/binary files), and at runtime (loaded
| // libraries/symbols/etc).
| const std::string libraryName = |
| variantOp->getParentOfType<IREE::HAL::ExecutableOp>().getName().str(); |
| |
| // Collect all the entry point names. |
| auto exportOps = llvm::to_vector_of<IREE::HAL::ExecutableExportOp>( |
| variantOp.getExportOps()); |
| std::optional<uint32_t> subgroupSize; |
| for (IREE::HAL::ExecutableExportOp exportOp : exportOps) { |
| // TODO: put this either on the variant or propagate as a function |
| // attribute instead - today this *must* be consistent across all exports |
| // and it shouldn't need to be. |
| if (auto setSubgroupSize = exportOp.getSubgroupSizeAsUInt()) { |
| if (setSubgroupSize.value() != 32 && setSubgroupSize.value() != 64) { |
| return variantOp.emitError() |
| << "invalid subgroup size " << setSubgroupSize.value(); |
| } |
| if (subgroupSize.has_value() && |
| setSubgroupSize.value() != subgroupSize.value()) { |
| return variantOp.emitError() |
| << "multiple exports with different subgroup sizes; this is a " |
| "limitation of the IREE compilation process and should be " |
| "fixed"; |
| } |
| subgroupSize = setSubgroupSize.value(); |
| } |
| } |
| |
| std::string targetHSACO; |
| if (variantOp.isExternal()) { |
| if (!variantOp.getObjects().has_value()) { |
| return variantOp.emitOpError() |
| << "no objects defined for external variant"; |
| } |
| |
| if (variantOp.getObjects()->getValue().size() != 1) { |
| // For now we assume there will be exactly one object file. |
| // In the future we will want to perform a linking step here and ideally |
| // support _also_ linking in the codegen results. |
| return variantOp.emitOpError() << "only one object reference is " |
| "supported for external variants"; |
| } |
| |
| // Read the HSACO from the object file. |
| auto objectAttr = llvm::cast<IREE::HAL::ExecutableObjectAttr>( |
| variantOp.getObjects()->getValue().front()); |
| if (auto data = objectAttr.loadData()) { |
| targetHSACO = data.value(); |
| } else { |
| return variantOp.emitOpError() |
| << "object file could not be loaded: " << objectAttr; |
| } |
| } else { |
| auto maybeChipset = amdgpu::Chipset::parse(targetArch); |
| if (failed(maybeChipset)) { |
| return variantOp.emitOpError() |
| << "could not parse AMDGPU chipset name '" << targetArch << "'"; |
| } |
| amdgpu::Chipset chipset = *maybeChipset; |
| // Perform the translation in a separate context to avoid any |
| // multi-threading issues. |
| llvm::LLVMContext context; |
| std::unique_ptr<llvm::Module> llvmModule = |
| mlir::translateModuleToLLVMIR(innerModuleOp, context, libraryName); |
| if (!llvmModule) { |
| return variantOp.emitError() << "failed to translate the MLIR LLVM " |
| "dialect to the native llvm::Module"; |
| } |
| |
| for (auto func : innerModuleOp.getOps<LLVM::LLVMFuncOp>()) { |
| llvm::Function *llvmFunc = llvmModule->getFunction(func.getName()); |
| if (llvmFunc->isDeclaration()) |
| continue; |
| |
| // Override flags as given by target func attrs. |
| if (auto funcAttrs = |
| func->getAttrOfType<DictionaryAttr>("llvm_func_attrs")) { |
| for (NamedAttribute funcAttr : funcAttrs) { |
| auto value = dyn_cast<StringAttr>(funcAttr.getValue()); |
| if (!value) { |
| return variantOp->emitError() |
| << "llvm_func_attrs attribute must be a dictionary of " |
| "strings. Attribute " |
| << funcAttr.getName() << " is not a StringAttr."; |
| } |
| llvmFunc->addFnAttr(funcAttr.getName(), value.getValue()); |
| } |
| } |
| } |
| |
| std::unique_ptr<llvm::TargetMachine> targetMachine; |
| bool isWave64 = true; |
| { |
| llvm::Triple triple("amdgcn-amd-amdhsa"); |
| std::string error; |
| const llvm::Target *target = |
| llvm::TargetRegistry::lookupTarget("", triple, error); |
| if (!target) { |
| return variantOp.emitError() << "cannot initialize target triple"; |
| } |
| llvm::TargetOptions opt; |
| opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; |
| opt.UnsafeFPMath = false; |
| opt.NoInfsFPMath = false; |
| opt.NoNaNsFPMath = true; |
| // Be extra cautious while this is less tested, and prevent unknown
| // fallbacks from global isel.
| //
| // When GlobalISelAbort is set, any failure in GlobalISel (whether from an
| // unimplemented case or from incorrect IR) immediately aborts compilation.
| // This disables the fallback path in AMDGPUPassConfig::addInstSelector and
| // two legacy passes that could otherwise work around unimplemented cases or
| // errors in GlobalISel, yielding a successful compilation while making one
| // assume the results came from GlobalISel when they did not.
| opt.EnableGlobalISel = options.globalISel; |
| opt.GlobalISelAbort = options.globalISel |
| ? llvm::GlobalISelAbortMode::Enable |
| : llvm::GlobalISelAbortMode::Disable; |
| SmallVector<std::string> features; |
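| // gfx10 through gfx12 (RDNA) targets support both wave32 and wave64; pick
| // the wavefront size from the export subgroup size, defaulting to wave64.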
| if (chipset.majorVersion >= 10 && chipset.majorVersion <= 12) { |
| switch (subgroupSize.value_or(64)) { |
| case 32: |
| isWave64 = false; |
| features.emplace_back("+wavefrontsize32"); |
| break; |
| default: |
| case 64: |
| features.emplace_back("+wavefrontsize64"); |
| break; |
| } |
| } |
| |
| // Mixed-precision fma instructions have complicated semantics on
| // gfx9+ GPUs and can lead to numeric issues, as seen in
| // https://github.com/iree-org/iree/issues/18746, so we disable the
| // feature.
| if (targetArch.starts_with("gfx9")) { |
| features.emplace_back("-fma-mix-insts"); |
| } |
| |
| if (!targetFeatures.empty()) { |
| features.emplace_back(targetFeatures.str()); |
| } |
| |
| std::string featureStr = llvm::join(features, ","); |
| |
| targetMachine.reset(target->createTargetMachine( |
| triple, targetArch, featureStr, opt, llvm::Reloc::Model::PIC_, |
| std::nullopt, llvm::CodeGenOptLevel::Aggressive)); |
| |
| if (!targetMachine) { |
| return variantOp.emitError() << "cannot initialize target machine"; |
| } |
| } |
| |
| llvmModule->setDataLayout(targetMachine->createDataLayout()); |
| |
| // Code object version * 100. |
| constexpr uint32_t abiVersion = 500; |
| // Let the backend know what code object version we're compiling for. This |
| // insulates us from changes to the default code object version that our |
| // CI or users may not be prepared for. |
| llvmModule->addModuleFlag(llvm::Module::Error, |
| "amdhsa_code_object_version", abiVersion); |
| for (llvm::Function &f : llvmModule->functions()) |
| f.addFnAttr(llvm::Attribute::AlwaysInline); |
| |
| // Link user-provided modules. |
| llvm::Linker linker(*llvmModule); |
| if (failed(linkCmdlineBitcodeFiles( |
| variantOp.getLoc(), linker, llvm::Linker::OverrideFromSrc, |
| *targetMachine, llvmModule->getContext()))) { |
| return failure(); |
| } |
| |
| // Link bitcode (*.bc) object attrs specified by the input program. |
| // Note that this happens after the command-line files so that the command |
| // line ones override the symbols coming from the embedded files. |
| auto specializationCallback = [&](llvm::Module &userModule) { |
| // TODO: inject __nvvm_reflect-style functions/globals for |
| // bitcode specialization based on the targetMachine and configuration. |
| // These could use any information we have on the IREE side as well as |
| // the TargetMachine. |
| }; |
| unsigned linkerFlags = |
| llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc; |
| if (failed(linkBitcodeObjects(variantOp.getLoc(), linker, linkerFlags, |
| *targetMachine, variantOp.getObjectsAttr(), |
| llvmModule->getContext(), |
| specializationCallback))) { |
| return mlir::emitError(variantOp.getLoc())
| << "failed linking in user objects for target architecture '"
| << targetArch.str() << "'";
| } |
| |
| // Link module to HIP device library. |
| if (options.bitcodeDirectory.empty()) { |
| return variantOp.emitError()
| << "cannot find ROCM bitcode files. Check your installation "
| "consistency and, in the worst case, set "
| "--iree-hip-bc-dir= to a path on your system.";
| } |
| if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(), |
| targetArch, |
| options.bitcodeDirectory))) { |
| return failure(); |
| } |
| |
| // Set HIP platform globals based on the target architecture.
| if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset, |
| isWave64, abiVersion))) { |
| return failure(); |
| } |
| |
| if (!serializationOptions.dumpIntermediatesPath.empty()) { |
| dumpModuleToPath(serializationOptions.dumpIntermediatesPath, |
| serializationOptions.dumpBaseName, variantOp.getName(), |
| ".linked.ll", *llvmModule); |
| } |
| |
| // For example, 'gfx942'.
| StringRef targetCPU = targetMachine->getTargetCPU();
| 
| // For example, 'amdgcn-amd-amdhsa'.
| std::string targetTriple = targetMachine->getTargetTriple().str(); |
| |
| // Run LLVM optimization passes. |
| std::string passesString; |
| optimizeModule(*llvmModule, *targetMachine, options.slpVectorization, |
| passesString); |
| if (!serializationOptions.dumpIntermediatesPath.empty()) { |
| // Additional context on the '-mcpu' flag is in PR comments; see for example:
| // https://github.com/iree-org/iree/pull/20716#issuecomment-2851650421
| std::string header = |
| llvm::formatv(R"TXT( |
| ; To reproduce the .optimized.ll from the .linked.ll, run: |
| ; opt -S -mtriple={} -mcpu={} --passes='{}' |
| ; The '-S' flag emits textual LLVM IR.
| ; The behavior of some passes depends on '-mtriple' and '-mcpu'.
| |
| )TXT", |
| targetTriple, targetCPU, passesString); |
| |
| dumpModuleToPath(serializationOptions.dumpIntermediatesPath, |
| serializationOptions.dumpBaseName, variantOp.getName(), |
| ".optimized.ll", *llvmModule, header); |
| } |
| |
| if (failed(validateFinalizedModule(variantOp, *llvmModule))) { |
| return failure(); |
| } |
| |
| // Dump the assembly output. |
| if (!serializationOptions.dumpIntermediatesPath.empty()) { |
| auto moduleCopy = llvm::CloneModule(*llvmModule); |
| if (!moduleCopy) { |
| llvm::errs() << "Error: cloning LLVM IR failed\n"; |
| return failure(); |
| } |
| std::string targetISA = |
| translateModuleToISA(*moduleCopy.get(), *targetMachine); |
| dumpDataToPath(serializationOptions.dumpIntermediatesPath, |
| serializationOptions.dumpBaseName, variantOp.getName(), |
| ".rocmasm", targetISA); |
| } |
| |
| // Serialize the HSACO module into the binary that we will embed in the
| // final FlatBuffer.
| std::string targetObj = translateModuleToObj(*llvmModule, *targetMachine); |
| targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName); |
| if (targetHSACO.empty()) |
| return failure(); |
| } |
| |
| if (!serializationOptions.dumpBinariesPath.empty()) { |
| dumpDataToPath(serializationOptions.dumpBinariesPath, |
| serializationOptions.dumpBaseName, variantOp.getName(), |
| ".hsaco", targetHSACO); |
| } |
| |
| // Determine container type from the target ABI attribute. |
| ContainerType containerType = options.containerType; |
| if (containerType == ContainerType::Auto) { |
| if (getABI(targetAttr) == "amdgpu") { |
| containerType = ContainerType::AMDGPU; |
| } else { |
| containerType = ContainerType::HIP; |
| } |
| } |
| |
| // Wrap the HSACO ELF binary in the requested container type (if any). |
| FailureOr<DenseIntElementsAttr> binaryContainer; |
| switch (containerType) { |
| case ContainerType::Auto: { |
| // Resolved above; unreachable. Fall-through to error case. |
| assert(false && "auto container type must have resolved earlier"); |
| break; |
| } |
| case ContainerType::AMDGPU: { |
| binaryContainer = serializeAMDGPUBinaryContainer( |
| serializationOptions, variantOp, exportOps, targetHSACO); |
| break; |
| } |
| case ContainerType::HIP: { |
| binaryContainer = serializeHIPBinaryContainer( |
| serializationOptions, variantOp, exportOps, targetHSACO); |
| break; |
| } |
| case ContainerType::HSACO: { |
| SmallVector<uint8_t> image; |
| image.resize(targetHSACO.size()); |
| std::memcpy(image.data(), targetHSACO.data(), image.size()); |
| binaryContainer = DenseIntElementsAttr::get( |
| VectorType::get({static_cast<int64_t>(targetHSACO.size())}, |
| executableBuilder.getI8Type()), |
| image); |
| break; |
| } |
| } |
| if (failed(binaryContainer) || !binaryContainer.value()) { |
| return failure(); |
| } |
| |
| // Add the binary data to the target executable. |
| iree_compiler::IREE::HAL::ExecutableBinaryOp::create( |
| executableBuilder, variantOp.getLoc(), variantOp.getSymName(), |
| variantOp.getTarget().getFormat(), binaryContainer.value()); |
| |
| return success(); |
| } |
| |
| protected: |
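| // Wraps the HSACO image in an iree_hal_amdgpu_ExecutableDef flatbuffer with
| // per-export metadata (symbol name with the '.kd' suffix, workgroup size,
| // constant count, binding flags, and optional debug info).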
| FailureOr<DenseIntElementsAttr> serializeAMDGPUBinaryContainer( |
| const SerializationOptions &serializationOptions, |
| IREE::HAL::ExecutableVariantOp variantOp, |
| ArrayRef<IREE::HAL::ExecutableExportOp> exportOps, |
| StringRef hsacoModule) { |
| iree_compiler::FlatbufferBuilder builder; |
| iree_hal_amdgpu_ExecutableDef_start_as_root(builder); |
| |
| // Attach embedded source file contents. |
| auto sourceFilesRef = createSourceFilesVec( |
| serializationOptions.debugLevel, variantOp.getSourcesAttr(), builder); |
| |
| // Only a single module today. |
| SmallVector<iree_hal_amdgpu_ModuleDef_ref_t> moduleRefs; |
| { |
| auto hsacoImageRef = flatbuffers_string_create( |
| builder, hsacoModule.data(), hsacoModule.size()); |
| moduleRefs.push_back( |
| iree_hal_amdgpu_ModuleDef_create(builder, hsacoImageRef)); |
| } |
| auto modulesRef = builder.createOffsetVecDestructive(moduleRefs); |
| |
| // Generate optional per-export debug information. |
| // May be empty if no debug information was requested. |
| auto exportDebugInfos = |
| createExportDefs(serializationOptions.debugLevel, exportOps, builder); |
| |
| SmallVector<iree_hal_amdgpu_ExportDef_ref_t> exportRefs; |
| exportRefs.resize(exportOps.size(), 0); |
| for (auto exportOp : exportOps) { |
| auto ordinalAttr = exportOp.getOrdinalAttr(); |
| if (!ordinalAttr) { |
| return mlir::emitError(exportOp.getLoc()) |
| << "could not compile rocm binary: export op is missing ordinal"; |
| } |
| int64_t ordinal = ordinalAttr.getInt(); |
| |
| // Symbol names include a `.kd` suffix as that's what HSA expects. |
| auto symbolNameKd = (exportOp.getName() + ".kd").str(); |
| auto symbolNameRef = builder.createString(symbolNameKd); |
| |
| iree_hal_amdgpu_Dims_t workgroupSize = {0}; |
| if (auto workgroupSizeAttr = exportOp.getWorkgroupSize()) { |
| auto workgroupSizeDims = workgroupSizeAttr->getValue(); |
| workgroupSize.x = cast<IntegerAttr>(workgroupSizeDims[0]).getInt(); |
| workgroupSize.y = cast<IntegerAttr>(workgroupSizeDims[1]).getInt(); |
| workgroupSize.z = cast<IntegerAttr>(workgroupSizeDims[2]).getInt(); |
| } |
| |
| auto layoutAttr = exportOp.getLayoutAttr(); |
| uint32_t constantCount = static_cast<uint32_t>(layoutAttr.getConstants()); |
| SmallVector<iree_hal_amdgpu_BindingBits_enum_t> bindingFlags; |
| for (auto bindingAttr : layoutAttr.getBindings()) { |
| iree_hal_amdgpu_BindingBits_enum_t flags = 0; |
| if (allEnumBitsSet(bindingAttr.getFlags(), |
| IREE::HAL::DescriptorFlags::ReadOnly)) { |
| flags |= iree_hal_amdgpu_BindingBits_READ_ONLY; |
| } |
| if (allEnumBitsSet(bindingAttr.getFlags(), |
| IREE::HAL::DescriptorFlags::Indirect)) { |
| flags |= iree_hal_amdgpu_BindingBits_INDIRECT; |
| } |
| bindingFlags.push_back(flags); |
| } |
| auto bindingFlagsRef = iree_hal_amdgpu_BindingBits_vec_create( |
| builder, bindingFlags.data(), bindingFlags.size()); |
| |
| iree_hal_amdgpu_ExportDef_start(builder); |
| iree_hal_amdgpu_ExportDef_symbol_name_add(builder, symbolNameRef); |
| iree_hal_amdgpu_ExportDef_workgroup_size_add(builder, &workgroupSize); |
| iree_hal_amdgpu_ExportDef_constant_count_add(builder, constantCount); |
| iree_hal_amdgpu_ExportDef_binding_flags_add(builder, bindingFlagsRef); |
| iree_hal_amdgpu_ExportDef_debug_info_add(builder, |
| exportDebugInfos[ordinal]); |
| exportRefs[ordinal] = iree_hal_amdgpu_ExportDef_end(builder); |
| } |
| auto exportsRef = builder.createOffsetVecDestructive(exportRefs); |
| |
| iree_hal_amdgpu_ExecutableDef_exports_add(builder, exportsRef); |
| iree_hal_amdgpu_ExecutableDef_modules_add(builder, modulesRef); |
| iree_hal_amdgpu_ExecutableDef_source_files_add(builder, sourceFilesRef); |
| iree_hal_amdgpu_ExecutableDef_end_as_root(builder); |
| |
| return builder.getBufferAttr(variantOp.getContext()); |
| } |
| |
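| // Wraps the HSACO image in an iree_hal_hip_ExecutableDef flatbuffer with
| // per-export metadata (kernel name, block dims, constant count, binding
| // flags, and optional debug info).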
| FailureOr<DenseIntElementsAttr> |
| serializeHIPBinaryContainer(const SerializationOptions &serializationOptions, |
| IREE::HAL::ExecutableVariantOp variantOp, |
| ArrayRef<IREE::HAL::ExecutableExportOp> exportOps, |
| StringRef hsacoModule) { |
| iree_compiler::FlatbufferBuilder builder; |
| iree_hal_hip_ExecutableDef_start_as_root(builder); |
| |
| // Attach embedded source file contents. |
| auto sourceFilesRef = createSourceFilesVec( |
| serializationOptions.debugLevel, variantOp.getSourcesAttr(), builder); |
| |
| // Only a single module today. |
| SmallVector<iree_hal_hip_ModuleDef_ref_t> moduleRefs; |
| { |
| auto hsacoImageRef = flatbuffers_string_create( |
| builder, hsacoModule.data(), hsacoModule.size()); |
| moduleRefs.push_back( |
| iree_hal_hip_ModuleDef_create(builder, hsacoImageRef)); |
| } |
| auto modulesRef = builder.createOffsetVecDestructive(moduleRefs); |
| |
| // Generate optional per-export debug information. |
| // May be empty if no debug information was requested. |
| auto exportDebugInfos = |
| createExportDefs(serializationOptions.debugLevel, exportOps, builder); |
| |
| SmallVector<iree_hal_hip_ExportDef_ref_t> exportRefs; |
| exportRefs.resize(exportOps.size(), 0); |
| for (auto exportOp : exportOps) { |
| auto ordinalAttr = exportOp.getOrdinalAttr(); |
| if (!ordinalAttr) { |
| return mlir::emitError(exportOp.getLoc()) |
| << "could not compile rocm binary: export op is missing ordinal"; |
| } |
| int64_t ordinal = ordinalAttr.getInt(); |
| |
| auto kernelNameRef = builder.createString(exportOp.getName()); |
| |
| iree_hal_hip_BlockDims_t blockDims = {0}; |
| if (auto workgroupSizeAttr = exportOp.getWorkgroupSize()) { |
| auto workgroupSize = workgroupSizeAttr->getValue(); |
| blockDims.x = cast<IntegerAttr>(workgroupSize[0]).getInt(); |
| blockDims.y = cast<IntegerAttr>(workgroupSize[1]).getInt(); |
| blockDims.z = cast<IntegerAttr>(workgroupSize[2]).getInt(); |
| } |
| |
| auto layoutAttr = exportOp.getLayoutAttr(); |
| uint32_t constantCount = static_cast<uint32_t>(layoutAttr.getConstants()); |
| SmallVector<iree_hal_hip_BindingBits_enum_t> bindingFlags; |
| for (auto bindingAttr : layoutAttr.getBindings()) { |
| iree_hal_hip_BindingBits_enum_t flags = 0; |
| if (allEnumBitsSet(bindingAttr.getFlags(), |
| IREE::HAL::DescriptorFlags::ReadOnly)) { |
| flags |= iree_hal_hip_BindingBits_READ_ONLY; |
| } |
| if (allEnumBitsSet(bindingAttr.getFlags(), |
| IREE::HAL::DescriptorFlags::Indirect)) { |
| flags |= iree_hal_hip_BindingBits_INDIRECT; |
| } |
| bindingFlags.push_back(flags); |
| } |
| auto bindingFlagsRef = iree_hal_hip_BindingBits_vec_create( |
| builder, bindingFlags.data(), bindingFlags.size()); |
| |
| iree_hal_hip_ExportDef_start(builder); |
| iree_hal_hip_ExportDef_module_ordinal_add(builder, 0); // always 0 today |
| iree_hal_hip_ExportDef_kernel_name_add(builder, kernelNameRef); |
| iree_hal_hip_ExportDef_block_dims_add(builder, &blockDims); |
| iree_hal_hip_ExportDef_constant_count_add(builder, constantCount); |
| iree_hal_hip_ExportDef_binding_flags_add(builder, bindingFlagsRef); |
| iree_hal_hip_ExportDef_debug_info_add(builder, exportDebugInfos[ordinal]); |
| exportRefs[ordinal] = iree_hal_hip_ExportDef_end(builder); |
| } |
| auto exportsRef = builder.createOffsetVecDestructive(exportRefs); |
| |
| iree_hal_hip_ExecutableDef_exports_add(builder, exportsRef); |
| iree_hal_hip_ExecutableDef_modules_add(builder, modulesRef); |
| iree_hal_hip_ExecutableDef_source_files_add(builder, sourceFilesRef); |
| iree_hal_hip_ExecutableDef_end_as_root(builder); |
| |
| return builder.getBufferAttr(variantOp.getContext()); |
| } |
| |
| private: |
| const ROCMOptions &options; |
| }; |
| |
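| // Device target for the native AMDGPU/HSA driver: produces the default
| // #hal.device.target<"amdgpu"> with executable targets supplied by the
| // "rocm" backend.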
| class AMDGPUTargetDevice final : public TargetDevice { |
| public: |
| AMDGPUTargetDevice(const ROCMOptions &options) : options(options) {} |
| |
| IREE::HAL::DeviceTargetAttr |
| getDefaultDeviceTarget(MLIRContext *context, |
| const TargetRegistry &targetRegistry) const override { |
| Builder b(context); |
| auto deviceConfigAttr = b.getDictionaryAttr({}); |
| auto executableConfigAttr = b.getDictionaryAttr({}); |
| |
| // If we had multiple target environments we would generate one target attr |
| // per environment, with each setting its own environment attribute. |
| SmallVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs; |
| targetRegistry.getTargetBackend("rocm")->getDefaultExecutableTargets( |
| context, "amdgpu", executableConfigAttr, executableTargetAttrs); |
| |
| return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("amdgpu"), |
| deviceConfigAttr, |
| executableTargetAttrs); |
| } |
| |
| private: |
| const ROCMOptions &options; |
| }; |
| |
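| // Device target for the HIP runtime driver: produces the default
| // #hal.device.target<"hip"> with executable targets supplied by the "rocm"
| // backend.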
| class HIPTargetDevice final : public TargetDevice { |
| public: |
| HIPTargetDevice(const ROCMOptions &options) : options(options) {} |
| |
| IREE::HAL::DeviceTargetAttr |
| getDefaultDeviceTarget(MLIRContext *context, |
| const TargetRegistry &targetRegistry) const override { |
| Builder b(context); |
| auto deviceConfigAttr = b.getDictionaryAttr({}); |
| auto executableConfigAttr = b.getDictionaryAttr({}); |
| |
| // If we had multiple target environments we would generate one target attr |
| // per environment, with each setting its own environment attribute. |
| SmallVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs; |
| targetRegistry.getTargetBackend("rocm")->getDefaultExecutableTargets( |
| context, "hip", executableConfigAttr, executableTargetAttrs); |
| |
| return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("hip"), |
| deviceConfigAttr, |
| executableTargetAttrs); |
| } |
| |
| private: |
| const ROCMOptions &options; |
| }; |
| |
| namespace { |
| |
| struct ROCMSession final |
| : PluginSession<ROCMSession, ROCMOptions, |
| PluginActivationPolicy::DefaultActivated> { |
| static void registerPasses() { IREE::ROCM::registerROCMTargetPasses(); } |
| void onRegisterDialects(DialectRegistry ®istry) { |
| registry.insert<IREE::ROCM::ROCMDialect>(); |
| } |
| void populateHALTargetDevices(IREE::HAL::TargetDeviceList &targets) { |
| // #hal.device.target<"amdgpu", ... |
| targets.add("amdgpu", [&]() { |
| return std::make_shared<AMDGPUTargetDevice>(options); |
| }); |
| // #hal.device.target<"hip", ... |
| targets.add("hip", |
| [&]() { return std::make_shared<HIPTargetDevice>(options); }); |
| } |
| void populateHALTargetBackends(IREE::HAL::TargetBackendList &targets) { |
| // #hal.executable.target<"rocm", ... |
| targets.add("rocm", [&]() { |
| LLVMInitializeAMDGPUTarget(); |
| LLVMInitializeAMDGPUTargetMC(); |
| LLVMInitializeAMDGPUTargetInfo(); |
| LLVMInitializeAMDGPUAsmParser(); |
| LLVMInitializeAMDGPUAsmPrinter(); |
| return std::make_shared<ROCMTargetBackend>(options); |
| }); |
| } |
| }; |
| |
| // Iterate over ukernel bitcode embedded-data files, and insert them into the |
| // EmbeddedDataDirectory singleton. |
| static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() { |
| EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) { |
| const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create(); |
| for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) { |
| dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size}); |
| } |
| }); |
| } |
| |
| } // namespace |
| |
| } // namespace mlir::iree_compiler::IREE::HAL |
| |
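| // Plugin entry point: registers the ROCM HAL target session and publishes
| // the embedded AMDGPU ukernel bitcode so it can be looked up by name during
| // compilation.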
| extern "C" bool iree_register_compiler_plugin_hal_target_rocm( |
| mlir::iree_compiler::PluginRegistrar *registrar) { |
| registrar->registerPlugin<mlir::iree_compiler::IREE::HAL::ROCMSession>( |
| "hal_target_rocm"); |
| mlir::iree_compiler::IREE::HAL:: |
| addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory(); |
| return true; |
| } |
| |
| IREE_DEFINE_COMPILER_OPTION_FLAGS(mlir::iree_compiler::IREE::HAL::ROCMOptions); |