[ROCM][Codegen] Add experimental amdgcn SPIR-V path (#24499)
Adds an experimental ROCm amdgcn SPIR-V path that emits HIP-loadable
SPIR-V instead of native HSACO, enabling the HIP runtime to JIT compile
device code for the target GPU. The main changes are:
- `--iree-rocm-use-spirv` flag to select `rocm-spirv-fb` for HIP
targets.
- ROCDL prepare for SPIR-V: Adjust address spaces, calling conventions,
remove AMDGPU-specific function attributes.
- ROCMTarget: Serialize LLVM SPIR-V output into a HIP-loadable offload
bundle.
- Focused lit coverage for lowering, serialization, command-line
handling.
---------
Signed-off-by: Austin Lu <aulu@amd.com>
Co-authored-by: Jakub Kuderski <jakub@nod-labs.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a692255..65a5176 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -464,6 +464,16 @@
"Target chip for ROCm tests that need to compile device code. \
Defaults to empty string to disable tests.")
+# Backward compatibility: accept the old IREE_HIP_TEST_TARGET_CHIP name.
+set(IREE_HIP_TEST_TARGET_CHIP "" CACHE STRING
+ "Deprecated; use IREE_ROCM_TEST_TARGET_CHIP instead.")
+if(IREE_HIP_TEST_TARGET_CHIP AND NOT IREE_ROCM_TEST_TARGET_CHIP)
+ message(WARNING "IREE_HIP_TEST_TARGET_CHIP is deprecated; use IREE_ROCM_TEST_TARGET_CHIP instead.")
+ set(IREE_ROCM_TEST_TARGET_CHIP "${IREE_HIP_TEST_TARGET_CHIP}" CACHE STRING "" FORCE)
+endif()
+set(IREE_ROCM_TEST_AMDGCNSPIRV OFF CACHE BOOL
+ "Use amdgcnspirv (SPIR-V output) for ROCm e2e tests.")
+
#-------------------------------------------------------------------------------
# Compiler Target Options
# We try to keep the default build as simple as possible and disable heavy targets.
diff --git a/MODULE.bazel b/MODULE.bazel
index 7bcb75d..15f7553 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -108,6 +108,7 @@
"ARM",
"NVPTX",
"RISCV",
+ "SPIRV",
"WebAssembly",
"X86",
],
diff --git a/build_tools/cmake/iree_check_test.cmake b/build_tools/cmake/iree_check_test.cmake
index 77c3010..72582ad 100644
--- a/build_tools/cmake/iree_check_test.cmake
+++ b/build_tools/cmake/iree_check_test.cmake
@@ -162,6 +162,9 @@
endif()
if(_NORMALIZED_TARGET_BACKEND STREQUAL "ROCM")
list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-target=${IREE_ROCM_TEST_TARGET_CHIP}")
+ if(IREE_ROCM_TEST_AMDGCNSPIRV)
+ list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-use-spirv")
+ endif()
endif()
if(_BYTECODE_MODULE_BUILD_ENABLED)
diff --git a/build_tools/cmake/iree_e2e_generated_runner_test.cmake b/build_tools/cmake/iree_e2e_generated_runner_test.cmake
index 7332f78..dd850ca 100644
--- a/build_tools/cmake/iree_e2e_generated_runner_test.cmake
+++ b/build_tools/cmake/iree_e2e_generated_runner_test.cmake
@@ -60,6 +60,13 @@
set(_BASE_COMPILER_FLAGS
"--iree-hal-target-backends=${_RULE_TARGET_BACKEND}"
)
+ string(TOUPPER ${_RULE_TARGET_BACKEND} _UPPERCASE_TARGET_BACKEND)
+ string(REPLACE "-" "_" _NORMALIZED_TARGET_BACKEND ${_UPPERCASE_TARGET_BACKEND})
+ if(_NORMALIZED_TARGET_BACKEND STREQUAL "ROCM")
+ if(IREE_ROCM_TEST_AMDGCNSPIRV)
+ list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-use-spirv")
+ endif()
+ endif()
if(NOT TARGET "${_NAME}_${_RULE_TEST_TYPE}_module")
iree_bytecode_module(
diff --git a/build_tools/cmake/iree_llvm.cmake b/build_tools/cmake/iree_llvm.cmake
index 0c52e32..7a86a5d 100644
--- a/build_tools/cmake/iree_llvm.cmake
+++ b/build_tools/cmake/iree_llvm.cmake
@@ -241,7 +241,7 @@
message(STATUS " - metal-spirv")
endif()
if(IREE_TARGET_BACKEND_ROCM OR IREE_HAL_DRIVER_AMDGPU)
- list(APPEND LLVM_TARGETS_TO_BUILD AMDGPU)
+ list(APPEND LLVM_TARGETS_TO_BUILD AMDGPU SPIRV)
set(IREE_CLANG_TARGET clang)
endif()
if(IREE_TARGET_BACKEND_VULKAN_SPIRV)
diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel
index 8c8b68e..c1190b2 100644
--- a/compiler/plugins/target/ROCM/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/BUILD.bazel
@@ -59,6 +59,7 @@
"@llvm-project//llvm:Linker",
"@llvm-project//llvm:MC",
"@llvm-project//llvm:Passes",
+ "@llvm-project//llvm:SPIRVCodeGen",
"@llvm-project//llvm:Support",
"@llvm-project//llvm:Target",
"@llvm-project//llvm:TransformUtils",
diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt
index 4e1a87c..442103a 100644
--- a/compiler/plugins/target/ROCM/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/CMakeLists.txt
@@ -35,6 +35,7 @@
LLVMLinker
LLVMMC
LLVMPasses
+ LLVMSPIRVCodeGen
LLVMSupport
LLVMTarget
LLVMTransformUtils
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index 7fe09af..06d9aad 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -39,6 +39,8 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
@@ -194,6 +196,7 @@
IREE::Codegen::DenormalFpMath::None;
bool enableRegSpillWarning = false;
bool debugSymbols = false;
+ bool useAmdgcnSpirv = false;
void bindOptions(OptionsBinder &binder) {
using namespace llvm;
@@ -290,6 +293,11 @@
binder.opt<bool>("iree-rocm-emit-debug-info", debugSymbols,
cl::cat(category),
cl::desc("Generate and embed debug information (DWARF)."));
+
+ binder.opt<bool>(
+ "iree-rocm-use-spirv", useAmdgcnSpirv, cl::cat(category),
+ cl::desc("Produce SPIR-V binary (amdgcnspirv) instead of native ISA. "
+ "The HIP runtime JIT-compiles the SPIR-V to native ISA."));
}
LogicalResult verify(mlir::Builder &builder) const {
@@ -414,7 +422,9 @@
addConfig("abi", b.getStringAttr(deviceID));
std::string format;
- if (deviceID == "amdgpu") {
+ if (targetOptions.useAmdgcnSpirv) {
+ format = "rocm-spirv-fb";
+ } else if (deviceID == "amdgpu") {
FailureOr<std::string> targetID =
buildAMDGPUTargetID(b.getUnknownLoc(), targetOptions.target,
targetOptions.targetFeatures);
@@ -565,8 +575,12 @@
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr,
OpPassManager &passManager) final {
- buildLLVMGPUCodegenPassPipeline(passManager.nest<ModuleOp>(), true,
- targetOptions.debugSymbols);
+ // Derive SPIR-V mode from the target format rather than the CLI flag,
+ // so that programmatically-constructed target attrs work correctly.
+ bool useSPIRV = targetAttr.getFormat() == "rocm-spirv-fb";
+ buildLLVMGPUCodegenPassPipeline(
+ passManager.nest<ModuleOp>(), /*useROCM=*/true,
+ targetOptions.debugSymbols, /*includeLLVMLowering=*/true, useSPIRV);
buildCodegenTranslationPostProcessingPassPipeline(passManager);
}
@@ -638,9 +652,14 @@
LogicalResult
validateFinalizedModule(IREE::HAL::ExecutableVariantOp variantOp,
- llvm::Module &module) {
+ llvm::Module &module, bool allowExternalDecls) {
for (llvm::Function &func : module.functions()) {
if (func.isDeclaration() && !func.isIntrinsic() && !func.use_empty()) {
+ // In SPIR-V mode, external declarations (e.g. __ocml_*, __ockl_*,
+ // amdgcn intrinsics) are expected — they are resolved at JIT time.
+ if (allowExternalDecls) {
+ continue;
+ }
llvm::User *liveUser = *func.user_begin();
return variantOp.emitError()
<< "found an unresolved external function '" << func.getName()
@@ -657,6 +676,13 @@
OpBuilder &executableBuilder) final {
ModuleOp innerModuleOp = variantOp.getInnerModule();
auto targetAttr = variantOp.getTargetAttr();
+ bool useSPIRV = targetAttr.getFormat() == "rocm-spirv-fb";
+ if (targetOptions.useAmdgcnSpirv && !useSPIRV) {
+ return variantOp.emitError()
+ << "--iree-rocm-use-spirv requires ROCm executable target format "
+ "'rocm-spirv-fb', but got '"
+ << targetAttr.getFormat() << "'";
+ }
StringRef targetArch = targetOptions.target;
StringRef targetFeatures = targetOptions.targetFeatures;
if (auto attr = getGPUTargetAttr(variantOp.getContext(), targetAttr)) {
@@ -821,15 +847,20 @@
}
}
- llvmModule->setDataLayout(targetMachine->createDataLayout());
-
// Code object version * 100.
constexpr uint32_t abiVersion = 500;
- // Let the backend know what code object version we're compiling for. This
- // insulates us from changes to the default code object version that our
- // CI or users may not be prepared for.
- llvmModule->addModuleFlag(llvm::Module::Error,
- "amdhsa_code_object_version", abiVersion);
+
+ // For SPIR-V mode, the data layout is already set by PrepareForSPIRVPass
+ // on the MLIR module and propagated through MLIR->LLVM IR translation.
+ if (!useSPIRV) {
+ llvmModule->setDataLayout(targetMachine->createDataLayout());
+
+ // Let the backend know what code object version we're compiling for.
+ // This insulates us from changes to the default code object version
+ // that our CI or users may not be prepared for.
+ llvmModule->addModuleFlag(llvm::Module::Error,
+ "amdhsa_code_object_version", abiVersion);
+ }
for (llvm::Function &f : llvmModule->functions()) {
f.addFnAttr(llvm::Attribute::AlwaysInline);
@@ -863,23 +894,25 @@
<< targetArch.str() << "'";
}
- // Link module to HIP device library.
- if (targetOptions.bitcodeDirectory.empty()) {
- return variantOp.emitError()
- << "cannot find ROCM bitcode files. Check your installation "
- "consistency and in the worst case, set "
- "--iree-rocm-bc-dir= to a path on your system.";
- }
- if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
- targetArch,
- targetOptions.bitcodeDirectory))) {
- return failure();
- }
+ // Link module to HIP device library (skip for SPIR-V — resolved at JIT).
+ if (!useSPIRV) {
+ if (targetOptions.bitcodeDirectory.empty()) {
+ return variantOp.emitError()
+ << "cannot find ROCM bitcode files. Check your installation "
+ "consistency and in the worst case, set "
+ "--iree-rocm-bc-dir= to a path on your system.";
+ }
+ if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
+ targetArch,
+ targetOptions.bitcodeDirectory))) {
+ return failure();
+ }
- // Sets HIP platform globals based on the target architecture.
- if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset,
- isWave64, abiVersion))) {
- return failure();
+ // Sets HIP platform globals based on the target architecture.
+ if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset,
+ isWave64, abiVersion))) {
+ return failure();
+ }
}
if (!serializationOptions.dumpIntermediatesPath.empty()) {
@@ -895,9 +928,14 @@
std::string targetTriple = targetMachine->getTargetTriple().str();
// Run LLVM optimization passes.
+ // For SPIR-V mode, skip AMDGPU-specific optimization which introduces
+ // buffer resource intrinsics (AS 7) and other constructs incompatible
+ // with the SPIR-V backend.
std::string passesString;
- optimizeModule(*llvmModule, *targetMachine,
- targetOptions.slpVectorization, passesString);
+ if (!useSPIRV) {
+ optimizeModule(*llvmModule, *targetMachine,
+ targetOptions.slpVectorization, passesString);
+ }
if (!serializationOptions.dumpIntermediatesPath.empty()) {
// Additional context on '-mcpu' flag in PR comments, see for example:
// https://github.com/iree-org/iree/pull/20716#issuecomment-2851650421
@@ -915,48 +953,133 @@
".optimized.ll", *llvmModule, header);
}
- if (failed(validateFinalizedModule(variantOp, *llvmModule))) {
+ if (failed(validateFinalizedModule(variantOp, *llvmModule, useSPIRV))) {
return failure();
}
- // Dump the assembly output.
- if (!serializationOptions.dumpIntermediatesPath.empty()) {
- auto moduleCopy = llvm::CloneModule(*llvmModule);
- if (!moduleCopy) {
- llvm::errs() << "Error: cloning LLVM IR failed\n";
- return failure();
+ if (useSPIRV) {
+ // SPIR-V codegen path: create a SPIR-V TargetMachine and emit binary.
+ llvm::Triple spirvTriple("spirv64-amd-amdhsa");
+ std::string spirvError;
+ const llvm::Target *spirvTarget =
+ llvm::TargetRegistry::lookupTarget("", spirvTriple, spirvError);
+ if (!spirvTarget) {
+ return variantOp.emitError()
+ << "cannot find SPIR-V target: " << spirvError;
}
- std::string asmHeader = llvm::formatv(
- R"TXT(; To reproduce the .rocmasm from .optimized.ll, run:
+ auto spirvTM = std::unique_ptr<llvm::TargetMachine>(
+ spirvTarget->createTargetMachine(
+ spirvTriple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{},
+ llvm::Reloc::PIC_, std::nullopt,
+ llvm::CodeGenOptLevel::Default));
+
+ if (!spirvTM) {
+ return variantOp.emitError() << "cannot create SPIR-V target machine";
+ }
+
+ // If requested, dump the final .ll just before conversion to
+ // SPIRV-binary
+ if (!serializationOptions.dumpIntermediatesPath.empty()) {
+ dumpModuleToPath(serializationOptions.dumpIntermediatesPath,
+ serializationOptions.dumpBaseName,
+ variantOp.getName(), ".final.ll", *llvmModule);
+ }
+
+ // Emit SPIR-V binary (no lld/hsaco needed).
+ // The MLIR PrepareForSPIRV pass set triple, data layout, calling
+ // conventions, stripped assumes, and removed AMDGPU attrs. We
+ // reassert the data layout here from the TM to guarantee it matches
+ // exactly (MachineFunction::init() asserts on mismatch).
+ llvmModule->setDataLayout(spirvTM->createDataLayout());
+ llvmModule->setTargetTriple(spirvTriple);
+ std::string spirvBinary = translateModuleToObj(*llvmModule, *spirvTM);
+ if (spirvBinary.empty()) {
+ return variantOp.emitError()
+ << "SPIR-V binary translation produced empty output";
+ }
+
+ if (!serializationOptions.dumpIntermediatesPath.empty()) {
+ dumpDataToPath(serializationOptions.dumpIntermediatesPath,
+ serializationOptions.dumpBaseName, variantOp.getName(),
+ ".spv", spirvBinary);
+ }
+
+ // Wrap the SPIR-V binary in a clang offload bundle. The HIP runtime
+ // expects this format — raw SPIR-V is not accepted by
+ // hipModuleLoadData.
+ // Note: Multi-byte fields use little-endian byte order, matching the
+ // Clang offload bundler binary format specification.
+ const std::string bundleTarget = "hip-spirv64-amd-amdhsa--amdgcnspirv";
+ const char bundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+ const uint64_t numEntries = 1;
+ const uint64_t headerSize = sizeof(bundleMagic) - 1 +
+ sizeof(numEntries) + 3 * sizeof(uint64_t) +
+ bundleTarget.size();
+ const uint64_t dataOffset = headerSize;
+ const uint64_t dataSize = spirvBinary.size();
+
+ std::string bundle;
+ bundle.reserve(headerSize + dataSize);
+ // Magic string (no null terminator).
+ bundle.append(bundleMagic, sizeof(bundleMagic) - 1);
+ // Number of entries.
+ bundle.append(reinterpret_cast<const char *>(&numEntries),
+ sizeof(numEntries));
+ // Entry: offset, size, target string length, target string.
+ bundle.append(reinterpret_cast<const char *>(&dataOffset),
+ sizeof(dataOffset));
+ bundle.append(reinterpret_cast<const char *>(&dataSize),
+ sizeof(dataSize));
+ uint64_t targetLen = bundleTarget.size();
+ bundle.append(reinterpret_cast<const char *>(&targetLen),
+ sizeof(targetLen));
+ bundle.append(bundleTarget);
+ // SPIR-V binary data.
+ bundle.append(spirvBinary);
+ targetHSACO = std::move(bundle);
+ } else {
+ // Native ISA codegen path.
+ // Dump the assembly output.
+ if (!serializationOptions.dumpIntermediatesPath.empty()) {
+ auto moduleCopy = llvm::CloneModule(*llvmModule);
+ if (!moduleCopy) {
+ llvm::errs() << "Error: cloning LLVM IR failed\n";
+ return failure();
+ }
+ std::string asmHeader = llvm::formatv(
+ R"TXT(; To reproduce the .rocmasm from .optimized.ll, run:
; llc -mtriple={} -mcpu={} -mattr='{}' -O3 <.optimized.ll> -o <out.rocmasm>
)TXT",
- targetTriple, targetCPU, targetMachine->getTargetFeatureString());
+ targetTriple, targetCPU, targetMachine->getTargetFeatureString());
- std::string targetISA =
- translateModuleToISA(*moduleCopy.get(), *targetMachine);
- dumpDataToPath(serializationOptions.dumpIntermediatesPath,
- serializationOptions.dumpBaseName, variantOp.getName(),
- ".rocmasm", asmHeader + targetISA);
- }
+ std::string targetISA =
+ translateModuleToISA(*moduleCopy.get(), *targetMachine);
+ dumpDataToPath(serializationOptions.dumpIntermediatesPath,
+ serializationOptions.dumpBaseName, variantOp.getName(),
+ ".rocmasm", asmHeader + targetISA);
+ }
- // Serialize hsaco kernel into the binary that we will embed in the
- // final FlatBuffer.
- std::string targetObj = translateModuleToObj(*llvmModule, *targetMachine);
- targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName);
- if (targetHSACO.empty()) {
- return failure();
- }
+ // Serialize hsaco kernel into the binary that we will embed in the
+ // final FlatBuffer.
+ std::string targetObj =
+ translateModuleToObj(*llvmModule, *targetMachine);
+ targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName);
+ if (targetHSACO.empty()) {
+ return failure();
+ }
- if (targetOptions.enableRegSpillWarning) {
- checkRegisterSpilling(variantOp, targetObj);
+ if (targetOptions.enableRegSpillWarning) {
+ checkRegisterSpilling(variantOp, targetObj);
+ }
}
}
if (!serializationOptions.dumpBinariesPath.empty()) {
+ StringRef ext = useSPIRV ? ".spv" : ".hsaco";
dumpDataToPath(serializationOptions.dumpBinariesPath,
serializationOptions.dumpBaseName, variantOp.getName(),
- ".hsaco", targetHSACO);
+ ext, targetHSACO);
}
// Determine container type from the target ABI attribute.
@@ -1273,6 +1396,11 @@
LLVMInitializeAMDGPUTargetInfo();
LLVMInitializeAMDGPUAsmParser();
LLVMInitializeAMDGPUAsmPrinter();
+ // Also initialize SPIR-V target for amdgcnspirv mode.
+ LLVMInitializeSPIRVTarget();
+ LLVMInitializeSPIRVTargetInfo();
+ LLVMInitializeSPIRVTargetMC();
+ LLVMInitializeSPIRVAsmPrinter();
return std::make_shared<ROCMTargetBackend>(options, codegenOptions);
});
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index c99e24f..0d46708 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -145,6 +145,7 @@
"ROCDLConfigureBufferInstructions.cpp",
"ROCDLLoadToTransposeLoad.cpp",
"ROCDLPrefetching.cpp",
+ "ROCDLPrepareForSPIRV.cpp",
"Verifiers.cpp",
],
hdrs = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 87281aa..2e32fec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -119,6 +119,7 @@
"ROCDLConfigureBufferInstructions.cpp"
"ROCDLLoadToTransposeLoad.cpp"
"ROCDLPrefetching.cpp"
+ "ROCDLPrepareForSPIRV.cpp"
"Verifiers.cpp"
DEPS
::LLVMGPUConstraintGenerator
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 4aac152..1a6a3a5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -968,7 +968,8 @@
}
static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager,
- bool forROCDL, bool preserveDebugInfo) {
+ bool forROCDL, bool preserveDebugInfo,
+ bool useSPIRV = false) {
modulePassManager.addPass(
createConvertHALDescriptorTypeToGPUAddressSpacePass());
modulePassManager.addPass(createCanonicalizerPass());
@@ -1088,6 +1089,9 @@
modulePassManager.addPass(createConvertToROCDLPass());
modulePassManager.addNestedPass<LLVM::LLVMFuncOp>(
createROCDLAnnotateKernelForTranslationPass());
+ if (useSPIRV) {
+ modulePassManager.addPass(createROCDLPrepareForSPIRVPass());
+ }
} else {
// Convert to NVVM.
modulePassManager.addPass(createConvertToNVVMPass());
@@ -1160,7 +1164,7 @@
void buildLLVMGPUCodegenPassPipeline(OpPassManager &modulePassManager,
bool useROCM, bool preserveDebugInfo,
- bool includeLLVMLowering) {
+ bool includeLLVMLowering, bool useSPIRV) {
modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
LLVMGPULowerExecutableTargetPassOptions options;
options.forROCDL = useROCM;
@@ -1184,7 +1188,8 @@
// - The module contains the final llvm.module ready to be serialized.
//===--------------------------------------------------------------------===//
if (includeLLVMLowering) {
- addLowerToLLVMGPUPasses(modulePassManager, useROCM, preserveDebugInfo);
+ addLowerToLLVMGPUPasses(modulePassManager, useROCM, preserveDebugInfo,
+ useSPIRV);
}
LLVM_DEBUG({
@@ -1281,6 +1286,9 @@
*this, "include-llvm-lowering",
llvm::cl::desc("Include the lowering to LLVM dialect."),
llvm::cl::init(true)};
+ Option<bool> useSPIRV{
+ *this, "use-spirv",
+ llvm::cl::desc("Prepare LLVM dialect IR for the SPIR-V backend")};
};
static PassPipelineRegistration<> LLVMGPUConfigPipeline(
@@ -1308,9 +1316,9 @@
"Runs the LLVMGPU ROCDL lowering pipeline",
[](OpPassManager &modulePassManager,
const LLVMGPULoweringPipelineOptions &options) {
- buildLLVMGPUCodegenPassPipeline(modulePassManager, true,
- options.preserveDebugInfo,
- options.includeLLVMLowering);
+ buildLLVMGPUCodegenPassPipeline(
+ modulePassManager, true, options.preserveDebugInfo,
+ options.includeLLVMLowering, options.useSPIRV);
});
static PassPipelineRegistration<> LLVMGPULinkingPipeline(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 9515e4e..6467e40 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -79,7 +79,8 @@
/// module within the IREE::HAL::ExecutableOp.
void buildLLVMGPUCodegenPassPipeline(OpPassManager &modulePassManager,
bool useROCM, bool preserveDebugInfo,
- bool includeLLVMLowering = true);
+ bool includeLLVMLowering = true,
+ bool useSPIRV = false);
/// Wraps GPUPipelineOptions and forROCDL for passing through
/// PipelineAttrInterface::buildPipeline.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
index 584d433..9382232 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
@@ -127,6 +127,12 @@
if (!target || !target.isAMD()) {
return;
}
+ // Buffer instructions (address space 7) are not supported by the SPIR-V
+ // backend, so skip when targeting SPIR-V output.
+ auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ if (execTarget && execTarget.getFormat() == "rocm-spirv-fb") {
+ return;
+ }
// Initialize the DataFlowSolver with IntegerRangeAnalysis.
DataFlowSolver solver;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
index 321ee1a..a3b7176 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
@@ -73,4 +73,20 @@
];
}
+def ROCDLPrepareForSPIRVPass :
+ Pass<"iree-rocdl-prepare-for-spirv", "ModuleOp"> {
+ let summary = "Prepare LLVM dialect IR for SPIR-V backend codegen.";
+ let description = [{
+ Transforms AMDGPU-flavored LLVM dialect IR into SPIR-V-compatible form:
+ - Remaps address spaces (AMDGPU -> SPIR-V conventions)
+ - Changes calling conventions (amdgpu_kernel -> spir_kernel, etc.)
+ - Removes ROCDL-specific function attributes
+ - Removes inreg parameter attributes (AMDGPU argument preloading)
+ - Sets module triple and data layout for spirv64-amd-amdhsa
+ }];
+ let dependentDialects = [
+ "LLVM::LLVMDialect",
+ ];
+}
+
#endif // IREE_CODEGEN_LLVMGPU_ROCDLPASSES
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp
new file mode 100644
index 0000000..063316e
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp
@@ -0,0 +1,318 @@
+// Copyright 2026 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-rocdl-prepare-for-spirv"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_ROCDLPREPAREFORSPIRVPASS
+#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"
+
+namespace {
+
+// Remaps AMDGPU address spaces to SPIR-V address spaces.
+// AMDGPU 5 (private/stack) -> SPIR-V 0 (Function)
+// AMDGPU 0 (flat/generic) -> SPIR-V 4 (Generic)
+// AMDGPU 4 (constant) -> SPIR-V 2 (UniformConstant)
+// AMDGPU 1 (global) -> 1 (no change)
+// AMDGPU 3 (shared/LDS) -> 3 (no change)
+static unsigned remapAddressSpace(unsigned as) {
+ switch (as) {
+ case 5:
+ return 0; // private -> Function
+ case 0:
+ return 4; // flat/generic -> Generic
+ case 4:
+ return 2; // constant -> UniformConstant
+ default:
+ return as; // 1 (global) and 3 (shared) stay the same
+ }
+}
+
+// Remaps address spaces in a type. Recursively handles pointer and array types.
+static Type remapType(Type type) {
+ if (auto ptrType = dyn_cast<LLVM::LLVMPointerType>(type)) {
+ unsigned newAS = remapAddressSpace(ptrType.getAddressSpace());
+ if (newAS != ptrType.getAddressSpace()) {
+ return LLVM::LLVMPointerType::get(type.getContext(), newAS);
+ }
+ return type;
+ }
+ if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(type)) {
+ Type newElem = remapType(arrayType.getElementType());
+ if (newElem != arrayType.getElementType()) {
+ return LLVM::LLVMArrayType::get(newElem, arrayType.getNumElements());
+ }
+ return type;
+ }
+ if (auto funcType = dyn_cast<LLVM::LLVMFunctionType>(type)) {
+ Type newRet = remapType(funcType.getReturnType());
+ SmallVector<Type> newParams;
+ bool changed = newRet != funcType.getReturnType();
+ for (Type param : funcType.getParams()) {
+ Type newParam = remapType(param);
+ newParams.push_back(newParam);
+ changed |= newParam != param;
+ }
+ if (changed) {
+ return LLVM::LLVMFunctionType::get(newRet, newParams,
+ funcType.isVarArg());
+ }
+ return type;
+ }
+ if (auto structType = dyn_cast<LLVM::LLVMStructType>(type)) {
+ if (structType.isOpaque()) {
+ return type;
+ }
+ SmallVector<Type> newBody;
+ bool changed = false;
+ for (Type elem : structType.getBody()) {
+ Type newElem = remapType(elem);
+ newBody.push_back(newElem);
+ changed |= newElem != elem;
+ }
+ if (changed) {
+ // TODO: Identified structs lose their name here. This is acceptable
+ // for the current IR patterns but needs proper handling for recursive
+ // struct types if they ever appear.
+ return LLVM::LLVMStructType::getLiteral(type.getContext(), newBody,
+ structType.isPacked());
+ }
+ return type;
+ }
+ return type;
+}
+
+// List of ROCDL-specific attributes to remove for SPIR-V.
+// Note: AMDGPU attributes are handled by the backend.
+static constexpr llvm::StringLiteral kROCDLGPUAttrsToRemove[] = {
+ // ROCDL-specific attributes set by ROCDLAnnotateKernelForTranslationPass.
+ "rocdl.kernel",
+ "rocdl.flat_work_group_size",
+ "rocdl.reqd_work_group_size",
+ "rocdl.max_flat_work_group_size",
+};
+
+static bool isOptimizationLevelFlag(StringRef arg) {
+ return arg == "-O0" || arg == "-O1" || arg == "-O2" || arg == "-O3" ||
+ arg == "-Os" || arg == "-Oz";
+}
+
+static std::string ensureO3InCmdline(StringRef cmdline) {
+ SmallVector<StringRef> args;
+ cmdline.split(args, '\0', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+
+ std::string result;
+ bool hasOptLevel = false;
+ for (StringRef arg : args) {
+ if (isOptimizationLevelFlag(arg)) {
+ if (hasOptLevel) {
+ continue;
+ }
+ arg = "-O3";
+ hasOptLevel = true;
+ }
+ result.append(arg.data(), arg.size());
+ result.push_back('\0');
+ }
+
+ if (!hasOptLevel) {
+ result.append("-O3", 3);
+ result.push_back('\0');
+ }
+
+ return result;
+}
+
+struct ROCDLPrepareForSPIRVPass final
+ : impl::ROCDLPrepareForSPIRVPassBase<ROCDLPrepareForSPIRVPass> {
+ void getDependentDialects(DialectRegistry ®istry) const override {
+ registry.insert<LLVM::LLVMDialect>();
+ }
+
+ void runOnOperation() override {
+ ModuleOp moduleOp = getOperation();
+
+ // Set module triple and data layout for SPIR-V.
+ moduleOp->setAttr(
+ LLVM::LLVMDialect::getTargetTripleAttrName(),
+ StringAttr::get(moduleOp.getContext(), "spirv64-amd-amdhsa"));
+ // Data layout from llvm::Triple("spirv64-amd-amdhsa").computeDataLayout().
+ // Must match exactly or MachineFunction::init() asserts.
+ moduleOp->setAttr(
+ LLVM::LLVMDialect::getDataLayoutAttrName(),
+ StringAttr::get(
+ moduleOp.getContext(),
+ "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0"));
+
+ // Process all functions: remap calling conventions and remove AMDGPU attrs.
+ moduleOp->walk([&](LLVM::LLVMFuncOp funcOp) {
+ // Remap calling conventions.
+ // After ROCDLAnnotateKernelForTranslationPass, kernel functions have
+ // rocdl.kernel attribute instead of amdgpu_kernel CC.
+ auto cc = funcOp.getCConv();
+ if (cc == LLVM::CConv::AMDGPU_KERNEL || funcOp->hasAttr("rocdl.kernel")) {
+ funcOp.setCConv(LLVM::CConv::SPIR_KERNEL);
+ } else if (!funcOp.getName().starts_with("llvm.")) {
+ // All non-intrinsic functions (definitions and declarations) get
+ // spir_func. LLVM intrinsics (llvm.*) keep their default CC.
+ funcOp.setCConv(LLVM::CConv::SPIR_FUNC);
+ }
+
+ // Remove ROCDLGPU attributes from the function's attribute dictionary.
+ for (auto attrName : kROCDLGPUAttrsToRemove) {
+ funcOp->removeAttr(attrName);
+ }
+
+ // Also clean llvm_func_attrs if present.
+ if (auto funcAttrs =
+ funcOp->getAttrOfType<DictionaryAttr>("llvm_func_attrs")) {
+ SmallVector<NamedAttribute> newAttrs;
+ for (NamedAttribute attr : funcAttrs) {
+ if (!llvm::is_contained(kROCDLGPUAttrsToRemove, attr.getName())) {
+ newAttrs.push_back(attr);
+ }
+ }
+ if (newAttrs.size() != funcAttrs.size()) {
+ if (newAttrs.empty()) {
+ funcOp->removeAttr("llvm_func_attrs");
+ } else {
+ funcOp->setAttr(
+ "llvm_func_attrs",
+ DictionaryAttr::get(moduleOp.getContext(), newAttrs));
+ }
+ }
+ }
+
+ // Remove inreg attributes from all parameters. AMDGPU uses inreg for
+ // argument preloading which is incompatible with SPIR-V.
+ // TODO: Clean this up once the SPIR-V backend can handle this.
+ for (unsigned i = 0; i < funcOp.getNumArguments(); ++i) {
+ funcOp.removeArgAttr(i, "llvm.inreg");
+ }
+
+ // Remap function signature types (pointer address spaces in
+ // params/return).
+ auto funcType = funcOp.getFunctionType();
+ Type newRetType = remapType(funcType.getReturnType());
+ SmallVector<Type> newParamTypes;
+ bool sigChanged = newRetType != funcType.getReturnType();
+ for (Type param : funcType.getParams()) {
+ Type newParam = remapType(param);
+ newParamTypes.push_back(newParam);
+ sigChanged |= newParam != param;
+ }
+ if (sigChanged) {
+ funcOp.setFunctionType(LLVM::LLVMFunctionType::get(
+ newRetType, newParamTypes, funcType.isVarArg()));
+ // Also update block argument types.
+ if (!funcOp.isDeclaration()) {
+ Block &entryBlock = funcOp.getBody().front();
+ for (auto [i, arg] : llvm::enumerate(entryBlock.getArguments())) {
+ if (i < newParamTypes.size()) {
+ arg.setType(newParamTypes[i]);
+ }
+ }
+ }
+ }
+ });
+
+ // Remap address spaces on all operations.
+ moduleOp->walk([&](Operation *op) {
+ // Skip function ops — they were already handled above.
+ if (isa<LLVM::LLVMFuncOp>(op)) {
+ return;
+ }
+
+ // Handle ops with explicit addr_space attributes first.
+ if (auto allocaOp = dyn_cast<LLVM::AllocaOp>(op)) {
+ // The address space is encoded in the result pointer type.
+ unsigned oldAS = allocaOp.getRes().getType().getAddressSpace();
+ unsigned newAS = remapAddressSpace(oldAS);
+ if (newAS != oldAS) {
+ allocaOp.getRes().setType(
+ LLVM::LLVMPointerType::get(op->getContext(), newAS));
+ }
+ // Also remap the addr_space attribute if present (discardable attr).
+ if (auto addrSpaceAttr =
+ allocaOp->getAttrOfType<IntegerAttr>("addr_space")) {
+ unsigned attrAS = addrSpaceAttr.getInt();
+ unsigned newAttrAS = remapAddressSpace(attrAS);
+ if (newAttrAS != attrAS) {
+ allocaOp->setAttr(
+ "addr_space",
+ IntegerAttr::get(addrSpaceAttr.getType(), newAttrAS));
+ }
+ }
+ return;
+ }
+ if (auto globalOp = dyn_cast<LLVM::GlobalOp>(op)) {
+ unsigned oldAS = globalOp.getAddrSpace();
+ unsigned newAS = remapAddressSpace(oldAS);
+ if (newAS != oldAS) {
+ globalOp.setAddrSpace(newAS);
+ }
+ Type newGlobalType = remapType(globalOp.getGlobalType());
+ if (newGlobalType != globalOp.getGlobalType()) {
+ globalOp.setGlobalType(newGlobalType);
+ }
+ return;
+ }
+
+ // For all other ops, remap result types.
+ for (auto [i, resultType] : llvm::enumerate(op->getResultTypes())) {
+ Type newType = remapType(resultType);
+ if (newType != resultType) {
+ op->getResult(i).setType(newType);
+ }
+ }
+ });
+
+ // Embed @llvm.cmdline with "-O3" so comgr JIT compiles at -O3.
+ // Without this, comgr defaults to -O0, causing massive register spilling.
+ // See amd/comgr/src/comgr-compiler.cpp: extractSpirvFlags().
+ // Must be at addrspace(1) (CrossWorkgroup) so the SPIR-V backend emits it
+ // as a module-level global that survives the round-trip.
+ auto i8Type = IntegerType::get(moduleOp.getContext(), 8);
+ auto cmdlineGlobal = moduleOp.lookupSymbol<LLVM::GlobalOp>("llvm.cmdline");
+ if (cmdlineGlobal) {
+ auto valueAttr =
+ dyn_cast_if_present<StringAttr>(cmdlineGlobal.getValueAttr());
+ if (!valueAttr) {
+ cmdlineGlobal.emitOpError()
+ << "expected @llvm.cmdline to have a string initializer";
+ return signalPassFailure();
+ }
+ std::string flags = ensureO3InCmdline(valueAttr.getValue());
+ cmdlineGlobal.setValueAttr(StringAttr::get(moduleOp.getContext(), flags));
+ cmdlineGlobal.setGlobalType(
+ LLVM::LLVMArrayType::get(i8Type, flags.size()));
+ cmdlineGlobal.setSection(".llvmcmd");
+ cmdlineGlobal.setAlignment(1);
+ cmdlineGlobal.setAddrSpace(1);
+ } else {
+ StringRef flags("-O3\0", 4);
+ OpBuilder builder(moduleOp.getBody(), moduleOp.getBody()->end());
+ auto globalOp =
+ LLVM::GlobalOp::create(builder, moduleOp.getLoc(),
+ LLVM::LLVMArrayType::get(i8Type, flags.size()),
+ /*isConstant=*/true, LLVM::Linkage::Private,
+ "llvm.cmdline", builder.getStringAttr(flags));
+ globalOp.setSection(".llvmcmd");
+ globalOp.setAlignment(1);
+ globalOp.setAddrSpace(1);
+ }
+ }
+};
+
+} // namespace
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 9368ce2..35f4d49 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -64,7 +64,11 @@
"reuse_shared_memory_allocs.mlir",
"rocdl_global_transpose_load.mlir",
"rocdl_load_to_transpose_load.mlir",
+ "rocdl_pipeline_spirv_test.mlir",
"rocdl_pipeline_test.mlir",
+ "rocdl_prepare_for_spirv.mlir",
+ "rocm_spirv_serialize.mlir",
+ "rocm_spirv_serialize_target_attr.mlir",
"sort_pipeline_test.mlir",
"tensorcore_vectorization.mlir",
"transform_dialect_bufferize.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 03383cd..0825a08 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -59,7 +59,11 @@
"reuse_shared_memory_allocs.mlir"
"rocdl_global_transpose_load.mlir"
"rocdl_load_to_transpose_load.mlir"
+ "rocdl_pipeline_spirv_test.mlir"
"rocdl_pipeline_test.mlir"
+ "rocdl_prepare_for_spirv.mlir"
+ "rocm_spirv_serialize.mlir"
+ "rocm_spirv_serialize_target_attr.mlir"
"sort_pipeline_test.mlir"
"tensorcore_vectorization.mlir"
"transform_dialect_bufferize.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir
new file mode 100644
index 0000000..218829b
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir
@@ -0,0 +1,86 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), builtin.module(iree-codegen-llvmgpu-rocdl-lowering-pipeline{use-spirv=true}))))" \
+// RUN: %s | FileCheck %s
+
+// Verify that a simple element-wise op lowers through the ROCDL pipeline with
+// SPIR-V preparation (spir_kernel CC, address space remapping, correct triple).
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @simpleMath_ex_dispatch_0 {
+ hal.executable.variant @rocm target(<"rocm", "rocm-spirv-fb">) {
+ hal.executable.export public @add_dispatch_0 layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+ %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+ hal.return %x, %y, %z : index, index, index
+ }
+ builtin.module {
+ func.func @add_dispatch_0() {
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16xf32>>
+ %3 = tensor.empty() : tensor<16xf32>
+ %4 = iree_tensor_ext.dispatch.tensor.load %0, offsets=[0], sizes=[16], strides=[1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+ %5 = iree_tensor_ext.dispatch.tensor.load %1, offsets=[0], sizes=[16], strides=[1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+ %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16xf32>, tensor<16xf32>) outs(%3 : tensor<16xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %7 = arith.addf %arg0, %arg1 : f32
+ linalg.yield %7 : f32
+ } -> tensor<16xf32>
+ iree_tensor_ext.dispatch.tensor.store %6, %2, offsets=[0], sizes=[16], strides=[1] : tensor<16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16xf32>>
+ return
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable public @simpleMath_ex_dispatch_0
+// CHECK: hal.executable.variant public @rocm
+// CHECK: module attributes
+// CHECK-SAME: llvm.target_triple = "spirv64-amd-amdhsa"
+// CHECK: llvm.func spir_kernelcc @add_dispatch_0
+// CHECK: llvm.fadd
+// CHECK-NOT: amdgpu_kernel
+// CHECK-NOT: addr_space = 7
+
+// -----
+
+// Verify matmul lowers through the SPIR-V pipeline without buffer instructions.
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @matmul_dispatch_0 {
+ hal.executable.variant @rocm target(<"rocm", "rocm-spirv-fb">) {
+ hal.executable.export public @matmul_dispatch_0 layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+ %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+ hal.return %x, %y, %z : index, index, index
+ }
+ builtin.module {
+ func.func @matmul_dispatch_0() {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x1024xf32>>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf32>>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+ %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
+ %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %5 = tensor.empty() : tensor<2048x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
+ iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+ return
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable public @matmul_dispatch_0
+// CHECK: module attributes
+// CHECK-SAME: llvm.target_triple = "spirv64-amd-amdhsa"
+// CHECK: llvm.func spir_kernelcc @matmul_dispatch_0
+// CHECK-NOT: addr_space = 7
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir
new file mode 100644
index 0000000..912da1a
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir
@@ -0,0 +1,150 @@
+// RUN: iree-opt --split-input-file --iree-rocdl-prepare-for-spirv %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-rocdl-prepare-for-spirv --iree-rocdl-prepare-for-spirv %s | FileCheck %s --check-prefix=IDEMPOTENT
+
+// IDEMPOTENT: llvm.mlir.global private constant @llvm.cmdline
+
+// Test triple, data layout, calling conventions, attributes, and address spaces.
+
+// CHECK: module attributes
+// CHECK-SAME: llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0"
+// CHECK-SAME: llvm.target_triple = "spirv64-amd-amdhsa"
+
+// Shared memory (AS 3) stays AS 3.
+// CHECK: llvm.mlir.global external @__dynamic_shared_memory__()
+// CHECK-SAME: addr_space = 3
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa"} {
+
+ llvm.mlir.global external @__dynamic_shared_memory__() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
+
+ // CHECK-LABEL: llvm.func spir_kernelcc @simple_kernel
+ llvm.func amdgpu_kernelcc @simple_kernel(%arg0: !llvm.ptr<1> {llvm.align = 16 : i64}) attributes {
+ target_cpu = "gfx1201",
+ target_features = "+wavefrontsize64"
+ } {
+ // Alloca in AS 5 (private) should become AS 0 (Function).
+ // CHECK: llvm.alloca %{{.*}} x f32 {addr_space = 0 : i32} : (i64) -> !llvm.ptr
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %0 = llvm.alloca %c1 x f32 {addr_space = 5 : i32} : (i64) -> !llvm.ptr<5>
+ // Flat/generic AS 0 -> SPIR-V Generic AS 4.
+ // CHECK: llvm.addrspacecast %{{.*}} : !llvm.ptr to !llvm.ptr<4>
+ %1 = llvm.addrspacecast %0 : !llvm.ptr<5> to !llvm.ptr
+ llvm.return
+ }
+
+ // CHECK-LABEL: llvm.func spir_funccc @helper_func
+ llvm.func @helper_func(%arg0: f32) -> f32 {
+ llvm.return %arg0 : f32
+ }
+
+ // CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_shared_mem
+ llvm.func amdgpu_kernelcc @kernel_with_shared_mem() {
+ // CHECK: llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
+ %0 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
+ llvm.return
+ }
+
+ // Intrinsic declarations should keep their calling convention unchanged.
+ // CHECK: llvm.func @llvm.amdgcn.workitem.id.x() -> i32
+ llvm.func @llvm.amdgcn.workitem.id.x() -> i32
+
+ // External device library declarations get spir_func.
+ // CHECK: llvm.func spir_funccc @__ocml_sin_f32(f32) -> f32
+ llvm.func @__ocml_sin_f32(f32) -> f32
+}
+
+// -----
+
+// Test that inreg attributes are removed from kernel arguments.
+
+module {
+ // CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_inreg
+ // CHECK-SAME: (%{{.+}}: i32, %{{.+}}: !llvm.ptr<1>, %{{.+}}: i64)
+ // CHECK-NOT: llvm.inreg
+ llvm.func amdgpu_kernelcc @kernel_with_inreg(
+ %arg0: i32 {llvm.inreg},
+ %arg1: !llvm.ptr<1> {llvm.inreg},
+ %arg2: i64) {
+ llvm.return
+ }
+}
+
+// -----
+
+// Test that rocdl.kernel attribute triggers spir_kernel CC.
+// After ROCDLAnnotateKernelForTranslation, kernels have rocdl.kernel
+// instead of amdgpu_kernel CC.
+
+module {
+ // CHECK-LABEL: llvm.func spir_kernelcc @annotated_kernel
+ // CHECK-NOT: rocdl.kernel
+ // CHECK-NOT: rocdl.flat_work_group_size
+ // CHECK-NOT: rocdl.reqd_work_group_size
+ // CHECK-NOT: rocdl.max_flat_work_group_size
+ llvm.func @annotated_kernel(%arg0: !llvm.ptr<1>)
+ attributes {
+ rocdl.kernel,
+ rocdl.flat_work_group_size = "1,256",
+ rocdl.reqd_work_group_size = dense<[64, 1, 1]> : vector<3xi32>,
+ rocdl.max_flat_work_group_size = 256 : i32
+ } {
+ llvm.return
+ }
+}
+
+// -----
+
+// Test constant address space remapping (AS 4 -> AS 2).
+
+module {
+ // CHECK: llvm.mlir.global external @const_data() {addr_space = 2 : i32}
+ llvm.mlir.global external @const_data() {addr_space = 4 : i32} : !llvm.array<16 x f32>
+}
+
+// -----
+
+// Test that @llvm.cmdline is created for comgr JIT flags.
+
+module {
+ llvm.func @kernel() {
+ llvm.return
+ }
+ // CHECK: llvm.mlir.global private constant @llvm.cmdline("-O3\00")
+ // CHECK-SAME: addr_space = 1
+ // CHECK-SAME: section = ".llvmcmd"
+}
+
+// -----
+
+// Test that an existing @llvm.cmdline has its optimization flag replaced.
+
+module {
+ llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O2\00foo\00")
+ {addr_space = 1 : i32, alignment = 1 : i64, section = ".llvmcmd"} : !llvm.array<13 x i8>
+
+ llvm.func @kernel_with_existing_cmdline() {
+ llvm.return
+ }
+
+ // CHECK: llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O3\00foo\00")
+ // CHECK-SAME: addr_space = 1
+ // CHECK-SAME: section = ".llvmcmd"
+}
+
+// -----
+
+// Test that an existing @llvm.cmdline without an optimization flag has -O3
+// appended.
+
+module {
+ llvm.mlir.global private constant @llvm.cmdline("-cc1\00")
+ {addr_space = 1 : i32, alignment = 1 : i64, section = ".llvmcmd"} : !llvm.array<5 x i8>
+
+ llvm.func @kernel_with_existing_cmdline_without_opt() {
+ llvm.return
+ }
+
+ // CHECK: llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O3\00")
+ // CHECK-SAME: addr_space = 1
+ // CHECK-SAME: section = ".llvmcmd"
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir
new file mode 100644
index 0000000..5b33a6b
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: iree-opt --iree-hal-transformation-pipeline --iree-hal-target-device=hip \
+// RUN: --iree-rocm-target=gfx942 --iree-rocm-use-spirv \
+// RUN: --iree-hal-dump-executable-binaries-to=%t %s -o /dev/null
+// RUN: ls %t | FileCheck %s --check-prefix=FILES --implicit-check-not=.hsaco
+
+// Verify that ROCm serialization uses the --iree-rocm-use-spirv flag to produce
+// a SPIR-V binary through the normal HAL executable source path.
+
+// FILES: module_serialize_spirv_flag_test_rocm_spirv_fb.spv
+
+#pipeline_layout = #hal.pipeline.layout<bindings = []>
+
+hal.executable.source public @serialize_spirv_flag_test {
+ hal.executable.export public @empty ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+ %c1 = arith.constant 1 : index
+ hal.return %c1, %c1, %c1 : index, index, index
+ } attributes {subgroup_size = 64 : index, workgroup_size = [1 : index, 1 : index, 1 : index]}
+ builtin.module attributes {
+ llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0",
+ llvm.target_triple = "spirv64-amd-amdhsa"
+ } {
+ llvm.func spir_kernelcc @empty() attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
+ llvm.return
+ }
+ }
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir
new file mode 100644
index 0000000..3e8022e
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir
@@ -0,0 +1,39 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: iree-opt --iree-hal-transformation-pipeline --iree-rocm-target=gfx942 \
+// RUN: --iree-hal-dump-executable-binaries-to=%t %s -o /dev/null
+// RUN: ls %t | FileCheck %s --check-prefix=FILES --implicit-check-not=.hsaco
+
+// Verify that ROCm serialization still honors a preconfigured SPIR-V HAL
+// executable variant without requiring --iree-rocm-use-spirv.
+
+// FILES: module_serialize_spirv_attr_test_rocm_spirv_fb.spv
+
+#pipeline_layout = #hal.pipeline.layout<bindings = []>
+#target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+ compute = fp32, storage = b32, subgroup = none,
+ subgroup_size_choices = [64],
+ max_workgroup_sizes = [1024, 1024, 1024],
+ max_thread_count_per_workgroup = 1024,
+ max_workgroup_memory_bytes = 65536,
+ max_workgroup_counts = [2147483647, 2147483647, 2147483647]
+>>
+
+hal.executable public @serialize_spirv_attr_test {
+ hal.executable.variant public @rocm_spirv_fb target(<"rocm", "rocm-spirv-fb", {
+ iree_codegen.target_info = #target,
+ ukernels = "none"
+ }>) {
+ hal.executable.export public @empty ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+ %c1 = arith.constant 1 : index
+ hal.return %c1, %c1, %c1 : index, index, index
+ } attributes {subgroup_size = 64 : index, workgroup_size = [1 : index, 1 : index, 1 : index]}
+ builtin.module attributes {
+ llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0",
+ llvm.target_triple = "spirv64-amd-amdhsa"
+ } {
+ llvm.func spir_kernelcc @empty() attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
+ llvm.return
+ }
+ }
+ }
+}
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 4c38db7..f6fb6aa 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -766,7 +766,10 @@
}
if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) {
- *out_value = iree_string_view_equal(key, IREE_SV("rocm-hsaco-fb")) ? 1 : 0;
+ *out_value = (iree_string_view_equal(key, IREE_SV("rocm-hsaco-fb")) ||
+ iree_string_view_equal(key, IREE_SV("rocm-spirv-fb")))
+ ? 1
+ : 0;
return iree_ok_status();
}