[ROCM][Codegen] Add experimental amdgcn SPIR-V path (#24499)

Adds an experimental ROCm amdgcn SPIR-V path that emits HIP-loadable
SPIR-V instead of native HSACO, enabling the HIP runtime to JIT compile
device code for the target GPU. The main changes are:

- `--iree-rocm-use-spirv` flag to select `rocm-spirv-fb` for HIP
targets.
- ROCDL prepare for SPIR-V: Adjust address spaces, calling conventions,
remove AMDGPU-specific function attributes.
- ROCMTarget: Serialize LLVM SPIR-V output into a HIP-loadable offload
bundle.
- Focused lit coverage for lowering, serialization, command-line
handling.

---------

Signed-off-by: Austin Lu <aulu@amd.com>
Co-authored-by: Jakub Kuderski <jakub@nod-labs.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a692255..65a5176 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -464,6 +464,16 @@
   "Target chip for ROCm tests that need to compile device code. \
    Defaults to empty string to disable tests.")
 
+# Backward compatibility: accept the old IREE_HIP_TEST_TARGET_CHIP name.
+set(IREE_HIP_TEST_TARGET_CHIP "" CACHE STRING
+  "Deprecated; use IREE_ROCM_TEST_TARGET_CHIP instead.")
+if(IREE_HIP_TEST_TARGET_CHIP AND NOT IREE_ROCM_TEST_TARGET_CHIP)
+  message(WARNING "IREE_HIP_TEST_TARGET_CHIP is deprecated; use IREE_ROCM_TEST_TARGET_CHIP instead.")
+  set(IREE_ROCM_TEST_TARGET_CHIP "${IREE_HIP_TEST_TARGET_CHIP}" CACHE STRING "" FORCE)
+endif()
+set(IREE_ROCM_TEST_AMDGCNSPIRV OFF CACHE BOOL
+  "Use amdgcnspirv (SPIR-V output) for ROCm e2e tests.")
+
 #-------------------------------------------------------------------------------
 # Compiler Target Options
 # We try to keep the default build as simple as possible and disable heavy targets.
diff --git a/MODULE.bazel b/MODULE.bazel
index 7bcb75d..15f7553 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -108,6 +108,7 @@
         "ARM",
         "NVPTX",
         "RISCV",
+        "SPIRV",
         "WebAssembly",
         "X86",
     ],
diff --git a/build_tools/cmake/iree_check_test.cmake b/build_tools/cmake/iree_check_test.cmake
index 77c3010..72582ad 100644
--- a/build_tools/cmake/iree_check_test.cmake
+++ b/build_tools/cmake/iree_check_test.cmake
@@ -162,6 +162,9 @@
   endif()
   if(_NORMALIZED_TARGET_BACKEND STREQUAL "ROCM")
     list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-target=${IREE_ROCM_TEST_TARGET_CHIP}")
+    if(IREE_ROCM_TEST_AMDGCNSPIRV)
+      list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-use-spirv")
+    endif()
   endif()
 
   if(_BYTECODE_MODULE_BUILD_ENABLED)
diff --git a/build_tools/cmake/iree_e2e_generated_runner_test.cmake b/build_tools/cmake/iree_e2e_generated_runner_test.cmake
index 7332f78..dd850ca 100644
--- a/build_tools/cmake/iree_e2e_generated_runner_test.cmake
+++ b/build_tools/cmake/iree_e2e_generated_runner_test.cmake
@@ -60,6 +60,13 @@
   set(_BASE_COMPILER_FLAGS
     "--iree-hal-target-backends=${_RULE_TARGET_BACKEND}"
   )
+  string(TOUPPER ${_RULE_TARGET_BACKEND} _UPPERCASE_TARGET_BACKEND)
+  string(REPLACE "-" "_" _NORMALIZED_TARGET_BACKEND ${_UPPERCASE_TARGET_BACKEND})
+  if(_NORMALIZED_TARGET_BACKEND STREQUAL "ROCM")
+    if(IREE_ROCM_TEST_AMDGCNSPIRV)
+      list(APPEND _BASE_COMPILER_FLAGS "--iree-rocm-use-spirv")
+    endif()
+  endif()
 
   if(NOT TARGET "${_NAME}_${_RULE_TEST_TYPE}_module")
     iree_bytecode_module(
diff --git a/build_tools/cmake/iree_llvm.cmake b/build_tools/cmake/iree_llvm.cmake
index 0c52e32..7a86a5d 100644
--- a/build_tools/cmake/iree_llvm.cmake
+++ b/build_tools/cmake/iree_llvm.cmake
@@ -241,7 +241,7 @@
     message(STATUS "  - metal-spirv")
   endif()
   if(IREE_TARGET_BACKEND_ROCM OR IREE_HAL_DRIVER_AMDGPU)
-    list(APPEND LLVM_TARGETS_TO_BUILD AMDGPU)
+    list(APPEND LLVM_TARGETS_TO_BUILD AMDGPU SPIRV)
     set(IREE_CLANG_TARGET clang)
   endif()
   if(IREE_TARGET_BACKEND_VULKAN_SPIRV)
diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel
index 8c8b68e..c1190b2 100644
--- a/compiler/plugins/target/ROCM/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/BUILD.bazel
@@ -59,6 +59,7 @@
         "@llvm-project//llvm:Linker",
         "@llvm-project//llvm:MC",
         "@llvm-project//llvm:Passes",
+        "@llvm-project//llvm:SPIRVCodeGen",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
         "@llvm-project//llvm:TransformUtils",
diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt
index 4e1a87c..442103a 100644
--- a/compiler/plugins/target/ROCM/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/CMakeLists.txt
@@ -35,6 +35,7 @@
     LLVMLinker
     LLVMMC
     LLVMPasses
+    LLVMSPIRVCodeGen
     LLVMSupport
     LLVMTarget
     LLVMTransformUtils
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index 7fe09af..06d9aad 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -39,6 +39,8 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
@@ -194,6 +196,7 @@
       IREE::Codegen::DenormalFpMath::None;
   bool enableRegSpillWarning = false;
   bool debugSymbols = false;
+  bool useAmdgcnSpirv = false;
 
   void bindOptions(OptionsBinder &binder) {
     using namespace llvm;
@@ -290,6 +293,11 @@
     binder.opt<bool>("iree-rocm-emit-debug-info", debugSymbols,
                      cl::cat(category),
                      cl::desc("Generate and embed debug information (DWARF)."));
+
+    binder.opt<bool>(
+        "iree-rocm-use-spirv", useAmdgcnSpirv, cl::cat(category),
+        cl::desc("Produce SPIR-V binary (amdgcnspirv) instead of native ISA. "
+                 "The HIP runtime JIT-compiles the SPIR-V to native ISA."));
   }
 
   LogicalResult verify(mlir::Builder &builder) const {
@@ -414,7 +422,9 @@
 
     addConfig("abi", b.getStringAttr(deviceID));
     std::string format;
-    if (deviceID == "amdgpu") {
+    if (targetOptions.useAmdgcnSpirv) {
+      format = "rocm-spirv-fb";
+    } else if (deviceID == "amdgpu") {
       FailureOr<std::string> targetID =
           buildAMDGPUTargetID(b.getUnknownLoc(), targetOptions.target,
                               targetOptions.targetFeatures);
@@ -565,8 +575,12 @@
 
   void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr targetAttr,
                                     OpPassManager &passManager) final {
-    buildLLVMGPUCodegenPassPipeline(passManager.nest<ModuleOp>(), true,
-                                    targetOptions.debugSymbols);
+    // Derive SPIR-V mode from the target format rather than the CLI flag,
+    // so that programmatically-constructed target attrs work correctly.
+    bool useSPIRV = targetAttr.getFormat() == "rocm-spirv-fb";
+    buildLLVMGPUCodegenPassPipeline(
+        passManager.nest<ModuleOp>(), /*useROCM=*/true,
+        targetOptions.debugSymbols, /*includeLLVMLowering=*/true, useSPIRV);
     buildCodegenTranslationPostProcessingPassPipeline(passManager);
   }
 
@@ -638,9 +652,14 @@
 
   LogicalResult
   validateFinalizedModule(IREE::HAL::ExecutableVariantOp variantOp,
-                          llvm::Module &module) {
+                          llvm::Module &module, bool allowExternalDecls) {
     for (llvm::Function &func : module.functions()) {
       if (func.isDeclaration() && !func.isIntrinsic() && !func.use_empty()) {
+        // In SPIR-V mode, external declarations (e.g. __ocml_*, __ockl_*,
+        // amdgcn intrinsics) are expected — they are resolved at JIT time.
+        if (allowExternalDecls) {
+          continue;
+        }
         llvm::User *liveUser = *func.user_begin();
         return variantOp.emitError()
                << "found an unresolved external function '" << func.getName()
@@ -657,6 +676,13 @@
                       OpBuilder &executableBuilder) final {
     ModuleOp innerModuleOp = variantOp.getInnerModule();
     auto targetAttr = variantOp.getTargetAttr();
+    bool useSPIRV = targetAttr.getFormat() == "rocm-spirv-fb";
+    if (targetOptions.useAmdgcnSpirv && !useSPIRV) {
+      return variantOp.emitError()
+             << "--iree-rocm-use-spirv requires ROCm executable target format "
+                "'rocm-spirv-fb', but got '"
+             << targetAttr.getFormat() << "'";
+    }
     StringRef targetArch = targetOptions.target;
     StringRef targetFeatures = targetOptions.targetFeatures;
     if (auto attr = getGPUTargetAttr(variantOp.getContext(), targetAttr)) {
@@ -821,15 +847,20 @@
         }
       }
 
-      llvmModule->setDataLayout(targetMachine->createDataLayout());
-
       // Code object version * 100.
       constexpr uint32_t abiVersion = 500;
-      // Let the backend know what code object version we're compiling for. This
-      // insulates us from changes to the default code object version that our
-      // CI or users may not be prepared for.
-      llvmModule->addModuleFlag(llvm::Module::Error,
-                                "amdhsa_code_object_version", abiVersion);
+
+      // For SPIR-V mode, the data layout is already set by PrepareForSPIRVPass
+      // on the MLIR module and propagated through MLIR->LLVM IR translation.
+      if (!useSPIRV) {
+        llvmModule->setDataLayout(targetMachine->createDataLayout());
+
+        // Let the backend know what code object version we're compiling for.
+        // This insulates us from changes to the default code object version
+        // that our CI or users may not be prepared for.
+        llvmModule->addModuleFlag(llvm::Module::Error,
+                                  "amdhsa_code_object_version", abiVersion);
+      }
 
       for (llvm::Function &f : llvmModule->functions()) {
         f.addFnAttr(llvm::Attribute::AlwaysInline);
@@ -863,23 +894,25 @@
                << targetArch.str() << "'";
       }
 
-      // Link module to HIP device library.
-      if (targetOptions.bitcodeDirectory.empty()) {
-        return variantOp.emitError()
-               << "cannot find ROCM bitcode files. Check your installation "
-                  "consistency and in the worst case, set "
-                  "--iree-rocm-bc-dir= to a path on your system.";
-      }
-      if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
-                                        targetArch,
-                                        targetOptions.bitcodeDirectory))) {
-        return failure();
-      }
+      // Link module to HIP device library (skip for SPIR-V — resolved at JIT).
+      if (!useSPIRV) {
+        if (targetOptions.bitcodeDirectory.empty()) {
+          return variantOp.emitError()
+                 << "cannot find ROCM bitcode files. Check your installation "
+                    "consistency and in the worst case, set "
+                    "--iree-rocm-bc-dir= to a path on your system.";
+        }
+        if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(),
+                                          targetArch,
+                                          targetOptions.bitcodeDirectory))) {
+          return failure();
+        }
 
-      // Sets HIP platform globals based on the target architecture.
-      if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset,
-                               isWave64, abiVersion))) {
-        return failure();
+        // Sets HIP platform globals based on the target architecture.
+        if (failed(setHIPGlobals(variantOp.getLoc(), llvmModule.get(), chipset,
+                                 isWave64, abiVersion))) {
+          return failure();
+        }
       }
 
       if (!serializationOptions.dumpIntermediatesPath.empty()) {
@@ -895,9 +928,14 @@
       std::string targetTriple = targetMachine->getTargetTriple().str();
 
       // Run LLVM optimization passes.
+      // For SPIR-V mode, skip AMDGPU-specific optimization which introduces
+      // buffer resource intrinsics (AS 7) and other constructs incompatible
+      // with the SPIR-V backend.
       std::string passesString;
-      optimizeModule(*llvmModule, *targetMachine,
-                     targetOptions.slpVectorization, passesString);
+      if (!useSPIRV) {
+        optimizeModule(*llvmModule, *targetMachine,
+                       targetOptions.slpVectorization, passesString);
+      }
       if (!serializationOptions.dumpIntermediatesPath.empty()) {
         // Additional context on '-mcpu' flag in PR comments, see for example:
         // https://github.com/iree-org/iree/pull/20716#issuecomment-2851650421
@@ -915,48 +953,133 @@
                          ".optimized.ll", *llvmModule, header);
       }
 
-      if (failed(validateFinalizedModule(variantOp, *llvmModule))) {
+      if (failed(validateFinalizedModule(variantOp, *llvmModule, useSPIRV))) {
         return failure();
       }
 
-      // Dump the assembly output.
-      if (!serializationOptions.dumpIntermediatesPath.empty()) {
-        auto moduleCopy = llvm::CloneModule(*llvmModule);
-        if (!moduleCopy) {
-          llvm::errs() << "Error: cloning LLVM IR failed\n";
-          return failure();
+      if (useSPIRV) {
+        // SPIR-V codegen path: create a SPIR-V TargetMachine and emit binary.
+        llvm::Triple spirvTriple("spirv64-amd-amdhsa");
+        std::string spirvError;
+        const llvm::Target *spirvTarget =
+            llvm::TargetRegistry::lookupTarget("", spirvTriple, spirvError);
+        if (!spirvTarget) {
+          return variantOp.emitError()
+                 << "cannot find SPIR-V target: " << spirvError;
         }
-        std::string asmHeader = llvm::formatv(
-            R"TXT(; To reproduce the .rocmasm from .optimized.ll, run:
+        auto spirvTM = std::unique_ptr<llvm::TargetMachine>(
+            spirvTarget->createTargetMachine(
+                spirvTriple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions{},
+                llvm::Reloc::PIC_, std::nullopt,
+                llvm::CodeGenOptLevel::Default));
+
+        if (!spirvTM) {
+          return variantOp.emitError() << "cannot create SPIR-V target machine";
+        }
+
+        // If requested, dump the final .ll just before conversion to
+        // SPIRV-binary
+        if (!serializationOptions.dumpIntermediatesPath.empty()) {
+          dumpModuleToPath(serializationOptions.dumpIntermediatesPath,
+                           serializationOptions.dumpBaseName,
+                           variantOp.getName(), ".final.ll", *llvmModule);
+        }
+
+        // Emit SPIR-V binary (no lld/hsaco needed).
+        // The MLIR PrepareForSPIRV pass set triple, data layout, calling
+        // conventions, stripped assumes, and removed AMDGPU attrs. We
+        // reassert the data layout here from the TM to guarantee it matches
+        // exactly (MachineFunction::init() asserts on mismatch).
+        llvmModule->setDataLayout(spirvTM->createDataLayout());
+        llvmModule->setTargetTriple(spirvTriple);
+        std::string spirvBinary = translateModuleToObj(*llvmModule, *spirvTM);
+        if (spirvBinary.empty()) {
+          return variantOp.emitError()
+                 << "SPIR-V binary translation produced empty output";
+        }
+
+        if (!serializationOptions.dumpIntermediatesPath.empty()) {
+          dumpDataToPath(serializationOptions.dumpIntermediatesPath,
+                         serializationOptions.dumpBaseName, variantOp.getName(),
+                         ".spv", spirvBinary);
+        }
+
+        // Wrap the SPIR-V binary in a clang offload bundle. The HIP runtime
+        // expects this format — raw SPIR-V is not accepted by
+        // hipModuleLoadData.
+        // Note: Multi-byte fields use little-endian byte order, matching the
+        // Clang offload bundler binary format specification.
+        const std::string bundleTarget = "hip-spirv64-amd-amdhsa--amdgcnspirv";
+        const char bundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
+        const uint64_t numEntries = 1;
+        const uint64_t headerSize = sizeof(bundleMagic) - 1 +
+                                    sizeof(numEntries) + 3 * sizeof(uint64_t) +
+                                    bundleTarget.size();
+        const uint64_t dataOffset = headerSize;
+        const uint64_t dataSize = spirvBinary.size();
+
+        std::string bundle;
+        bundle.reserve(headerSize + dataSize);
+        // Magic string (no null terminator).
+        bundle.append(bundleMagic, sizeof(bundleMagic) - 1);
+        // Number of entries.
+        bundle.append(reinterpret_cast<const char *>(&numEntries),
+                      sizeof(numEntries));
+        // Entry: offset, size, target string length, target string.
+        bundle.append(reinterpret_cast<const char *>(&dataOffset),
+                      sizeof(dataOffset));
+        bundle.append(reinterpret_cast<const char *>(&dataSize),
+                      sizeof(dataSize));
+        uint64_t targetLen = bundleTarget.size();
+        bundle.append(reinterpret_cast<const char *>(&targetLen),
+                      sizeof(targetLen));
+        bundle.append(bundleTarget);
+        // SPIR-V binary data.
+        bundle.append(spirvBinary);
+        targetHSACO = std::move(bundle);
+      } else {
+        // Native ISA codegen path.
+        // Dump the assembly output.
+        if (!serializationOptions.dumpIntermediatesPath.empty()) {
+          auto moduleCopy = llvm::CloneModule(*llvmModule);
+          if (!moduleCopy) {
+            llvm::errs() << "Error: cloning LLVM IR failed\n";
+            return failure();
+          }
+          std::string asmHeader = llvm::formatv(
+              R"TXT(; To reproduce the .rocmasm from .optimized.ll, run:
 ; llc -mtriple={} -mcpu={} -mattr='{}' -O3 <.optimized.ll> -o <out.rocmasm>
 
 )TXT",
-            targetTriple, targetCPU, targetMachine->getTargetFeatureString());
+              targetTriple, targetCPU, targetMachine->getTargetFeatureString());
 
-        std::string targetISA =
-            translateModuleToISA(*moduleCopy.get(), *targetMachine);
-        dumpDataToPath(serializationOptions.dumpIntermediatesPath,
-                       serializationOptions.dumpBaseName, variantOp.getName(),
-                       ".rocmasm", asmHeader + targetISA);
-      }
+          std::string targetISA =
+              translateModuleToISA(*moduleCopy.get(), *targetMachine);
+          dumpDataToPath(serializationOptions.dumpIntermediatesPath,
+                         serializationOptions.dumpBaseName, variantOp.getName(),
+                         ".rocmasm", asmHeader + targetISA);
+        }
 
-      // Serialize hsaco kernel into the binary that we will embed in the
-      // final FlatBuffer.
-      std::string targetObj = translateModuleToObj(*llvmModule, *targetMachine);
-      targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName);
-      if (targetHSACO.empty()) {
-        return failure();
-      }
+        // Serialize hsaco kernel into the binary that we will embed in the
+        // final FlatBuffer.
+        std::string targetObj =
+            translateModuleToObj(*llvmModule, *targetMachine);
+        targetHSACO = createHsaco(variantOp.getLoc(), targetObj, libraryName);
+        if (targetHSACO.empty()) {
+          return failure();
+        }
 
-      if (targetOptions.enableRegSpillWarning) {
-        checkRegisterSpilling(variantOp, targetObj);
+        if (targetOptions.enableRegSpillWarning) {
+          checkRegisterSpilling(variantOp, targetObj);
+        }
       }
     }
 
     if (!serializationOptions.dumpBinariesPath.empty()) {
+      StringRef ext = useSPIRV ? ".spv" : ".hsaco";
       dumpDataToPath(serializationOptions.dumpBinariesPath,
                      serializationOptions.dumpBaseName, variantOp.getName(),
-                     ".hsaco", targetHSACO);
+                     ext, targetHSACO);
     }
 
     // Determine container type from the target ABI attribute.
@@ -1273,6 +1396,11 @@
       LLVMInitializeAMDGPUTargetInfo();
       LLVMInitializeAMDGPUAsmParser();
       LLVMInitializeAMDGPUAsmPrinter();
+      // Also initialize SPIR-V target for amdgcnspirv mode.
+      LLVMInitializeSPIRVTarget();
+      LLVMInitializeSPIRVTargetInfo();
+      LLVMInitializeSPIRVTargetMC();
+      LLVMInitializeSPIRVAsmPrinter();
       return std::make_shared<ROCMTargetBackend>(options, codegenOptions);
     });
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index c99e24f..0d46708 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -145,6 +145,7 @@
         "ROCDLConfigureBufferInstructions.cpp",
         "ROCDLLoadToTransposeLoad.cpp",
         "ROCDLPrefetching.cpp",
+        "ROCDLPrepareForSPIRV.cpp",
         "Verifiers.cpp",
     ],
     hdrs = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 87281aa..2e32fec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -119,6 +119,7 @@
     "ROCDLConfigureBufferInstructions.cpp"
     "ROCDLLoadToTransposeLoad.cpp"
     "ROCDLPrefetching.cpp"
+    "ROCDLPrepareForSPIRV.cpp"
     "Verifiers.cpp"
   DEPS
     ::LLVMGPUConstraintGenerator
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 4aac152..1a6a3a5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -968,7 +968,8 @@
 }
 
 static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager,
-                                    bool forROCDL, bool preserveDebugInfo) {
+                                    bool forROCDL, bool preserveDebugInfo,
+                                    bool useSPIRV = false) {
   modulePassManager.addPass(
       createConvertHALDescriptorTypeToGPUAddressSpacePass());
   modulePassManager.addPass(createCanonicalizerPass());
@@ -1088,6 +1089,9 @@
     modulePassManager.addPass(createConvertToROCDLPass());
     modulePassManager.addNestedPass<LLVM::LLVMFuncOp>(
         createROCDLAnnotateKernelForTranslationPass());
+    if (useSPIRV) {
+      modulePassManager.addPass(createROCDLPrepareForSPIRVPass());
+    }
   } else {
     // Convert to NVVM.
     modulePassManager.addPass(createConvertToNVVMPass());
@@ -1160,7 +1164,7 @@
 
 void buildLLVMGPUCodegenPassPipeline(OpPassManager &modulePassManager,
                                      bool useROCM, bool preserveDebugInfo,
-                                     bool includeLLVMLowering) {
+                                     bool includeLLVMLowering, bool useSPIRV) {
   modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
   LLVMGPULowerExecutableTargetPassOptions options;
   options.forROCDL = useROCM;
@@ -1184,7 +1188,8 @@
   //   - The module contains the final llvm.module ready to be serialized.
   //===--------------------------------------------------------------------===//
   if (includeLLVMLowering) {
-    addLowerToLLVMGPUPasses(modulePassManager, useROCM, preserveDebugInfo);
+    addLowerToLLVMGPUPasses(modulePassManager, useROCM, preserveDebugInfo,
+                            useSPIRV);
   }
 
   LLVM_DEBUG({
@@ -1281,6 +1286,9 @@
         *this, "include-llvm-lowering",
         llvm::cl::desc("Include the lowering to LLVM dialect."),
         llvm::cl::init(true)};
+    Option<bool> useSPIRV{
+        *this, "use-spirv",
+        llvm::cl::desc("Prepare LLVM dialect IR for the SPIR-V backend")};
   };
 
   static PassPipelineRegistration<> LLVMGPUConfigPipeline(
@@ -1308,9 +1316,9 @@
           "Runs the LLVMGPU ROCDL lowering pipeline",
           [](OpPassManager &modulePassManager,
              const LLVMGPULoweringPipelineOptions &options) {
-            buildLLVMGPUCodegenPassPipeline(modulePassManager, true,
-                                            options.preserveDebugInfo,
-                                            options.includeLLVMLowering);
+            buildLLVMGPUCodegenPassPipeline(
+                modulePassManager, true, options.preserveDebugInfo,
+                options.includeLLVMLowering, options.useSPIRV);
           });
 
   static PassPipelineRegistration<> LLVMGPULinkingPipeline(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 9515e4e..6467e40 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -79,7 +79,8 @@
 /// module within the IREE::HAL::ExecutableOp.
 void buildLLVMGPUCodegenPassPipeline(OpPassManager &modulePassManager,
                                      bool useROCM, bool preserveDebugInfo,
-                                     bool includeLLVMLowering = true);
+                                     bool includeLLVMLowering = true,
+                                     bool useSPIRV = false);
 
 /// Wraps GPUPipelineOptions and forROCDL for passing through
 /// PipelineAttrInterface::buildPipeline.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
index 584d433..9382232 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLConfigureBufferInstructions.cpp
@@ -127,6 +127,12 @@
     if (!target || !target.isAMD()) {
       return;
     }
+    // Buffer instructions (address space 7) are not supported by the SPIR-V
+    // backend, so skip when targeting SPIR-V output.
+    auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+    if (execTarget && execTarget.getFormat() == "rocm-spirv-fb") {
+      return;
+    }
 
     // Initialize the DataFlowSolver with IntegerRangeAnalysis.
     DataFlowSolver solver;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
index 321ee1a..a3b7176 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
@@ -73,4 +73,20 @@
   ];
 }
 
+def ROCDLPrepareForSPIRVPass :
+    Pass<"iree-rocdl-prepare-for-spirv", "ModuleOp"> {
+  let summary = "Prepare LLVM dialect IR for SPIR-V backend codegen.";
+  let description = [{
+    Transforms AMDGPU-flavored LLVM dialect IR into SPIR-V-compatible form:
+    - Remaps address spaces (AMDGPU -> SPIR-V conventions)
+    - Changes calling conventions (amdgpu_kernel -> spir_kernel, etc.)
+    - Removes ROCDL-specific function attributes
+    - Removes inreg parameter attributes (AMDGPU argument preloading)
+    - Sets module triple and data layout for spirv64-amd-amdhsa
+  }];
+  let dependentDialects = [
+    "LLVM::LLVMDialect",
+  ];
+}
+
 #endif // IREE_CODEGEN_LLVMGPU_ROCDLPASSES
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp
new file mode 100644
index 0000000..063316e
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPrepareForSPIRV.cpp
@@ -0,0 +1,318 @@
+// Copyright 2026 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-rocdl-prepare-for-spirv"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_ROCDLPREPAREFORSPIRVPASS
+#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"
+
+namespace {
+
+// Remaps AMDGPU address spaces to SPIR-V address spaces.
+//   AMDGPU 5 (private/stack) -> SPIR-V 0 (Function)
+//   AMDGPU 0 (flat/generic)  -> SPIR-V 4 (Generic)
+//   AMDGPU 4 (constant)      -> SPIR-V 2 (UniformConstant)
+//   AMDGPU 1 (global)        -> 1 (no change)
+//   AMDGPU 3 (shared/LDS)    -> 3 (no change)
+static unsigned remapAddressSpace(unsigned as) {
+  switch (as) {
+  case 5:
+    return 0; // private -> Function
+  case 0:
+    return 4; // flat/generic -> Generic
+  case 4:
+    return 2; // constant -> UniformConstant
+  default:
+    return as; // 1 (global) and 3 (shared) stay the same
+  }
+}
+
+// Remaps address spaces in a type. Recursively handles pointer and array types.
+static Type remapType(Type type) {
+  if (auto ptrType = dyn_cast<LLVM::LLVMPointerType>(type)) {
+    unsigned newAS = remapAddressSpace(ptrType.getAddressSpace());
+    if (newAS != ptrType.getAddressSpace()) {
+      return LLVM::LLVMPointerType::get(type.getContext(), newAS);
+    }
+    return type;
+  }
+  if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(type)) {
+    Type newElem = remapType(arrayType.getElementType());
+    if (newElem != arrayType.getElementType()) {
+      return LLVM::LLVMArrayType::get(newElem, arrayType.getNumElements());
+    }
+    return type;
+  }
+  if (auto funcType = dyn_cast<LLVM::LLVMFunctionType>(type)) {
+    Type newRet = remapType(funcType.getReturnType());
+    SmallVector<Type> newParams;
+    bool changed = newRet != funcType.getReturnType();
+    for (Type param : funcType.getParams()) {
+      Type newParam = remapType(param);
+      newParams.push_back(newParam);
+      changed |= newParam != param;
+    }
+    if (changed) {
+      return LLVM::LLVMFunctionType::get(newRet, newParams,
+                                         funcType.isVarArg());
+    }
+    return type;
+  }
+  if (auto structType = dyn_cast<LLVM::LLVMStructType>(type)) {
+    if (structType.isOpaque()) {
+      return type;
+    }
+    SmallVector<Type> newBody;
+    bool changed = false;
+    for (Type elem : structType.getBody()) {
+      Type newElem = remapType(elem);
+      newBody.push_back(newElem);
+      changed |= newElem != elem;
+    }
+    if (changed) {
+      // TODO: Identified structs lose their name here. This is acceptable
+      // for the current IR patterns but needs proper handling for recursive
+      // struct types if they ever appear.
+      return LLVM::LLVMStructType::getLiteral(type.getContext(), newBody,
+                                              structType.isPacked());
+    }
+    return type;
+  }
+  return type;
+}
+
+// List of ROCDL-specific attributes to remove for SPIR-V.
+// Note: AMDGPU attributes are handled by the backend.
+static constexpr llvm::StringLiteral kROCDLGPUAttrsToRemove[] = {
+    // ROCDL-specific attributes set by ROCDLAnnotateKernelForTranslationPass.
+    "rocdl.kernel",
+    "rocdl.flat_work_group_size",
+    "rocdl.reqd_work_group_size",
+    "rocdl.max_flat_work_group_size",
+};
+
+static bool isOptimizationLevelFlag(StringRef arg) {
+  return arg == "-O0" || arg == "-O1" || arg == "-O2" || arg == "-O3" ||
+         arg == "-Os" || arg == "-Oz";
+}
+
+static std::string ensureO3InCmdline(StringRef cmdline) {
+  SmallVector<StringRef> args;
+  cmdline.split(args, '\0', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+
+  std::string result;
+  bool hasOptLevel = false;
+  for (StringRef arg : args) {
+    if (isOptimizationLevelFlag(arg)) {
+      if (hasOptLevel) {
+        continue;
+      }
+      arg = "-O3";
+      hasOptLevel = true;
+    }
+    result.append(arg.data(), arg.size());
+    result.push_back('\0');
+  }
+
+  if (!hasOptLevel) {
+    result.append("-O3", 3);
+    result.push_back('\0');
+  }
+
+  return result;
+}
+
+struct ROCDLPrepareForSPIRVPass final
+    : impl::ROCDLPrepareForSPIRVPassBase<ROCDLPrepareForSPIRVPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<LLVM::LLVMDialect>();
+  }
+
+  void runOnOperation() override {
+    ModuleOp moduleOp = getOperation();
+
+    // Set module triple and data layout for SPIR-V.
+    moduleOp->setAttr(
+        LLVM::LLVMDialect::getTargetTripleAttrName(),
+        StringAttr::get(moduleOp.getContext(), "spirv64-amd-amdhsa"));
+    // Data layout from llvm::Triple("spirv64-amd-amdhsa").computeDataLayout().
+    // Must match exactly or MachineFunction::init() asserts.
+    moduleOp->setAttr(
+        LLVM::LLVMDialect::getDataLayoutAttrName(),
+        StringAttr::get(
+            moduleOp.getContext(),
+            "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+            "-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0"));
+
+    // Process all functions: remap calling conventions and remove AMDGPU attrs.
+    moduleOp->walk([&](LLVM::LLVMFuncOp funcOp) {
+      // Remap calling conventions.
+      // After ROCDLAnnotateKernelForTranslationPass, kernel functions have
+      // rocdl.kernel attribute instead of amdgpu_kernel CC.
+      auto cc = funcOp.getCConv();
+      if (cc == LLVM::CConv::AMDGPU_KERNEL || funcOp->hasAttr("rocdl.kernel")) {
+        funcOp.setCConv(LLVM::CConv::SPIR_KERNEL);
+      } else if (!funcOp.getName().starts_with("llvm.")) {
+        // All non-intrinsic functions (definitions and declarations) get
+        // spir_func. LLVM intrinsics (llvm.*) keep their default CC.
+        funcOp.setCConv(LLVM::CConv::SPIR_FUNC);
+      }
+
+      // Remove ROCDLGPU attributes from the function's attribute dictionary.
+      for (auto attrName : kROCDLGPUAttrsToRemove) {
+        funcOp->removeAttr(attrName);
+      }
+
+      // Also clean llvm_func_attrs if present.
+      if (auto funcAttrs =
+              funcOp->getAttrOfType<DictionaryAttr>("llvm_func_attrs")) {
+        SmallVector<NamedAttribute> newAttrs;
+        for (NamedAttribute attr : funcAttrs) {
+          if (!llvm::is_contained(kROCDLGPUAttrsToRemove, attr.getName())) {
+            newAttrs.push_back(attr);
+          }
+        }
+        if (newAttrs.size() != funcAttrs.size()) {
+          if (newAttrs.empty()) {
+            funcOp->removeAttr("llvm_func_attrs");
+          } else {
+            funcOp->setAttr(
+                "llvm_func_attrs",
+                DictionaryAttr::get(moduleOp.getContext(), newAttrs));
+          }
+        }
+      }
+
+      // Remove inreg attributes from all parameters. AMDGPU uses inreg for
+      // argument preloading which is incompatible with SPIR-V.
+      // TODO: Clean this up once the SPIR-V backend can handle this.
+      for (unsigned i = 0; i < funcOp.getNumArguments(); ++i) {
+        funcOp.removeArgAttr(i, "llvm.inreg");
+      }
+
+      // Remap function signature types (pointer address spaces in
+      // params/return).
+      auto funcType = funcOp.getFunctionType();
+      Type newRetType = remapType(funcType.getReturnType());
+      SmallVector<Type> newParamTypes;
+      bool sigChanged = newRetType != funcType.getReturnType();
+      for (Type param : funcType.getParams()) {
+        Type newParam = remapType(param);
+        newParamTypes.push_back(newParam);
+        sigChanged |= newParam != param;
+      }
+      if (sigChanged) {
+        funcOp.setFunctionType(LLVM::LLVMFunctionType::get(
+            newRetType, newParamTypes, funcType.isVarArg()));
+        // Also update block argument types.
+        if (!funcOp.isDeclaration()) {
+          Block &entryBlock = funcOp.getBody().front();
+          for (auto [i, arg] : llvm::enumerate(entryBlock.getArguments())) {
+            if (i < newParamTypes.size()) {
+              arg.setType(newParamTypes[i]);
+            }
+          }
+        }
+      }
+    });
+
+    // Remap address spaces on all operations.
+    moduleOp->walk([&](Operation *op) {
+      // Skip function ops — they were already handled above.
+      if (isa<LLVM::LLVMFuncOp>(op)) {
+        return;
+      }
+
+      // Handle ops with explicit addr_space attributes first.
+      if (auto allocaOp = dyn_cast<LLVM::AllocaOp>(op)) {
+        // The address space is encoded in the result pointer type.
+        unsigned oldAS = allocaOp.getRes().getType().getAddressSpace();
+        unsigned newAS = remapAddressSpace(oldAS);
+        if (newAS != oldAS) {
+          allocaOp.getRes().setType(
+              LLVM::LLVMPointerType::get(op->getContext(), newAS));
+        }
+        // Also remap the addr_space attribute if present (discardable attr).
+        if (auto addrSpaceAttr =
+                allocaOp->getAttrOfType<IntegerAttr>("addr_space")) {
+          unsigned attrAS = addrSpaceAttr.getInt();
+          unsigned newAttrAS = remapAddressSpace(attrAS);
+          if (newAttrAS != attrAS) {
+            allocaOp->setAttr(
+                "addr_space",
+                IntegerAttr::get(addrSpaceAttr.getType(), newAttrAS));
+          }
+        }
+        return;
+      }
+      if (auto globalOp = dyn_cast<LLVM::GlobalOp>(op)) {
+        unsigned oldAS = globalOp.getAddrSpace();
+        unsigned newAS = remapAddressSpace(oldAS);
+        if (newAS != oldAS) {
+          globalOp.setAddrSpace(newAS);
+        }
+        Type newGlobalType = remapType(globalOp.getGlobalType());
+        if (newGlobalType != globalOp.getGlobalType()) {
+          globalOp.setGlobalType(newGlobalType);
+        }
+        return;
+      }
+
+      // For all other ops, remap result types.
+      for (auto [i, resultType] : llvm::enumerate(op->getResultTypes())) {
+        Type newType = remapType(resultType);
+        if (newType != resultType) {
+          op->getResult(i).setType(newType);
+        }
+      }
+    });
+
+    // Embed @llvm.cmdline with "-O3" so comgr JIT compiles at -O3.
+    // Without this, comgr defaults to -O0, causing massive register spilling.
+    // See amd/comgr/src/comgr-compiler.cpp: extractSpirvFlags().
+    // Must be at addrspace(1) (CrossWorkgroup) so the SPIR-V backend emits it
+    // as a module-level global that survives the round-trip.
+    auto i8Type = IntegerType::get(moduleOp.getContext(), 8);
+    auto cmdlineGlobal = moduleOp.lookupSymbol<LLVM::GlobalOp>("llvm.cmdline");
+    if (cmdlineGlobal) {
+      auto valueAttr =
+          dyn_cast_if_present<StringAttr>(cmdlineGlobal.getValueAttr());
+      if (!valueAttr) {
+        cmdlineGlobal.emitOpError()
+            << "expected @llvm.cmdline to have a string initializer";
+        return signalPassFailure();
+      }
+      std::string flags = ensureO3InCmdline(valueAttr.getValue());
+      cmdlineGlobal.setValueAttr(StringAttr::get(moduleOp.getContext(), flags));
+      cmdlineGlobal.setGlobalType(
+          LLVM::LLVMArrayType::get(i8Type, flags.size()));
+      cmdlineGlobal.setSection(".llvmcmd");
+      cmdlineGlobal.setAlignment(1);
+      cmdlineGlobal.setAddrSpace(1);
+    } else {
+      StringRef flags("-O3\0", 4);
+      OpBuilder builder(moduleOp.getBody(), moduleOp.getBody()->end());
+      auto globalOp =
+          LLVM::GlobalOp::create(builder, moduleOp.getLoc(),
+                                 LLVM::LLVMArrayType::get(i8Type, flags.size()),
+                                 /*isConstant=*/true, LLVM::Linkage::Private,
+                                 "llvm.cmdline", builder.getStringAttr(flags));
+      globalOp.setSection(".llvmcmd");
+      globalOp.setAlignment(1);
+      globalOp.setAddrSpace(1);
+    }
+  }
+};
+
+} // namespace
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 9368ce2..35f4d49 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -64,7 +64,11 @@
             "reuse_shared_memory_allocs.mlir",
             "rocdl_global_transpose_load.mlir",
             "rocdl_load_to_transpose_load.mlir",
+            "rocdl_pipeline_spirv_test.mlir",
             "rocdl_pipeline_test.mlir",
+            "rocdl_prepare_for_spirv.mlir",
+            "rocm_spirv_serialize.mlir",
+            "rocm_spirv_serialize_target_attr.mlir",
             "sort_pipeline_test.mlir",
             "tensorcore_vectorization.mlir",
             "transform_dialect_bufferize.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 03383cd..0825a08 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -59,7 +59,11 @@
     "reuse_shared_memory_allocs.mlir"
     "rocdl_global_transpose_load.mlir"
     "rocdl_load_to_transpose_load.mlir"
+    "rocdl_pipeline_spirv_test.mlir"
     "rocdl_pipeline_test.mlir"
+    "rocdl_prepare_for_spirv.mlir"
+    "rocm_spirv_serialize.mlir"
+    "rocm_spirv_serialize_target_attr.mlir"
     "sort_pipeline_test.mlir"
     "tensorcore_vectorization.mlir"
     "transform_dialect_bufferize.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir
new file mode 100644
index 0000000..218829b
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_spirv_test.mlir
@@ -0,0 +1,86 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), builtin.module(iree-codegen-llvmgpu-rocdl-lowering-pipeline{use-spirv=true}))))" \
+// RUN:   %s | FileCheck %s
+
+// Verify that a simple element-wise op lowers through the ROCDL pipeline with
+// SPIR-V preparation (spir_kernel CC, address space remapping, correct triple).
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @simpleMath_ex_dispatch_0 {
+  hal.executable.variant @rocm target(<"rocm", "rocm-spirv-fb">) {
+    hal.executable.export public @add_dispatch_0 layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @add_dispatch_0() {
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16xf32>>
+        %3 = tensor.empty() : tensor<16xf32>
+        %4 = iree_tensor_ext.dispatch.tensor.load %0, offsets=[0], sizes=[16], strides=[1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+        %5 = iree_tensor_ext.dispatch.tensor.load %1, offsets=[0], sizes=[16], strides=[1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
+        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16xf32>, tensor<16xf32>) outs(%3 : tensor<16xf32>) {
+        ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+          %7 = arith.addf %arg0, %arg1 : f32
+          linalg.yield %7 : f32
+        } -> tensor<16xf32>
+        iree_tensor_ext.dispatch.tensor.store %6, %2, offsets=[0], sizes=[16], strides=[1] : tensor<16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: hal.executable public @simpleMath_ex_dispatch_0
+//       CHECK:   hal.executable.variant public @rocm
+//       CHECK:   module attributes
+//  CHECK-SAME:     llvm.target_triple = "spirv64-amd-amdhsa"
+//       CHECK:   llvm.func spir_kernelcc @add_dispatch_0
+//       CHECK:     llvm.fadd
+//   CHECK-NOT:     amdgpu_kernel
+//   CHECK-NOT:     addr_space = 7
+
+// -----
+
+// Verify matmul lowers through the SPIR-V pipeline without buffer instructions.
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable @matmul_dispatch_0 {
+  hal.executable.variant @rocm target(<"rocm", "rocm-spirv-fb">) {
+    hal.executable.export public @matmul_dispatch_0 layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @matmul_dispatch_0() {
+        %c0 = arith.constant 0 : index
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x1024xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+        %5 = tensor.empty() : tensor<2048x512xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
+        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: hal.executable public @matmul_dispatch_0
+//       CHECK:   module attributes
+//  CHECK-SAME:     llvm.target_triple = "spirv64-amd-amdhsa"
+//       CHECK:   llvm.func spir_kernelcc @matmul_dispatch_0
+//   CHECK-NOT:     addr_space = 7
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir
new file mode 100644
index 0000000..912da1a
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_prepare_for_spirv.mlir
@@ -0,0 +1,150 @@
+// RUN: iree-opt --split-input-file --iree-rocdl-prepare-for-spirv %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-rocdl-prepare-for-spirv --iree-rocdl-prepare-for-spirv %s | FileCheck %s --check-prefix=IDEMPOTENT
+
+// IDEMPOTENT: llvm.mlir.global private constant @llvm.cmdline
+
+// Test triple, data layout, calling conventions, attributes, and address spaces.
+
+// CHECK: module attributes
+// CHECK-SAME: llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0"
+// CHECK-SAME: llvm.target_triple = "spirv64-amd-amdhsa"
+
+// Shared memory (AS 3) stays AS 3.
+// CHECK: llvm.mlir.global external @__dynamic_shared_memory__()
+// CHECK-SAME: addr_space = 3
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa"} {
+
+  llvm.mlir.global external @__dynamic_shared_memory__() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
+
+  // CHECK-LABEL: llvm.func spir_kernelcc @simple_kernel
+  llvm.func amdgpu_kernelcc @simple_kernel(%arg0: !llvm.ptr<1> {llvm.align = 16 : i64}) attributes {
+    target_cpu = "gfx1201",
+    target_features = "+wavefrontsize64"
+  } {
+    // Alloca in AS 5 (private) should become AS 0 (Function).
+    // CHECK: llvm.alloca %{{.*}} x f32 {addr_space = 0 : i32} : (i64) -> !llvm.ptr
+    %c1 = llvm.mlir.constant(1 : i64) : i64
+    %0 = llvm.alloca %c1 x f32 {addr_space = 5 : i32} : (i64) -> !llvm.ptr<5>
+    // Flat/generic AS 0 -> SPIR-V Generic AS 4.
+    // CHECK: llvm.addrspacecast %{{.*}} : !llvm.ptr to !llvm.ptr<4>
+    %1 = llvm.addrspacecast %0 : !llvm.ptr<5> to !llvm.ptr
+    llvm.return
+  }
+
+  // CHECK-LABEL: llvm.func spir_funccc @helper_func
+  llvm.func @helper_func(%arg0: f32) -> f32 {
+    llvm.return %arg0 : f32
+  }
+
+  // CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_shared_mem
+  llvm.func amdgpu_kernelcc @kernel_with_shared_mem() {
+    // CHECK: llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
+    %0 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
+    llvm.return
+  }
+
+  // Intrinsic declarations should keep their calling convention unchanged.
+  // CHECK: llvm.func @llvm.amdgcn.workitem.id.x() -> i32
+  llvm.func @llvm.amdgcn.workitem.id.x() -> i32
+
+  // External device library declarations get spir_func.
+  // CHECK: llvm.func spir_funccc @__ocml_sin_f32(f32) -> f32
+  llvm.func @__ocml_sin_f32(f32) -> f32
+}
+
+// -----
+
+// Test that inreg attributes are removed from kernel arguments.
+
+module {
+  // CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_inreg
+  // CHECK-SAME:    (%{{.+}}: i32, %{{.+}}: !llvm.ptr<1>, %{{.+}}: i64)
+  // CHECK-NOT:     llvm.inreg
+  llvm.func amdgpu_kernelcc @kernel_with_inreg(
+      %arg0: i32 {llvm.inreg},
+      %arg1: !llvm.ptr<1> {llvm.inreg},
+      %arg2: i64) {
+    llvm.return
+  }
+}
+
+// -----
+
+// Test that rocdl.kernel attribute triggers spir_kernel CC.
+// After ROCDLAnnotateKernelForTranslation, kernels have rocdl.kernel
+// instead of amdgpu_kernel CC.
+
+module {
+  // CHECK-LABEL: llvm.func spir_kernelcc @annotated_kernel
+  // CHECK-NOT:     rocdl.kernel
+  // CHECK-NOT:     rocdl.flat_work_group_size
+  // CHECK-NOT:     rocdl.reqd_work_group_size
+  // CHECK-NOT:     rocdl.max_flat_work_group_size
+  llvm.func @annotated_kernel(%arg0: !llvm.ptr<1>)
+      attributes {
+        rocdl.kernel,
+        rocdl.flat_work_group_size = "1,256",
+        rocdl.reqd_work_group_size = dense<[64, 1, 1]> : vector<3xi32>,
+        rocdl.max_flat_work_group_size = 256 : i32
+      } {
+    llvm.return
+  }
+}
+
+// -----
+
+// Test constant address space remapping (AS 4 -> AS 2).
+
+module {
+  // CHECK: llvm.mlir.global external @const_data() {addr_space = 2 : i32}
+  llvm.mlir.global external @const_data() {addr_space = 4 : i32} : !llvm.array<16 x f32>
+}
+
+// -----
+
+// Test that @llvm.cmdline is created for comgr JIT flags.
+
+module {
+  llvm.func @kernel() {
+    llvm.return
+  }
+  // CHECK: llvm.mlir.global private constant @llvm.cmdline("-O3\00")
+  // CHECK-SAME: addr_space = 1
+  // CHECK-SAME: section = ".llvmcmd"
+}
+
+// -----
+
+// Test that an existing @llvm.cmdline has its optimization flag replaced.
+
+module {
+  llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O2\00foo\00")
+      {addr_space = 1 : i32, alignment = 1 : i64, section = ".llvmcmd"} : !llvm.array<13 x i8>
+
+  llvm.func @kernel_with_existing_cmdline() {
+    llvm.return
+  }
+
+  // CHECK: llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O3\00foo\00")
+  // CHECK-SAME: addr_space = 1
+  // CHECK-SAME: section = ".llvmcmd"
+}
+
+// -----
+
+// Test that an existing @llvm.cmdline without an optimization flag has -O3
+// appended.
+
+module {
+  llvm.mlir.global private constant @llvm.cmdline("-cc1\00")
+      {addr_space = 1 : i32, alignment = 1 : i64, section = ".llvmcmd"} : !llvm.array<5 x i8>
+
+  llvm.func @kernel_with_existing_cmdline_without_opt() {
+    llvm.return
+  }
+
+  // CHECK: llvm.mlir.global private constant @llvm.cmdline("-cc1\00-O3\00")
+  // CHECK-SAME: addr_space = 1
+  // CHECK-SAME: section = ".llvmcmd"
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir
new file mode 100644
index 0000000..5b33a6b
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize.mlir
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: iree-opt --iree-hal-transformation-pipeline --iree-hal-target-device=hip \
+// RUN:   --iree-rocm-target=gfx942 --iree-rocm-use-spirv \
+// RUN:   --iree-hal-dump-executable-binaries-to=%t %s -o /dev/null
+// RUN: ls %t | FileCheck %s --check-prefix=FILES --implicit-check-not=.hsaco
+
+// Verify that ROCm serialization uses the --iree-rocm-use-spirv flag to produce
+// a SPIR-V binary through the normal HAL executable source path.
+
+// FILES: module_serialize_spirv_flag_test_rocm_spirv_fb.spv
+
+#pipeline_layout = #hal.pipeline.layout<bindings = []>
+
+hal.executable.source public @serialize_spirv_flag_test {
+  hal.executable.export public @empty ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+    %c1 = arith.constant 1 : index
+    hal.return %c1, %c1, %c1 : index, index, index
+  } attributes {subgroup_size = 64 : index, workgroup_size = [1 : index, 1 : index, 1 : index]}
+  builtin.module attributes {
+    llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0",
+    llvm.target_triple = "spirv64-amd-amdhsa"
+  } {
+    llvm.func spir_kernelcc @empty() attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
+      llvm.return
+    }
+  }
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir
new file mode 100644
index 0000000..3e8022e
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocm_spirv_serialize_target_attr.mlir
@@ -0,0 +1,39 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: iree-opt --iree-hal-transformation-pipeline --iree-rocm-target=gfx942 \
+// RUN:   --iree-hal-dump-executable-binaries-to=%t %s -o /dev/null
+// RUN: ls %t | FileCheck %s --check-prefix=FILES --implicit-check-not=.hsaco
+
+// Verify that ROCm serialization still honors a preconfigured SPIR-V HAL
+// executable variant without requiring --iree-rocm-use-spirv.
+
+// FILES: module_serialize_spirv_attr_test_rocm_spirv_fb.spv
+
+#pipeline_layout = #hal.pipeline.layout<bindings = []>
+#target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = none,
+  subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024],
+  max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536,
+  max_workgroup_counts = [2147483647, 2147483647, 2147483647]
+>>
+
+hal.executable public @serialize_spirv_attr_test {
+  hal.executable.variant public @rocm_spirv_fb target(<"rocm", "rocm-spirv-fb", {
+    iree_codegen.target_info = #target,
+    ukernels = "none"
+  }>) {
+    hal.executable.export public @empty ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %c1 = arith.constant 1 : index
+      hal.return %c1, %c1, %c1 : index, index, index
+    } attributes {subgroup_size = 64 : index, workgroup_size = [1 : index, 1 : index, 1 : index]}
+    builtin.module attributes {
+      llvm.data_layout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0",
+      llvm.target_triple = "spirv64-amd-amdhsa"
+    } {
+      llvm.func spir_kernelcc @empty() attributes {gpu.known_block_size = array<i32: 1, 1, 1>} {
+        llvm.return
+      }
+    }
+  }
+}
diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c
index 4c38db7..f6fb6aa 100644
--- a/runtime/src/iree/hal/drivers/hip/hip_device.c
+++ b/runtime/src/iree/hal/drivers/hip/hip_device.c
@@ -766,7 +766,10 @@
   }
 
   if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) {
-    *out_value = iree_string_view_equal(key, IREE_SV("rocm-hsaco-fb")) ? 1 : 0;
+    *out_value = (iree_string_view_equal(key, IREE_SV("rocm-hsaco-fb")) ||
+                  iree_string_view_equal(key, IREE_SV("rocm-spirv-fb")))
+                     ? 1
+                     : 0;
     return iree_ok_status();
   }