Improving linking support for ROCM and ukernels. (#19211)
To support externally-defined ukernels on ROCM the ROCMTarget has been
brought in-line with LLVMCPU/CUDA by calling `linkBitcodeObjects`. To
To make authoring passes that include object references easier,
`#hal.executable.object` now allows any data type to be associated so
long as it is serializable, allowing for external resource attrs and
other custom attributes that may serialize based on other information.
To allow patterns to attach object references, all ops within an
executable variant can now declare a `hal.executable.objects` array that
will be hoisted and merged into the top-level variant objects after our
executable linking pass (before serialization, where they are used).
diff --git a/.github/workflows/oneshot_candidate_release.yml b/.github/workflows/oneshot_candidate_release.yml
index 05f9288..66bd42d 100644
--- a/.github/workflows/oneshot_candidate_release.yml
+++ b/.github/workflows/oneshot_candidate_release.yml
@@ -48,16 +48,15 @@
- name: Create Release
id: create_release
- uses: actions/create-release@0cb9c9b65d5d1901c1f53e5e66eaf4afd303e70e # v1
- env:
- GITHUB_TOKEN: ${{ secrets.WRITE_ACCESS_TOKEN }}
+ uses: ncipollo/release-action@2c591bcc8ecdcd2db72b97d6147f871fcd833ba5 # v1.14.0
with:
- tag_name: ${{ env.tag_name }}
- release_name: iree candidate ${{ env.tag_name }}
+ tag: ${{ env.tag_name }}
+ name: iree candidate ${{ env.tag_name }}
body: |
Automatic candidate release of iree.
draft: true
prerelease: true
+ token: ${{ secrets.WRITE_ACCESS_TOKEN }}
- name: "Invoke workflow :: Build Release Packages"
uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4
diff --git a/.github/workflows/schedule_candidate_release.yml b/.github/workflows/schedule_candidate_release.yml
index a89fe23..c467a62 100644
--- a/.github/workflows/schedule_candidate_release.yml
+++ b/.github/workflows/schedule_candidate_release.yml
@@ -65,17 +65,16 @@
- name: Create Release
id: create_release
- uses: actions/create-release@0cb9c9b65d5d1901c1f53e5e66eaf4afd303e70e # v1
- env:
- GITHUB_TOKEN: ${{ secrets.WRITE_ACCESS_TOKEN }}
+ uses: ncipollo/release-action@2c591bcc8ecdcd2db72b97d6147f871fcd833ba5 # v1.14.0
with:
- tag_name: ${{ env.tag_name }}
- release_name: iree candidate ${{ env.tag_name }}
- commitish: ${{ steps.last_green_commit.outputs.release-commit }}
+ tag: ${{ env.tag_name }}
+ name: iree candidate ${{ env.tag_name }}
+ commit: ${{ steps.last_green_commit.outputs.release-commit }}
body: |
Automatic candidate release of iree.
draft: true
prerelease: true
+ token: ${{ secrets.WRITE_ACCESS_TOKEN }}
- name: "Invoke workflow :: Build Release Packages"
uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
index 693fe38..417b08b 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -125,7 +125,7 @@
# TODO: Decide what to build by default. No real constaints here
# except compile-time cost, so just picked out the popular ones.
-set(_ukernel_supported_chips "gfx90a" "gfx940" "gfx1030" "gfx1100")
+set(_ukernel_supported_chips "gfx90a" "gfx942" "gfx1030" "gfx1100")
foreach(_amd_chip ${_ukernel_supported_chips})
iree_rocm_bitcode_library(
NAME
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
index 0809252..01805d2 100644
--- a/compiler/plugins/target/ROCM/test/target_device_features.mlir
+++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -7,7 +7,7 @@
// RUN: --iree-hip-target=gfx941 --iree-hip-target-features=+sramecc,-xnack %s | FileCheck %s --check-prefix=GFX941
//
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
-// RUN: --iree-hip-target=gfx940 %s | FileCheck %s --check-prefix=GFX940
+// RUN: --iree-hip-target=gfx942 %s | FileCheck %s --check-prefix=GFX940
//
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=rx7900xtx %s | FileCheck %s --check-prefix=GFX1100
@@ -25,7 +25,7 @@
// GFX941: target = #iree_gpu.target<arch = "gfx941",
// GFX941-SAME: features = "+sramecc,-xnack"
-// GFX940: target = #iree_gpu.target<arch = "gfx940",
+// GFX940: target = #iree_gpu.target<arch = "gfx942",
// GFX940-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
// GFX1100: target = #iree_gpu.target<arch = "gfx1100",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
index e0aac48..a0f5803 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s --check-prefix=CDNA3
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s --check-prefix=CDNA3
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 78e854e..46f0517 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -4,20 +4,11 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#include <iterator>
-
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Common/UserConfig.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/iterator_range.h"
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#define DEBUG_TYPE "iree-codegen-materialize-user-configs"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
@@ -40,7 +31,8 @@
namespace {
-static const char kTranslationInfoAttrName[] = "translation_info";
+constexpr StringLiteral kTranslationInfoAttrName = "translation_info";
+constexpr StringLiteral kDefaultTransformSequenceName = "__kernel_config";
enum StrategyRunResult {
Success = 0,
@@ -87,10 +79,11 @@
// or any other IREE codegen pipeline.
//
// 2. Use the configuration strategy to do codegen directly. At the end
- // of
- // the strategy, the variant needs to be annotated with
- // "translation_info" = #iree_codegen.translation_info<pipeline =
- // None>
+ // of the strategy, the variant needs to be annotated with:
+ // ```mlir
+ // "translation_info" =
+ // #iree_codegen.translation_info<pipeline = None>
+ // ```
SmallVector<StringRef, 2> parts;
llvm::SplitString(
llvm::StringRef(clCodegenTransformDialectLibraryFileName), parts,
@@ -112,7 +105,7 @@
libraryFileName = parts[0];
}
- std::string entrySequenceName;
+ StringRef entrySequenceName = kDefaultTransformSequenceName;
// Check if the user specified a custom entry point name.
if (parts.size() == 2) {
if (parts[1].empty()) {
@@ -120,8 +113,6 @@
return signalPassFailure();
}
entrySequenceName = parts[1];
- } else {
- entrySequenceName = "__kernel_config";
}
LDBG("MaterializeUserConfigsPass on function: " << funcOp);
@@ -145,7 +136,8 @@
funcOp.emitError() << "transform kernel config strategy `"
<< entrySequenceName << " not found";
return signalPassFailure();
- } else if (runResult == StrategyRunResult::Failed) {
+ }
+ if (runResult == StrategyRunResult::Failed) {
funcOp.emitError() << "transform kernel config strategy `"
<< entrySequenceName << "` failed to apply";
return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index bd3c1f4..77b99a9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,10 +2922,14 @@
// loads and stores will have a performance impact.
auto resultTypes = rootOperation->getResultTypes();
if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
- auto elementTypeSize =
- cast<ShapedType>(rootOperation->getResultTypes().front())
- .getElementType()
- .getIntOrFloatBitWidth();
+ Type elementType = cast<ShapedType>(resultTypes[0]).getElementType();
+ unsigned int elementTypeSize;
+ if (auto complexType = llvm::dyn_cast<ComplexType>(elementType)) {
+ elementTypeSize =
+ 2 * complexType.getElementType().getIntOrFloatBitWidth();
+ } else {
+ elementTypeSize = elementType.getIntOrFloatBitWidth();
+ }
// for now just enable for i1
if (elementTypeSize == 1) {
auto innermostTileSize = commonVecTileSizes.back();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 9161c81..22a2880 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1983,3 +1983,52 @@
// CHECK: func @i1_type()
// CHECK: linalg.generic {
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>,
+ #hal.pipeline.binding<storage_buffer>
+]>
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @complex_view_as_real() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>>
+ %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>> -> tensor<1x1x32x50x2xf32>
+ %6 = tensor.empty() : tensor<32x50x2xf32>
+ %extracted = tensor.extract %4[%c0] : tensor<1xi32>
+ %7 = arith.extsi %extracted : i32 to i64
+ %8 = arith.index_cast %7 : i64 to index
+ %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>> -> tensor<50xcomplex<f32>>
+ %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex<f32>>) outs(%6 : tensor<32x50x2xf32>) {
+ ^bb0(%in: complex<f32>, %out: f32):
+ %11 = linalg.index 0 : index
+ %12 = linalg.index 1 : index
+ %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32>
+ %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32>
+ %13 = complex.create %extracted_0, %extracted_1 : complex<f32>
+ %14 = complex.mul %13, %in : complex<f32>
+ %15 = complex.re %14 : complex<f32>
+ %16 = complex.im %14 : complex<f32>
+ %17 = linalg.index 2 : index
+ %18 = arith.cmpi eq, %17, %c0 : index
+ %19 = arith.select %18, %15, %16 : f32
+ linalg.yield %19 : f32
+ } -> tensor<32x50x2xf32>
+ flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+ return
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 25, 2], [1, 1, 2], [0, 0, 0], [0, 0, 0]{{\]}}>
+// CHECK: func.func @complex_view_as_real()
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLAnnotateKernelForTranslation.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLAnnotateKernelForTranslation.cpp
index 0a6eca5..f736ed7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLAnnotateKernelForTranslation.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLAnnotateKernelForTranslation.cpp
@@ -74,7 +74,7 @@
rocdlDialect->getWavesPerEuAttrHelper().setAttr(funcOp, *attr);
}
- // Kernel argument preloading is only supported on gfx940 and newer targets
+ // Kernel argument preloading is only supported on gfx942 and newer targets
// from the CDNA family. This is enabled using the `inreg` function argument
// attribute.
FailureOr<amdgpu::Chipset> chipset = getChipsetVersion(targetAttr);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp
index 355187f..2e65f81 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp
@@ -23,15 +23,16 @@
: impl::TestLLVMGPUQueryMMAPassBase<TestLLVMGPUQueryMMAPass> {
void runOnOperation() override {
ModuleOp moduleOp = getOperation();
- llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
- SmallVector<IREE::GPU::MMAIntrinsic>>
- mmaMap = queryMMAIntrinsics(moduleOp);
- for (const auto &[op, mmaAttrs] : mmaMap) {
+ SmallVector<IREE::HAL::ExecutableVariantOp> executableVariantOps =
+ getExecutableVariantOps(moduleOp);
+ for (IREE::HAL::ExecutableVariantOp op : executableVariantOps) {
llvm::outs() << "Executable Variant Name: "
<< cast<IREE::HAL::ExecutableVariantOp>(*op).getName()
<< "\n";
+ SmallVector<IREE::GPU::MMAIntrinsic> mmaIntrinsics =
+ queryMMAIntrinsics(op);
llvm::outs() << "MMA Intrinsics: ";
- llvm::interleave(mmaAttrs, llvm::outs(), " ");
+ llvm::interleave(mmaIntrinsics, llvm::outs(), " ");
llvm::outs() << "\n";
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
index ccff3c1..93f4480 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
@@ -22,12 +22,12 @@
"config_igemm_tile_and_fuse.mlir",
"config_tile_and_fuse.mlir",
"config_vector_distribute_gfx1100.mlir",
- "config_vector_distribute_gfx940.mlir",
+ "config_vector_distribute_gfx942.mlir",
"config_user_vector_distribute.mlir",
"lowering_scalar_dispatch.mlir",
"pipeline_igemm_tile_and_fuse.mlir",
"pipeline_tile_and_fuse.mlir",
- "pipeline_vector_distribute_gfx940.mlir",
+ "pipeline_vector_distribute_gfx942.mlir",
"pipeline_vector_distribute_gfx1100.mlir",
"pipeline_warp_reduction.mlir",
],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
index 570277c..d4b79e2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
@@ -19,12 +19,12 @@
"config_tile_and_fuse.mlir"
"config_user_vector_distribute.mlir"
"config_vector_distribute_gfx1100.mlir"
- "config_vector_distribute_gfx940.mlir"
+ "config_vector_distribute_gfx942.mlir"
"lowering_scalar_dispatch.mlir"
"pipeline_igemm_tile_and_fuse.mlir"
"pipeline_tile_and_fuse.mlir"
"pipeline_vector_distribute_gfx1100.mlir"
- "pipeline_vector_distribute_gfx940.mlir"
+ "pipeline_vector_distribute_gfx942.mlir"
"pipeline_warp_reduction.mlir"
TOOLS
FileCheck
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/annotate_kernel_for_translation.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/annotate_kernel_for_translation.mlir
index 825b28c..b5d548f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/annotate_kernel_for_translation.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/annotate_kernel_for_translation.mlir
@@ -45,10 +45,10 @@
// -----
-// Check that we annotate kernel arguments on gfx940-series.
+// Check that we annotate kernel arguments on gfx942-series.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
- {iree.gpu.target = #iree_gpu.target<arch = "gfx940", features = "",
+ {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
wgp = <compute = int32, storage = b32,
subgroup = none, dot = none, mma = [],
subgroup_size_choices = [64],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
index ca22f60..4c9f79e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-use-igemm=true --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
func.func @nhwc_conv_mfma() {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 2c6926a..c976a33 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx942.mlir
similarity index 99%
rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir
rename to compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx942.mlir
index 54d2627..d71e7ed 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx942.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --iree-codegen-llvmgpu-use-vector-distribution \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
@@ -83,7 +83,7 @@
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
-#target = #iree_gpu.target<arch = "gfx940", features = "", wgp = <
+#target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8,
subgroup = shuffle|arithmetic, dot = dp4xi8toi32,
mma = [],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
index 0763eb4..15d4dc4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index ab9606b..772e146 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
similarity index 99%
rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir
rename to compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
index 5e5096d..389339c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx940.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -1,9 +1,9 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN: %s | FileCheck %s
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN: %s | FileCheck %s --check-prefix=MEMORY
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
index 4128e4f..aef1055 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-rocdl-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline2)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-rocdl-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline2)))" %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
index 77cbbde..62ccec7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
%arg2 : tensor<128xf32>) -> tensor<384x128xf32> {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
index b685819..5262c34 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s --check-prefix=CDNA3
#pipeline_layout = #hal.pipeline.layout<constants = 5, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
index 2c171b2..29371ba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
@@ -1,7 +1,7 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" --iree-gpu-test-target=gfx942 \
// RUN: --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-rocdl-configuration-pipeline)" --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-rocdl-configuration-pipeline)" --iree-gpu-test-target=gfx942 \
// RUN: --split-input-file %s | FileCheck %s
// Make sure that the GPU configuration pipelines generalize named ops, e.g., linalg.matmul_transpose_b to linalg.generic.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
index 82257f1..5ae0be5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN: %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir
index fafd96e..85ee90a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_softmax_rocm.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index edddd19..33a5c39 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=CDNA1
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=CDNA3
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=CDNA3
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=RDNA3
// Verify that a simple element wise op gets lowered successfully all the way to
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir
index 070355f..4863d38 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir
@@ -71,10 +71,10 @@
}
}
-// CHECK-DAG: main_0
-// CHECK-DAG: MMA Intrinsics: MFMA_F32_16x16x4_F32 MFMA_F32_16x16x16_F16
-// CHECK-DAG: main_1
-// CHECK-DAG: MMA Intrinsics: MFMA_F32_32x32x8_F16 MFMA_F32_16x16x16_BF16
+// CHECK: main_0
+// CHECK-NEXT: MMA Intrinsics: MFMA_F32_16x16x4_F32 MFMA_F32_16x16x16_F16
+// CHECK-NEXT: main_1
+// CHECK-NEXT: MMA Intrinsics: MFMA_F32_32x32x8_F16 MFMA_F32_16x16x16_BF16
// -----
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index 1ad0bd3..f1eb776 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -944,7 +944,7 @@
// No real technical reason to only allow these aside from compile
// time and diskspace.
bool hasUkernelSupportedRocmArch(StringRef targetChip) {
- const char *kSupportedTargetChip[] = {"gfx90a", "gfx940", "gfx1030",
+ const char *kSupportedTargetChip[] = {"gfx90a", "gfx942", "gfx1030",
"gfx1100"};
size_t arraySize =
sizeof(kSupportedTargetChip) / sizeof(kSupportedTargetChip[0]);
@@ -1028,22 +1028,24 @@
return std::nullopt;
}
-llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
- SmallVector<IREE::GPU::MMAIntrinsic>>
-queryMMAIntrinsics(mlir::ModuleOp moduleOp) {
- llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
- SmallVector<IREE::GPU::MMAIntrinsic>>
- mmaAttributesMap;
+SmallVector<IREE::HAL::ExecutableVariantOp>
+getExecutableVariantOps(mlir::ModuleOp moduleOp) {
+ llvm::SmallVector<IREE::HAL::ExecutableVariantOp> executableVariantOps;
moduleOp.walk([&](IREE::HAL::ExecutableVariantOp executableOp) {
- if (IREE::GPU::TargetAttr target = getGPUTargetAttr(executableOp)) {
- auto mmaIntrinsics = llvm::map_to_vector(
- target.getWgp().getMma(), [](IREE::GPU::MMAAttr attr) {
- return attr.getIntrinsic().getValue();
- });
- mmaAttributesMap[executableOp] = std::move(mmaIntrinsics);
- }
+ executableVariantOps.push_back(executableOp);
});
- return mmaAttributesMap;
+ return executableVariantOps;
+}
+
+SmallVector<IREE::GPU::MMAIntrinsic>
+queryMMAIntrinsics(IREE::HAL::ExecutableVariantOp executableOp) {
+ llvm::SmallVector<IREE::GPU::MMAIntrinsic> mmaIntrinsics;
+ if (IREE::GPU::TargetAttr target = getGPUTargetAttr(executableOp)) {
+ mmaIntrinsics = llvm::map_to_vector(
+ target.getWgp().getMma(),
+ [](IREE::GPU::MMAAttr attr) { return attr.getIntrinsic().getValue(); });
+ }
+ return mmaIntrinsics;
}
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index ead0fc4..133d724 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -207,13 +207,16 @@
/// Returns std::nullopt if none found.
std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func);
-/// Returns a map of supported MMA intrinsic instructions based on the
-/// GPU target descriptions in `moduleOp`. Each entry in the map associates
-/// an `IREE::HAL::ExecutableVariantOp` with a vector of
-/// `IREE::GPU::MMAIntrinsic` attributes.
-llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
- SmallVector<IREE::GPU::MMAIntrinsic>>
-queryMMAIntrinsics(mlir::ModuleOp moduleOp);
+/// Returns all `IREE::HAL::ExecutableVariantOp` operations from the
+/// given `mlir::ModuleOp`, ensuring they are returned in their original IR
+/// order.
+SmallVector<IREE::HAL::ExecutableVariantOp>
+getExecutableVariantOps(mlir::ModuleOp moduleOp);
+
+// Returns the MMA intrinsics associated with the given
+// `IREE::HAL::ExecutableVariantOp`.
+SmallVector<IREE::GPU::MMAIntrinsic>
+queryMMAIntrinsics(IREE::HAL::ExecutableVariantOp executableOp);
} // namespace mlir::iree_compiler
diff --git a/experimental/regression_suite/pyproject.toml b/experimental/regression_suite/pyproject.toml
index 7f590ee..ca0f106 100644
--- a/experimental/regression_suite/pyproject.toml
+++ b/experimental/regression_suite/pyproject.toml
@@ -8,7 +8,7 @@
"plat_rdna3_vulkan: mark tests as running on AMD RDNA3 Vulkan device",
"plat_nvidia_a100: mark tests as running on NVIDIA A100 device",
"plat_gfx90a_rocm: mark tests as running on AMD GFX90A ROCm device",
- "plat_gfx940_rocm: mark tests as running on AMD GFX940 ROCm device",
+ "plat_gfx942_rocm: mark tests as running on AMD GFX942 ROCm device",
"plat_rdna3_rocm: mark tests as running on AMD RDNA3 ROCm device",
"presubmit: mark test as running on presubmit",
"postsubmit: mark test as running on postsubmit",
diff --git a/experimental/regression_suite/tests/pregenerated/test_ukernel.py b/experimental/regression_suite/tests/pregenerated/test_ukernel.py
index 61cfd88..fdd1a1c 100644
--- a/experimental/regression_suite/tests/pregenerated/test_ukernel.py
+++ b/experimental/regression_suite/tests/pregenerated/test_ukernel.py
@@ -6,6 +6,7 @@
import pytest
from ireers_tools import *
+from pathlib import Path
###############################################################################
# Fixtures
@@ -25,7 +26,7 @@
def argmax_ukernel_host_cpu_vmfb(argmax_ukernel_source):
return iree_compile(
argmax_ukernel_source,
- "host_cpu",
+ vmfb_path=Path("host_cpu"),
flags=COMMON_FLAGS
+ [
"--iree-hal-target-backends=llvm-cpu",
@@ -38,7 +39,7 @@
def argmax_ukernel_gfx90a_rocm_vmfb(argmax_ukernel_source):
return iree_compile(
argmax_ukernel_source,
- "gfx90a_rocm",
+ vmfb_path=Path("gfx90a_rocm"),
flags=COMMON_FLAGS
+ [
"--iree-hal-target-backends=rocm",
@@ -49,14 +50,14 @@
@pytest.fixture
-def argmax_ukernel_gfx940_rocm_vmfb(argmax_ukernel_source):
+def argmax_ukernel_gfx942_rocm_vmfb(argmax_ukernel_source):
return iree_compile(
argmax_ukernel_source,
- "gfx940_rocm",
+ vmfb_path=Path("gfx942_rocm"),
flags=COMMON_FLAGS
+ [
"--iree-hal-target-backends=rocm",
- "--iree-hip-target=gfx940",
+ "--iree-hip-target=gfx942",
"--iree-hip-enable-ukernels=argmax",
],
)
@@ -195,16 +196,16 @@
@pytest.mark.presubmit
@pytest.mark.unstable_linalg
-@pytest.mark.plat_gfx940_rocm
-def test_correctness_gfx940_rocm(
- argmax_ukernel_gfx940_rocm_vmfb,
+@pytest.mark.plat_gfx942_rocm
+def test_correctness_gfx942_rocm(
+ argmax_ukernel_gfx942_rocm_vmfb,
argmax_input_f16,
argmax_output_f16,
argmax_input_f32,
argmax_output_f32,
):
iree_run_module(
- argmax_ukernel_gfx940_rocm_vmfb,
+ argmax_ukernel_gfx942_rocm_vmfb,
device="hip",
function="argmax_3d_dyn_f16i32",
args=[
@@ -213,7 +214,7 @@
],
)
iree_run_module(
- argmax_ukernel_gfx940_rocm_vmfb,
+ argmax_ukernel_gfx942_rocm_vmfb,
device="hip",
function="argmax_3d_dyn_f16i64",
args=[
@@ -223,7 +224,7 @@
)
iree_run_module(
- argmax_ukernel_gfx940_rocm_vmfb,
+ argmax_ukernel_gfx942_rocm_vmfb,
device="hip",
function="argmax_3d_dyn_f32i32",
args=[
@@ -232,7 +233,7 @@
],
)
iree_run_module(
- argmax_ukernel_gfx940_rocm_vmfb,
+ argmax_ukernel_gfx942_rocm_vmfb,
device="hip",
function="argmax_3d_dyn_f32i64",
args=[
diff --git a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
index a7c5cc7..0526c37 100644
--- a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
+++ b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
@@ -590,8 +590,8 @@
device_.device(), IREE_HAL_QUEUE_AFFINITY_ANY,
/*wait_semaphore_list=*/iree_hal_fence_semaphore_list(ready_fence_.get()),
/*signal_semaphore_list=*/
- iree_hal_fence_semaphore_list(dst_buffer_ready_fence.get()), transfer_cb,
- iree_hal_buffer_binding_table_empty()));
+ iree_hal_fence_semaphore_list(dst_buffer_ready_fence.get()),
+ transfer_cb.get()));
*out_done_event = copy_done_event;
return iree_ok_status();
@@ -836,9 +836,8 @@
/*binding_capacity=*/0, &transfer_cb));
IREE_CHECK_OK(iree_hal_command_buffer_begin(transfer_cb.get()));
IREE_RETURN_IF_ERROR(iree_hal_command_buffer_fill_buffer(
- transfer_cb.get(), buffer.get(), /*target_offset=*/0,
- /*target_size=*/byte_length, data, element_type_byte_size,
- IREE_HAL_FILL_FLAG_NONE));
+ transfer_cb.get(), iree_hal_make_buffer_ref(buffer.get(), 0, byte_length),
+ data, element_type_byte_size, IREE_HAL_FILL_FLAG_NONE));
IREE_CHECK_OK(iree_hal_command_buffer_end(transfer_cb.get()));
// Execute the enqueued splat:
@@ -847,8 +846,7 @@
/*wait_semaphore_list=*/
{1, &transfer_timeline_, &signal_alloca_complete},
/*signal_semaphore_list=*/
- {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb,
- iree_hal_buffer_binding_table_empty()));
+ {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb.get()));
// Wrap in a buffer view and return:
iree::vm::ref<iree_hal_buffer_view_t> result_buffer_view;
@@ -1191,8 +1189,7 @@
/*wait_semaphore_list=*/
{1, &transfer_timeline_, &signal_alloca_complete},
/*signal_semaphore_list=*/
- {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb,
- iree_hal_buffer_binding_table_empty()));
+ {1, &transfer_timeline_, &signal_copy_complete}, transfer_cb.get()));
// Wrap in a buffer view and return.
iree::vm::ref<iree_hal_buffer_view_t> result_buffer_view;
diff --git a/integrations/pjrt/src/iree_pjrt/cuda/client.cc b/integrations/pjrt/src/iree_pjrt/cuda/client.cc
index f1546a0..510707b 100644
--- a/integrations/pjrt/src/iree_pjrt/cuda/client.cc
+++ b/integrations/pjrt/src/iree_pjrt/cuda/client.cc
@@ -31,7 +31,6 @@
iree_hal_cuda_device_params_t default_params;
iree_hal_cuda_device_params_initialize(&default_params);
default_params.command_buffer_mode = IREE_HAL_CUDA_COMMAND_BUFFER_MODE_STREAM;
- default_params.allow_inline_execution = false;
// Driver params.
iree_hal_cuda_driver_options_t driver_options;
@@ -39,7 +38,7 @@
driver_options.default_device_index = 0;
IREE_RETURN_IF_ERROR(
- iree_hal_cuda_driver_create(driver_name, &default_params, &driver_options,
+ iree_hal_cuda_driver_create(driver_name, &driver_options, &default_params,
host_allocator_, out_driver));
logger().debug("CUDA driver created");
return iree_ok_status();