Add gfx950 ukernel patterns (#21856)

This PR extends our existing GFX942 MLIR ukernel patterns to GFX950 for
matmul operations with F16, BF16, and F8 data types (f8E4M3FNUZ on GFX942,
f8E4M3FN on GFX950).
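Builtin pattern files named ukernel_patterns_<target>*.mlir are now discovered
by filtering the ROCM dialect's builtin names against the requested target, so
adding a new architecture only requires adding its pattern file. As a usage
sketch (adapted from the updated lit test, with lit's %s placeholder replaced
by the test file path), the GFX950 patterns can be exercised with:

  iree-opt \
    --pass-pipeline='builtin.module(func.func(iree-rocm-apply-builtin-pdl-patterns{targets=gfx950 enable-tensor-ukernels=true}))' \
    --mlir-print-local-scope --split-input-file \
    compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
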
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
index 0c4e3f2..2c0a8f6 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
@@ -58,4 +58,8 @@
   return builtins.getFile(name);
 }
 
+SmallVector<StringRef> ROCMDialect::getBuiltinNames() {
+  return llvm::to_vector(builtins.getMap().keys());
+}
+
 } // namespace mlir::iree_compiler::IREE::ROCM
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
index 720821f..01243e1 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
@@ -31,6 +31,8 @@
     bool hasBuiltin(StringRef name);
     /// Helper for getting a builtin.
     std::optional<StringRef> getBuiltin(StringRef name);
+    /// Helper for getting the builtin names.
+    SmallVector<StringRef> getBuiltinNames();
 
     /// Returns the loaded builtin module for the `path`. If the module has
     /// already been loaded in the past, returns the memoized module without
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
index 2ff03ee..fbe6581 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
@@ -19,6 +19,7 @@
 #include "iree/compiler/Utils/ShapeUtils.h"
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Regex.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -203,6 +204,14 @@
 
 namespace {
 
+SmallVector<StringRef> filterUkernelPatternsByTarget(ArrayRef<StringRef> names,
+                                                     StringRef target) {
+  std::string pattern =
+      "ukernel_patterns(_.*)*" + target.str() + "(_.*)*\\.mlir";
+  llvm::Regex regex(pattern);
+  return llvm::filter_to_vector(
+      names, [&regex](StringRef name) { return regex.match(name); });
+}
 class ApplyBuiltinPDLPatternsPass
     : public iree_compiler::IREE::ROCM::impl::ApplyBuiltinPDLPatternsPassBase<
           ApplyBuiltinPDLPatternsPass> {
@@ -232,17 +241,21 @@
     }
     if (enableTensorUKernels) {
       for (std::string target : targets) {
-        std::string builtinName =
-            llvm::formatv("ukernel_patterns_{}.mlir", target);
-        std::optional<StringRef> maybeBuiltin =
-            rocmDialect->getBuiltin(builtinName);
-        if (!maybeBuiltin) {
-          // Skip when no patterns are present.
-          continue;
-        }
-        if (failed(populatePDLModuleFromBuiltin(context, tmpPatterns,
-                                                maybeBuiltin.value()))) {
-          return failure();
+        SmallVector<StringRef> allBuiltinNames = rocmDialect->getBuiltinNames();
+        SmallVector<StringRef> builtinNames =
+            filterUkernelPatternsByTarget(allBuiltinNames, target);
+        std::string builtinSrc;
+        for (StringRef builtinName : builtinNames) {
+          std::optional<StringRef> maybeBuiltin =
+              rocmDialect->getBuiltin(builtinName);
+          if (!maybeBuiltin) {
+            // Skip when no patterns are present.
+            continue;
+          }
+          if (failed(populatePDLModuleFromBuiltin(context, tmpPatterns,
+                                                  maybeBuiltin.value()))) {
+            return failure();
+          }
         }
       }
     }
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
index 74a2b84..8bfe040 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
@@ -12,16 +12,14 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
+ukernel_pdl_patterns_files = [
+    "apply_builtin_ukernel_pdl_patterns.mlir",
+    "apply_builtin_ukernel_pdl_patterns_driver.mlir",
+]
+
 iree_lit_test_suite(
     name = "lit",
-    srcs = enforce_glob(
-        [
-            "apply_builtin_pdl_patterns.mlir",
-            "apply_builtin_ukernel_pdl_patterns.mlir",
-            "apply_builtin_ukernel_pdl_patterns_driver.mlir",
-        ],
-        include = ["*.mlir"],
-    ),
+    srcs = ukernel_pdl_patterns_files,
     cfg = "//compiler:lit.cfg.py",
     tools = [
         "//tools:iree-opt",
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
index 45224bc..7e87ce7 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
@@ -14,7 +14,6 @@
   NAME
     lit
   SRCS
-    "apply_builtin_pdl_patterns.mlir"
     "apply_builtin_ukernel_pdl_patterns.mlir"
     "apply_builtin_ukernel_pdl_patterns_driver.mlir"
   TOOLS
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
index 6ddcba2..47794f8 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
@@ -1,10 +1,12 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-rocm-apply-builtin-pdl-patterns{targets=gfx942 enable-tensor-ukernels=true}))' \
-// RUN:   --mlir-print-local-scope --split-input-file %s | FileCheck %s
+// RUN:   --mlir-print-local-scope --split-input-file %s | FileCheck %s --check-prefix=GFX942
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-rocm-apply-builtin-pdl-patterns{targets=gfx950 enable-tensor-ukernels=true}))' \
+// RUN:   --mlir-print-local-scope --split-input-file %s | FileCheck %s --check-prefix=GFX950
 
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @matmul_f8(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @matmul_f8_gfx942(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x128x1024xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -18,19 +20,22 @@
     } -> tensor<1x128x1024xf32>
   return %2 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @matmul_f8
-// CHECK:         linalg.generic
-// CHECK-SAME:      compilation_info = #iree_codegen.compilation_info
-// CHECK-SAME:      lowering_config =
-// CHECK-SAME:      translation_info =
-// CHECK-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
-
+// GFX942-LABEL: @matmul_f8_gfx942
+// GFX942:         linalg.generic
+// GFX942-SAME:      compilation_info = #iree_codegen.compilation_info
+// GFX942-SAME:      lowering_config =
+// GFX942-SAME:      translation_info =
+// GFX942-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @matmul_f8_gfx942
+// GFX950:         linalg.generic
+// GFX950-NOT:      compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 // -----
 
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8(%arg0: tensor<1x128x256xf8E4M3FNUZ>, %arg1: tensor<1024x256xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_gfx942(%arg0: tensor<1x128x256xf8E4M3FNUZ>, %arg1: tensor<1024x256xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x128x1024xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -44,16 +49,19 @@
     } -> tensor<1x128x1024xf32>
   return %2 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f8
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_gfx942
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_gfx942
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 
 // -----
 
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_medium_no_zero_fill(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_medium_no_zero_fill_gfx942(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
   %cst = arith.constant 1.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x128x1024xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -67,9 +75,12 @@
     } -> tensor<1x128x1024xf32>
   return %2 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f8_medium_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx942
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx942
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -78,7 +89,7 @@
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @matmul_f8_dynamic(%arg0: index) -> tensor<1x128x1024xf32> {
+func.func @matmul_f8_dynamic_gfx942(%arg0: index) -> tensor<1x128x1024xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = util.assume.int %arg0<umin = 512, udiv = 128> : index
   %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FNUZ>
@@ -95,12 +106,16 @@
   } -> tensor<1x128x1024xf32>
   return %5 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @matmul_f8_dynamic
-// CHECK:         linalg.generic
-// CHECK-SAME:      compilation_info = #iree_codegen.compilation_info
-// CHECK-SAME:      lowering_config =
-// CHECK-SAME:      translation_info =
-// CHECK-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_dynamic_gfx942
+// GFX942:         linalg.generic
+// GFX942-SAME:      compilation_info = #iree_codegen.compilation_info
+// GFX942-SAME:      lowering_config =
+// GFX942-SAME:      translation_info =
+// GFX942-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @matmul_f8_dynamic_gfx942
+// GFX950:         linalg.generic
+// GFX950-NOT:      compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 
 // -----
 
@@ -109,7 +124,7 @@
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_dynamic_multiple_of(%arg0: tensor<1024x512xf8E4M3FNUZ>, %arg1: index) -> tensor<1x?x1024xf32> {
+func.func @negative_matmul_f8_dynamic_multiple_of_gfx942(%arg0: tensor<1024x512xf8E4M3FNUZ>, %arg1: index) -> tensor<1x?x1024xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = util.assume.int %arg1<udiv = 64> : index
   %1 = tensor.empty(%0) : tensor<1x?x512xf8E4M3FNUZ>
@@ -125,9 +140,12 @@
   } -> tensor<1x?x1024xf32>
   return %4 : tensor<1x?x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f8_dynamic_multiple_of
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx942
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx942
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 
 // -----
 
@@ -136,7 +154,7 @@
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_dynamic_lower_bound(%arg0: index) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_dynamic_lower_bound_gfx942(%arg0: index) -> tensor<1x128x1024xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = util.assume.int %arg0<umin = 256, udiv = 128> : index
   %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FNUZ>
@@ -153,16 +171,19 @@
   } -> tensor<1x128x1024xf32>
   return %5 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f8_dynamic_lower_bound
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx942
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx942
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 
 // -----
 
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_large_no_zero_fill(%arg0: tensor<1x256x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x256x1024xf32> {
+func.func @negative_matmul_f8_large_no_zero_fill_gfx942(%arg0: tensor<1x256x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x256x1024xf32> {
   %cst = arith.constant 1.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x256x1024xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32>
@@ -176,11 +197,212 @@
     } -> tensor<1x256x1024xf32>
   return %2 : tensor<1x256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f8_large_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx942
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx942
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @matmul_f8_gfx950(%arg0: tensor<1x128x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<1x128x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+    ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+      %12 = arith.extf %in : f8E4M3FN to f32
+      %13 = arith.extf %in_4 : f8E4M3FN to f32
+      %14 = arith.mulf %12, %13 : f32
+      %15 = arith.addf %out, %14 : f32
+      linalg.yield %15 : f32
+    } -> tensor<1x128x1024xf32>
+  return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @matmul_f8_gfx950
+// GFX950:         linalg.generic
+// GFX950-SAME:      compilation_info = #iree_codegen.compilation_info
+// GFX950-SAME:      lowering_config =
+// GFX950-SAME:      translation_info =
+// GFX950-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_gfx950
+// GFX942:         linalg.generic
+// GFX942-NOT:      compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_gfx950(%arg0: tensor<1x128x256xf8E4M3FN>, %arg1: tensor<1024x256xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<1x128x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x256xf8E4M3FN>, tensor<1024x256xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+    ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+      %12 = arith.extf %in : f8E4M3FN to f32
+      %13 = arith.extf %in_4 : f8E4M3FN to f32
+      %14 = arith.mulf %12, %13 : f32
+      %15 = arith.addf %out, %14 : f32
+      linalg.yield %15 : f32
+    } -> tensor<1x128x1024xf32>
+  return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_gfx950
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_gfx950
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_medium_no_zero_fill_gfx950(%arg0: tensor<1x128x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+  %cst = arith.constant 1.000000e+00 : f32
+  %0 = tensor.empty() : tensor<1x128x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+    ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+      %12 = arith.extf %in : f8E4M3FN to f32
+      %13 = arith.extf %in_4 : f8E4M3FN to f32
+      %14 = arith.mulf %12, %13 : f32
+      %15 = arith.addf %out, %14 : f32
+      linalg.yield %15 : f32
+    } -> tensor<1x128x1024xf32>
+  return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx950
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx950
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// -----
+
+// Through a constraint, the inner dimension is known to be a multiple of 128 and has a lower bound of 512, so should be matched.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @matmul_f8_dynamic_gfx950(%arg0: index) -> tensor<1x128x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = util.assume.int %arg0<umin = 512, udiv = 128> : index
+  %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FN>
+  %2 = tensor.empty(%0) : tensor<1024x?xf8E4M3FN>
+  %3 = tensor.empty() : tensor<1x128x1024xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+  %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<1x128x?xf8E4M3FN>, tensor<1024x?xf8E4M3FN>) outs(%4 : tensor<1x128x1024xf32>) {
+  ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+    %6 = arith.extf %in : f8E4M3FN to f32
+    %7 = arith.extf %in_0 : f8E4M3FN to f32
+    %8 = arith.mulf %6, %7 : f32
+    %9 = arith.addf %out, %8 : f32
+    linalg.yield %9 : f32
+  } -> tensor<1x128x1024xf32>
+  return %5 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @matmul_f8_dynamic_gfx950
+// GFX950:         linalg.generic
+// GFX950-SAME:      compilation_info = #iree_codegen.compilation_info
+// GFX950-SAME:      lowering_config =
+// GFX950-SAME:      translation_info =
+// GFX950-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_dynamic_gfx950
+// GFX942:         linalg.generic
+// GFX942-NOT:      compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+// The dynamic dimension is not a multiple of 128.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_dynamic_multiple_of_gfx950(%arg0: tensor<1024x512xf8E4M3FN>, %arg1: index) -> tensor<1x?x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = util.assume.int %arg1<udiv = 64> : index
+  %1 = tensor.empty(%0) : tensor<1x?x512xf8E4M3FN>
+  %2 = tensor.empty(%0) : tensor<1x?x1024xf32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x?x1024xf32>) -> tensor<1x?x1024xf32>
+  %4 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %arg0 : tensor<1x?x512xf8E4M3FN>, tensor<1024x512xf8E4M3FN>) outs(%3 : tensor<1x?x1024xf32>) {
+  ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+    %5 = arith.extf %in : f8E4M3FN to f32
+    %6 = arith.extf %in_0 : f8E4M3FN to f32
+    %7 = arith.mulf %5, %6 : f32
+    %8 = arith.addf %out, %7 : f32
+    linalg.yield %8 : f32
+  } -> tensor<1x?x1024xf32>
+  return %4 : tensor<1x?x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx950
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx950
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+// The dynamic dimension is a multiple of 128, but doesn't have a lower bound of 512.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_dynamic_lower_bound_gfx950(%arg0: index) -> tensor<1x128x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = util.assume.int %arg0<umin = 256, udiv = 128> : index
+  %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FN>
+  %2 = tensor.empty(%0) : tensor<1024x?xf8E4M3FN>
+  %3 = tensor.empty() : tensor<1x128x1024xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+  %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<1x128x?xf8E4M3FN>, tensor<1024x?xf8E4M3FN>) outs(%4 : tensor<1x128x1024xf32>) {
+  ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+    %6 = arith.extf %in : f8E4M3FN to f32
+    %7 = arith.extf %in_0 : f8E4M3FN to f32
+    %8 = arith.mulf %6, %7 : f32
+    %9 = arith.addf %out, %8 : f32
+    linalg.yield %9 : f32
+  } -> tensor<1x128x1024xf32>
+  return %5 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx950
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx950
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_large_no_zero_fill_gfx950(%arg0: tensor<1x256x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x256x1024xf32> {
+  %cst = arith.constant 1.000000e+00 : f32
+  %0 = tensor.empty() : tensor<1x256x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x256x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x256x1024xf32>) {
+    ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+      %12 = arith.extf %in : f8E4M3FN to f32
+      %13 = arith.extf %in_4 : f8E4M3FN to f32
+      %14 = arith.mulf %12, %13 : f32
+      %15 = arith.addf %out, %14 : f32
+      linalg.yield %15 : f32
+    } -> tensor<1x256x1024xf32>
+  return %2 : tensor<1x256x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx950
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx950
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// -----
+
 
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
@@ -199,9 +421,12 @@
     } -> tensor<256x1024xf32>
   return %2 : tensor<256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f16
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -222,9 +447,12 @@
     } -> tensor<1024x1024xf32>
   return %2 : tensor<1024x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f16_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -245,9 +473,12 @@
     } -> tensor<1x128x1024xf32>
   return %2 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -268,9 +499,12 @@
     } -> tensor<1x256x1024xf32>
   return %2 : tensor<1x256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -291,9 +525,12 @@
     } -> tensor<256x1024xf32>
   return %2 : tensor<256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_bf16
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -314,9 +551,12 @@
     } -> tensor<1024x1024xf32>
   return %2 : tensor<1024x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_bf16_large_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_large_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_large_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -342,9 +582,12 @@
   } -> tensor<1x256x1024xf32>
   return %5 : tensor<1x256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_bf16_dynamic_lower_bound
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_bf16_dynamic_lower_bound
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_bf16_dynamic_lower_bound
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
 
 // -----
 
@@ -365,9 +608,12 @@
     } -> tensor<1x128x1024xf32>
   return %2 : tensor<1x128x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
 
 // -----
 
@@ -388,6 +634,9 @@
     } -> tensor<1x256x1024xf32>
   return %2 : tensor<1x256x1024xf32>
 }
-// CHECK-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
-// CHECK-NOT:     compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
+// GFX942-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
+// GFX950-NOT:     compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
index 30a8340..e3197ef 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
@@ -47,7 +47,7 @@
     return %2 : tensor<1x128x1024xf32>
   }
 }
-// CHECK-LABEL: util.func private @pingpong_medium_f8_expanded
+// CHECK-LABEL: util.func private @pingpong_medium_f8E4M3FNUZ_expanded
 // CHECK:         iree_codegen.inner_tiled
 
 // -----
@@ -88,8 +88,8 @@
 // CHECK-SAME:      compilation_info = #iree_codegen.compilation_info
 // CHECK-SAME:      lowering_config =
 // CHECK-SAME:      translation_info =
-// CHECK-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f8_expanded", tensor>
-// CHECK-LABEL: util.func private @pingpong_large_f8_expanded
+// CHECK-SAME:      iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>
+// CHECK-LABEL: util.func private @pingpong_large_f8E4M3FNUZ_expanded
 // CHECK:         iree_codegen.inner_tiled
 
 // -----
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
index a5c4e18..43e205a 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
@@ -23,15 +23,7 @@
     inline = True,
 )
 
-# Target archs for specialization patternsets. https://llvm.org/docs/AMDGPUUsage.html#processors
-gpu_archs = [
-    "gfx942",
-]
-
-ukernel_patterns_mlir_files = [
-    "ukernel_patterns_%s.mlir" % gpu_arch
-    for gpu_arch in gpu_archs
-]
+ukernel_patterns_mlir_files = glob(["ukernel_patterns_*.mlir"])
 
 iree_c_embed_data(
     name = "iree_mlir_ukernel_patterns_amdgpu",
@@ -57,7 +49,8 @@
     srcs = [
         "iree_uk_amdgpu_matmul_bf16.mlir",
         "iree_uk_amdgpu_matmul_f16.mlir",
-        "iree_uk_amdgpu_matmul_f8.mlir",
+        "iree_uk_amdgpu_matmul_f8E4M3FN.mlir",
+        "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir",
     ],
     c_file_output = "iree_mlir_ukernels_amdgpu.c",
     flatten = True,
@@ -71,7 +64,8 @@
     srcs = [
         "iree_uk_amdgpu_matmul_bf16.mlir",
         "iree_uk_amdgpu_matmul_f16.mlir",
-        "iree_uk_amdgpu_matmul_f8.mlir",
+        "iree_uk_amdgpu_matmul_f8E4M3FN.mlir",
+        "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir",
     ],
     cfg = "//compiler:lit.cfg.py",
     tools = [
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
index 73815c9..057162f 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
@@ -14,11 +14,12 @@
   return()
 endif()
 
+file(GLOB _GLOB_UKERNEL_PATTERNS_X_MLIR LIST_DIRECTORIES false RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS ukernel_patterns_*.mlir)
 iree_c_embed_data(
   NAME
     iree_mlir_ukernel_patterns_amdgpu
   SRCS
-    "ukernel_patterns_gfx942.mlir"
+    "${_GLOB_UKERNEL_PATTERNS_X_MLIR}"
   C_FILE_OUTPUT
     "iree_mlir_ukernel_patterns_amdgpu.c"
   H_FILE_OUTPUT
@@ -31,7 +32,7 @@
   NAME
     verify_mlir_ukernel_patterns_amdgpu
   SRCS
-    "ukernel_patterns_gfx942.mlir"
+    "${_GLOB_UKERNEL_PATTERNS_X_MLIR}"
   TOOLS
     iree-opt
 )
@@ -42,7 +43,8 @@
   SRCS
     "iree_uk_amdgpu_matmul_bf16.mlir"
     "iree_uk_amdgpu_matmul_f16.mlir"
-    "iree_uk_amdgpu_matmul_f8.mlir"
+    "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+    "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
   C_FILE_OUTPUT
     "iree_mlir_ukernels_amdgpu.c"
   H_FILE_OUTPUT
@@ -57,7 +59,8 @@
   SRCS
     "iree_uk_amdgpu_matmul_bf16.mlir"
     "iree_uk_amdgpu_matmul_f16.mlir"
-    "iree_uk_amdgpu_matmul_f8.mlir"
+    "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+    "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
   TOOLS
     iree-opt
 )
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
similarity index 73%
copy from compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
copy to compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
index d808b97..1f4bcdf 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
@@ -1,18 +1,18 @@
 //  RUN: iree-opt %s
 
-!in_ty_f8 = tensor<256x?xf8E4M3FNUZ>
-!exp_in_ty_f8 = tensor<1x256x?xf8E4M3FNUZ>
-!block_in_f8 = tensor<256x128xf8E4M3FNUZ>
-!exp_block_in_f8 = tensor<1x256x128xf8E4M3FNUZ>
-!flat_shared_f8 = memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!shared_f8 = memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!shared_exp_f8 = memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!in_ty_f8 = tensor<256x?xf8E4M3FN>
+!exp_in_ty_f8 = tensor<1x256x?xf8E4M3FN>
+!block_in_f8 = tensor<256x128xf8E4M3FN>
+!exp_block_in_f8 = tensor<1x256x128xf8E4M3FN>
+!flat_shared_f8 = memref<32768xf8E4M3FN, #gpu.address_space<workgroup>>
+!shared_f8 = memref<256x128xf8E4M3FN, #gpu.address_space<workgroup>>
+!shared_exp_f8 = memref<16x16x4x32xf8E4M3FN, #gpu.address_space<workgroup>>
 
-!mexp_in_ty_f8 = tensor<1x128x?xf8E4M3FNUZ>
-!mexp_block_in_f8 = tensor<1x128x128xf8E4M3FNUZ>
-!mflat_shared_f8 = memref<16384xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!mexp_in_ty_f8 = tensor<1x128x?xf8E4M3FN>
+!mexp_block_in_f8 = tensor<1x128x128xf8E4M3FN>
+!mflat_shared_f8 = memref<16384xf8E4M3FN, #gpu.address_space<workgroup>>
+!mshared_f8 = memref<128x128xf8E4M3FN, #gpu.address_space<workgroup>>
+!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FN, #gpu.address_space<workgroup>>
 
 #contraction_accesses = [
  affine_map<(i, j, k) -> (i, k)>,
@@ -20,7 +20,7 @@
  affine_map<(i, j, k) -> (i, j)>
 ]
 
-util.func @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
+util.func @pingpong_medium_f8E4M3FN_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -32,7 +32,7 @@
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
-  %cst = arith.constant 0.0 : f8E4M3FNUZ
+  %cst = arith.constant 0.0 : f8E4M3FN
   %lhs_shared_base = memref.alloc() : !mflat_shared_f8
   %rhs_shared_base = memref.alloc() : !flat_shared_f8
 
@@ -52,16 +52,16 @@
   scf.forall (%id) in (1024) {
     %delin:2 = affine.delinearize_index %id into (128, 8) : index, index
     %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
-    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
+    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
   } {mapping = [#gpu.thread<linear_dim_0>]}
   scf.forall (%id) in (2048) {
     %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
     %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
-    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
   } {mapping = [#gpu.thread<linear_dim_0>]}
 
   %lhs_shared_expand = memref.expand_shape %lhs_shared [[0, 1], [2, 3]] output_shape [8, 16, 4, 32] : !mshared_f8 into !mshared_exp_f8
@@ -100,39 +100,39 @@
     }
     %3 = scf.for %i = %c128 to %dim step %c128 iter_args(%iter = %2) -> vector<4x4x1x4xf32> {
 
-      %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-      %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-      %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
-      %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+      %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+      %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+      %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+      %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
 
       rocdl.sched.barrier 0
 
       // Global loads of rhs.
       %rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
-      %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+      %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
 
       rocdl.sched.barrier 0
 
-      %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-      %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-      %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
-      %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+      %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+      %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+      %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+      %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
 
       rocdl.sched.barrier 0
 
       // Global loads of lhs.
       %lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 128] [1, 1, 1] : !mexp_in_ty_f8 to !mexp_block_in_f8
-      %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+      %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -141,20 +141,20 @@
       %dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%iter) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
       rocdl.sched.barrier 0
 
-      vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+      vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
 
-      vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
-      vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
+      vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
+      vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -163,8 +163,8 @@
       %dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
@@ -177,27 +177,27 @@
     }
 
     // Epilogue
-    %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-    %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-    %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
-    %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+    %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+    %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+    %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+    %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
 
     %dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%3) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
 
-    %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-    %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
-    %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
-    %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+    %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+    %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+    %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+    %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
 
     %dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
 
     %tp = vector.transpose %dot2, [0, 2, 1, 3] : vector<4x4x1x4xf32> to vector<4x1x4x4xf32>
     %empty = tensor.empty() : tensor<1x4x1x4x4xf32>
@@ -210,7 +210,7 @@
   util.return %collapse : tensor<1x128x256xf32>
 }
 
-util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
+util.func private @pingpong_large_f8E4M3FN_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -222,7 +222,7 @@
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
-  %cst = arith.constant 0.0 : f8E4M3FNUZ
+  %cst = arith.constant 0.0 : f8E4M3FN
   %lhs_shared_base = memref.alloc() : !flat_shared_f8
   %rhs_shared_base = memref.alloc() : !flat_shared_f8
 
@@ -242,16 +242,16 @@
   scf.forall (%id) in (2048) {
     %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
     %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
-    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+    %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+    %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+    vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
   } {mapping = [#gpu.thread<linear_dim_0>]}
   scf.forall (%id) in (2048) {
     %delin:2 = affine.delinearize_index %id into (256, 8) : index, index
     %vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
-    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+    %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+    %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+    vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
   } {mapping = [#gpu.thread<linear_dim_0>]}
 
   %lhs_shared_expand = memref.expand_shape %lhs_shared [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : !shared_f8 into !shared_exp_f8
@@ -288,17 +288,17 @@
 
       // Global loads of lhs.
       %lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 256, 128] [1, 1, 1] : !exp_in_ty_f8 to !exp_block_in_f8
-      %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %lhs_thread_2 = tensor.extract_slice %lhs_block [0, %glb2, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_2 = vector.transfer_read %lhs_thread_2 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %lhs_thread_3 = tensor.extract_slice %lhs_block [0, %glb3, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
-      %lhs_vec_local_3 = vector.transfer_read %lhs_thread_3 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+      %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %lhs_thread_2 = tensor.extract_slice %lhs_block [0, %glb2, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_2 = vector.transfer_read %lhs_thread_2 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %lhs_thread_3 = tensor.extract_slice %lhs_block [0, %glb3, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+      %lhs_vec_local_3 = vector.transfer_read %lhs_thread_3 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
 
-      %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-      %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+      %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -307,8 +307,8 @@
       %dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0, %rhs_vec_0) outs(%iter) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
@@ -316,17 +316,17 @@
 
       // Global loads of rhs.
       %rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
-      %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
-      %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
-      %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+      %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+      %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+      %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
 
-      %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-      %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+      %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -335,18 +335,18 @@
       %dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1, %rhs_vec_1) outs(%dot0) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
       rocdl.sched.barrier 0
 
-      %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-      %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+      %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
 
-      %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-      %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+      %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -355,22 +355,22 @@
       %dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2, %rhs_vec_2) outs(%dot1) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
       rocdl.sched.barrier 0
 
-      vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %lhs_vec_local_2, %lhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %lhs_vec_local_3, %lhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+      vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %lhs_vec_local_2, %lhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %lhs_vec_local_3, %lhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
 
-      vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
-      vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+      vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+      vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
 
       gpu.barrier
       rocdl.sched.barrier 0
@@ -379,8 +379,8 @@
       %dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3, %rhs_vec_3) outs(%dot2) {
         indexing_maps = #contraction_accesses,
         iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-      } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+        kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+      } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
 
       rocdl.s.setprio 0
       gpu.barrier
@@ -393,34 +393,34 @@
     }
 
     // Epilogue
-    %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-    %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+    %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+    %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
     %dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0, %rhs_vec_0) outs(%3) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
-    %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-    %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+    %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+    %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
     %dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1, %rhs_vec_1) outs(%dot0) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
-    %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-    %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+    %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+    %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
     %dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2, %rhs_vec_2) outs(%dot1) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
-    %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
-    %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+    %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+    %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
     %dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3, %rhs_vec_3) outs(%dot2) {
       indexing_maps = #contraction_accesses,
       iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
-    } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+    } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
 
     %tp = vector.transpose %dot3, [0, 2, 1, 3] : vector<8x4x1x4xf32> to vector<8x1x4x4xf32>
     %empty = tensor.empty() : tensor<1x8x1x4x4xf32>
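
For reference, each accumulation step in the rewritten body boils down to a single iree_codegen.inner_tiled op whose element type and MFMA kind are the only per-target differences. The following standalone form is a hedged sketch for illustration only: the function name and the local #contraction_accesses alias are assumptions rather than part of the patch, while the vector shapes and the MFMA_F32_16x16x32_F8E4M3FN kind are taken from the hunks above.

// Hedged sketch (not part of the patch): one MFMA accumulation step using the
// OCP f8E4M3FN element type targeted on gfx950.
#contraction_accesses = [
  affine_map<(i, j, k) -> (i, k)>,
  affine_map<(i, j, k) -> (j, k)>,
  affine_map<(i, j, k) -> (i, j)>
]
func.func @mfma_step_f8E4M3FN(%lhs: vector<4x2x1x8xf8E4M3FN>,
                              %rhs: vector<4x2x1x8xf8E4M3FN>,
                              %acc: vector<4x4x1x4xf32>) -> vector<4x4x1x4xf32> {
  %dot = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
    indexing_maps = #contraction_accesses,
    iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
  } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
  return %dot : vector<4x4x1x4xf32>
}
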
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
similarity index 98%
rename from compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
rename to compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
index d808b97..c383d6e 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
@@ -20,7 +20,7 @@
  affine_map<(i, j, k) -> (i, j)>
 ]
 
-util.func @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
+util.func @pingpong_medium_f8E4M3FNUZ_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -210,7 +210,7 @@
   util.return %collapse : tensor<1x128x256xf32>
 }
 
-util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
+util.func private @pingpong_large_f8E4M3FNUZ_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
index ce24d04..9a3afdc 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
@@ -1,10 +1,10 @@
 // RUN: iree-opt -allow-unregistered-dialect %s
 
-// F8 Patterns
+// f8E4M3FNUZ Patterns
 
 // This pattern matches a medium-sized expanded matmul-like operation and
 // annotates it with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f8_medium_expanded : benefit(1) {
+pdl.pattern @annotate_matmul_like_f8E4M3FNUZ_medium_expanded : benefit(1) {
   %elemtypes = pdl.attribute = [f8E4M3FNUZ, f8E4M3FNUZ, f32]
   %imaps = pdl.attribute = [
     affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
@@ -58,7 +58,7 @@
     // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
     // This modifies the operation in-place.
 
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
 
     %config_name = pdl.attribute = "compilation_info"
@@ -82,7 +82,7 @@
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
 
     %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8.mlir"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
   }
 }
@@ -90,7 +90,7 @@
 // This pattern matches a large expanded f8 matmul-like operation and annotates it
 // with ukernel descriptor and configuration attributes. This is preferred over the
 // medium-sized ukernel.
-pdl.pattern @annotate_matmul_like_f8_large_expanded : benefit(2) {
+pdl.pattern @annotate_matmul_like_f8E4M3FNUZ_large_expanded : benefit(2) {
   %elemtypes = pdl.attribute = [f8E4M3FNUZ, f8E4M3FNUZ, f32]
   %imaps = pdl.attribute = [
     affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
@@ -150,7 +150,7 @@
     // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
     // This modifies the operation in-place.
 
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8_expanded", tensor>
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
 
     %config_name = pdl.attribute = "compilation_info"
@@ -174,529 +174,7 @@
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
 
     %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// F16 Patterns
-
-// This pattern matches a large f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f16_large : benefit(1) {
-  %elemtypes = pdl.attribute = [f16, f16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2) -> (d0, d2)>,
-    affine_map<(d0, d1, d2) -> (d1, d2)>,
-    affine_map<(d0, d1, d2) -> (d0, d1)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 256 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c1024 = pdl.attribute = 1024
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// This pattern matches a medium-sized f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f16_medium_expanded : benefit(1) {
-  %elemtypes = pdl.attribute = [f16, f16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 128 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c128 = pdl.attribute = 128
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c1024 = pdl.attribute = 1024
-
-  // TODO: Kernel specialization is needed to apply this strategy selectively at
-  // runtime. Additionally model exports don't specify lower bounds so it is
-  // impossible to use this strategy with this check.
-  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f16_expanded", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// This pattern matches a medium-sized f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes. This is preferred over the
-// medium-sized ukernel.
-pdl.pattern @annotate_matmul_like_f16_large_expanded : benefit(2) {
-  %elemtypes = pdl.attribute = [f16, f16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 256 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c1024 = pdl.attribute = 1024
-
-  // TODO: Kernel specialization is needed to apply this strategy selectively at
-  // runtime. Additionally model exports don't specify lower bounds so it is
-  // impossible to use this strategy with this check.
-  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16_expanded", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// BF16 Patterns
-
-// This pattern matches a bf16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_bf16_large : benefit(1) {
-  %elemtypes = pdl.attribute = [bf16, bf16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2) -> (d0, d2)>,
-    affine_map<(d0, d1, d2) -> (d1, d2)>,
-    affine_map<(d0, d1, d2) -> (d0, d1)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 256 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c1024 = pdl.attribute = 1024
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// This pattern matches an expanded bf16 matmul-like operation of medium size and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_bf16_medium_expanded : benefit(1) {
-  %elemtypes = pdl.attribute = [bf16, bf16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 128 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c128 = pdl.attribute = 128
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c4 = pdl.attribute = 4
-  %c512 = pdl.attribute = 512
-  %c1024 = pdl.attribute = 1024
-
-  // TODO: Kernel specialization is needed to apply this strategy selectively at
-  // runtime. Additionally model exports don't specify lower bounds so it is
-  // impossible to use this strategy with this check.
-  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_bf16_expanded", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-  }
-}
-
-// This pattern matches an expanded bf16 matmul-like operation of large size and annotates it
-// with ukernel descriptor and configuration attributes. This is preferred over the medium
-// strategy.
-pdl.pattern @annotate_matmul_like_bf16_large_expanded : benefit(2) {
-  %elemtypes = pdl.attribute = [bf16, bf16, f32]
-  %imaps = pdl.attribute = [
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
-    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-  ]
-
-  %lhs_type = pdl.type
-  %rhs_type = pdl.type
-  %out_type = pdl.type
-  %zero_type = pdl.type : f32
-
-  %lhs = pdl.operand : %lhs_type
-  %rhs = pdl.operand : %rhs_type
-  %out_init = pdl.operand : %out_type
-
-  %zero_val = pdl.attribute = 0. : f32
-  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
-  %zero = pdl.result 0 of %zero_op
-  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  %fill = pdl.result 0 of %fill_op
-
-  // Match the a matmul-like generic with above indexing maps.
-  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
-  pdl.apply_native_constraint "matchContraction"(
-        %generic_op, %elemtypes, %imaps
-        : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-  %attr_name = pdl.attribute = "iree_codegen.ukernel"
-  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
-  // M % 256 == 0, K % 64 == 0, N % 256 == 0
-  %empty = pdl.attribute = {}
-  %c0 = pdl.attribute = 0
-  %c1 = pdl.attribute = 1
-  %c2 = pdl.attribute = 2
-  %c64 = pdl.attribute = 64
-  %c256 = pdl.attribute = 256
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
-  // M, N >= 1024, K >= 256
-  %c1024 = pdl.attribute = 1024
-  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
-  pdl.rewrite {
-    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
-    // This modifies the operation in-place.
-
-    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %config_name = pdl.attribute = "compilation_info"
-    %config = pdl.attribute = #iree_codegen.compilation_info<
-      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
-      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
-        workgroup_size = [512, 1, 1] subgroup_size = 64,
-        // This strategy uses the maximum amount of possible shared memory on
-        // all gfx942 architectures so shared memory padding to reduce bank
-        // conflicts must be disabled. Also prefetching is done manually in the
-        // above and is disabled here as well.
-        {gpu_pipeline_options =
-          #iree_gpu.pipeline_options<
-            prefetch_shared_memory = false,
-            no_reduce_shared_memory_bank_conflicts = true>,
-        // This strategy requires 2 waves per SIMD.
-          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
-    >
-    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
-    %builtin_attr = pdl.attribute = "rocm.builtin_name"
-    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
     pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
   }
 }
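
The patterns in this file only annotate the matched contraction; the later ukernel lowering is driven by the attributes they attach. As a hedged illustration (shapes, function name, and region body are placeholders, and the compilation_info attribute is omitted for brevity), a contraction matched by @annotate_matmul_like_f8E4M3FNUZ_large_expanded would end up carrying attributes along these lines:

// Hedged sketch (not part of the patch): an annotated matmul-like generic after
// the rewrite. Shapes satisfy the pattern's divisibility and bound constraints.
#map_lhs = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map_rhs = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map_out = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
func.func @annotated_matmul_like(%lhs: tensor<1x2048x4096xf8E4M3FNUZ>,
                                 %rhs: tensor<2048x4096xf8E4M3FNUZ>,
                                 %out: tensor<1x2048x2048xf32>) -> tensor<1x2048x2048xf32> {
  %zero = arith.constant 0.0 : f32
  %fill = linalg.fill ins(%zero : f32) outs(%out : tensor<1x2048x2048xf32>) -> tensor<1x2048x2048xf32>
  %res = linalg.generic {
      indexing_maps = [#map_lhs, #map_rhs, #map_out],
      iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
      ins(%lhs, %rhs : tensor<1x2048x4096xf8E4M3FNUZ>, tensor<2048x4096xf8E4M3FNUZ>)
      outs(%fill : tensor<1x2048x2048xf32>)
      attrs = {
        iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>,
        rocm.builtin_name = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
      } {
  ^bb0(%a: f8E4M3FNUZ, %b: f8E4M3FNUZ, %acc: f32):
    %ae = arith.extf %a : f8E4M3FNUZ to f32
    %be = arith.extf %b : f8E4M3FNUZ to f32
    %mul = arith.mulf %ae, %be : f32
    %sum = arith.addf %acc, %mul : f32
    linalg.yield %sum : f32
  } -> tensor<1x2048x2048xf32>
  return %res : tensor<1x2048x2048xf32>
}
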
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir
new file mode 100644
index 0000000..d45660d
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir
@@ -0,0 +1,523 @@
+// RUN: iree-opt -allow-unregistered-dialect %s
+
+// F16 Patterns
+
+// This pattern matches a large f16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f16_large : benefit(1) {
+  %elemtypes = pdl.attribute = [f16, f16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2) -> (d0, d2)>,
+    affine_map<(d0, d1, d2) -> (d1, d2)>,
+    affine_map<(d0, d1, d2) -> (d0, d1)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 256 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // M, N >= 1024, K >= 256
+  %c1024 = pdl.attribute = 1024
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
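+    // The descriptor names the ukernel entry point expected to be provided by
+    // the builtin module referenced further below ("pingpong_large_f16"), and
+    // `tensor` marks it as a tensor-level ukernel.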
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
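+    // The "rocm.builtin_name" annotation records which builtin MLIR file to
+    // pull in (presumably the one defining the pingpong ukernel bodies) so
+    // that the descriptor above resolves to an actual function.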
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// This pattern matches a medium-sized expanded f16 matmul-like operation and
+// annotates it with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f16_medium_expanded : benefit(1) {
+  %elemtypes = pdl.attribute = [f16, f16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 128 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c128 = pdl.attribute = 128
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 256
+  %c1024 = pdl.attribute = 1024
+
+  // TODO: Kernel specialization is needed to apply this strategy selectively at
+  // runtime. Additionally model exports don't specify lower bounds so it is
+  // impossible to use this strategy with this check.
+  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f16_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// This pattern matches a large expanded f16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the
+// medium-sized ukernel.
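+// Note: the PDL driver prefers higher-benefit patterns, so benefit(2) lets
+// this large-expanded strategy win over the benefit(1) medium variant when
+// the constraints of both are satisfied.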
+pdl.pattern @annotate_matmul_like_f16_large_expanded : benefit(2) {
+  %elemtypes = pdl.attribute = [f16, f16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 256 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 256
+  %c1024 = pdl.attribute = 1024
+
+  // TODO: Kernel specialization is needed to apply this strategy selectively at
+  // runtime. Additionally model exports don't specify lower bounds so it is
+  // impossible to use this strategy with this check.
+  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// BF16 Patterns
+
+// This pattern matches a large bf16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_bf16_large : benefit(1) {
+  %elemtypes = pdl.attribute = [bf16, bf16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2) -> (d0, d2)>,
+    affine_map<(d0, d1, d2) -> (d1, d2)>,
+    affine_map<(d0, d1, d2) -> (d0, d1)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 256 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // M, N >= 1024, K >= 256
+  %c1024 = pdl.attribute = 1024
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// This pattern matches an expanded bf16 matmul-like operation of medium size and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_bf16_medium_expanded : benefit(1) {
+  %elemtypes = pdl.attribute = [bf16, bf16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 128 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c128 = pdl.attribute = 128
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 256
+  %c4 = pdl.attribute = 4
+  %c512 = pdl.attribute = 512
+  %c1024 = pdl.attribute = 1024
+
+  // TODO: Kernel specialization is needed to apply this strategy selectively at
+  // runtime. Additionally model exports don't specify lower bounds so it is
+  // impossible to use this strategy with this check.
+  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_bf16_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// This pattern matches an expanded bf16 matmul-like operation of large size and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the medium
+// strategy.
+pdl.pattern @annotate_matmul_like_bf16_large_expanded : benefit(2) {
+  %elemtypes = pdl.attribute = [bf16, bf16, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 256 == 0, K % 64 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c64 = pdl.attribute = 64
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 256
+  %c1024 = pdl.attribute = 1024
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir
new file mode 100644
index 0000000..c140d97
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir
@@ -0,0 +1,180 @@
+// RUN: iree-opt -allow-unregistered-dialect %s
+
+// f8E4M3FN Patterns
+
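+// gfx950 targets the OCP f8E4M3FN element type here, whereas the gfx942 f8
+// patterns in this PR reference f8E4M3FNUZ; that difference is presumably why
+// these patterns live in a gfx950-only file rather than the shared one.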
+// This pattern matches a medium-sized expanded f8E4M3FN matmul-like operation
+// and annotates it with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f8E4M3FN_medium_expanded : benefit(1) {
+  %elemtypes = pdl.attribute = [f8E4M3FN, f8E4M3FN, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 128 == 0, K % 128 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c128 = pdl.attribute = 128
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 512
+  %c512 = pdl.attribute = 512
+  %c1024 = pdl.attribute = 1024
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c512, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{
+        workgroup = [1, 128, 256, 0]
+      }>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
+
+// This pattern matches a large expanded f8 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the
+// medium-sized ukernel.
+pdl.pattern @annotate_matmul_like_f8E4M3FN_large_expanded : benefit(2) {
+  %elemtypes = pdl.attribute = [f8E4M3FN, f8E4M3FN, f32]
+  %imaps = pdl.attribute = [
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+    affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+  ]
+
+  %lhs_type = pdl.type
+  %rhs_type = pdl.type
+  %out_type = pdl.type
+  %zero_type = pdl.type : f32
+
+  %lhs = pdl.operand : %lhs_type
+  %rhs = pdl.operand : %rhs_type
+  %out_init = pdl.operand : %out_type
+
+  %zero_val = pdl.attribute = 0. : f32
+  %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+  %zero = pdl.result 0 of %zero_op
+  %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  %fill = pdl.result 0 of %fill_op
+
+  // Match a matmul-like generic with the above indexing maps.
+  %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+  pdl.apply_native_constraint "matchContraction"(
+        %generic_op, %elemtypes, %imaps
+        : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+  %attr_name = pdl.attribute = "iree_codegen.ukernel"
+  pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+  // M % 256 == 0, K % 128 == 0, N % 256 == 0
+  %empty = pdl.attribute = {}
+  %c0 = pdl.attribute = 0
+  %c1 = pdl.attribute = 1
+  %c2 = pdl.attribute = 2
+  %c128 = pdl.attribute = 128
+  %c256 = pdl.attribute = 256
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+  // N >= 1024, K >= 512
+  %c512 = pdl.attribute = 512
+  %c1024 = pdl.attribute = 1024
+
+  // TODO: Kernel specialization is needed to apply this strategy selectively at
+  // runtime. Additionally model exports don't specify lower bounds so it is
+  // impossible to use this strategy with this check.
+  // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+  pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c512, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+  pdl.rewrite {
+    // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+    // This modifies the operation in-place.
+
+    %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FN_expanded", tensor>
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %config_name = pdl.attribute = "compilation_info"
+    %config = pdl.attribute = #iree_codegen.compilation_info<
+      lowering_config = #iree_gpu.lowering_config<{
+        workgroup = [1, 256, 256, 0]
+      }>,
+      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+        workgroup_size = [512, 1, 1] subgroup_size = 64,
+        // This strategy uses the maximum possible amount of shared memory on
+        // all gfx9 architectures, so shared memory padding to reduce bank
+        // conflicts must be disabled. Prefetching is done manually by the
+        // ukernel, so it is disabled here as well.
+        {gpu_pipeline_options =
+          #iree_gpu.pipeline_options<
+            prefetch_shared_memory = false,
+            no_reduce_shared_memory_bank_conflicts = true>,
+        // This strategy requires 2 waves per SIMD.
+          llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+    >
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+    %builtin_attr = pdl.attribute = "rocm.builtin_name"
+    %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+  }
+}
diff --git a/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir b/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
index 7b5b30c..54adc75 100644
--- a/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
+++ b/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
@@ -48,7 +48,7 @@
 // CHECK:      func.func @matmul_f8
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK:        linalg.generic
-// CHECK-SAME:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// CHECK-SAME:     iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
 // CHECK-SAME:     lowering_config = #iree_gpu.lowering_config
-// CHECK:        util.func private @pingpong_medium_f8_expanded
+// CHECK:        util.func private @pingpong_medium_f8E4M3FNUZ_expanded
 // CHECK:          iree_codegen.inner_tiled