Add gfx950 ukernel patterns (#21856)
This PR extends our existing GFX942 MLIR ukernel patterns to GFX950 for
matmul operations with F16, F8, and BF16 data types.
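
Pattern-set selection is now name-driven: instead of hardcoding `ukernel_patterns_gfx942.mlir`, the pass asks the ROCM dialect for all builtin names and keeps those whose name embeds the requested target. Below is a minimal standalone sketch of that filtering, using `std::regex` rather than the pass's `llvm::Regex`/`llvm::filter_to_vector`, with illustrative file names (the gfx950 pattern file is assumed to follow the existing `ukernel_patterns_<target>.mlir` convention):

```cpp
// Standalone sketch only; the pass itself uses llvm::Regex and
// llvm::filter_to_vector over ROCMDialect::getBuiltinNames().
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  // Hypothetical builtin names as the ROCM dialect would report them.
  std::vector<std::string> builtins = {
      "ukernel_patterns_gfx942.mlir",
      "ukernel_patterns_gfx950.mlir",
      "iree_uk_amdgpu_matmul_f16.mlir", // not a pattern set; never selected
  };
  for (std::string target : {"gfx942", "gfx950"}) {
    // Same regex shape as the pass: ukernel_patterns(_.*)*<target>(_.*)*\.mlir
    std::regex re("ukernel_patterns(_.*)*" + target + "(_.*)*\\.mlir");
    for (const std::string &name : builtins)
      if (std::regex_search(name, re))
        std::cout << target << " -> " << name << "\n";
  }
}
```

This keeps the Bazel/CMake globs over `ukernel_patterns_*.mlir` and the C++ filter in sync, so adding a new target's pattern file does not require touching the pass.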
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
index 0c4e3f2..2c0a8f6 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.cpp
@@ -58,4 +58,8 @@
return builtins.getFile(name);
}
+SmallVector<StringRef> ROCMDialect::getBuiltinNames() {
+ return llvm::to_vector(builtins.getMap().keys());
+}
+
} // namespace mlir::iree_compiler::IREE::ROCM
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
index 720821f..01243e1 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMDialect.td
@@ -31,6 +31,8 @@
bool hasBuiltin(StringRef name);
/// Helper for getting a builtin.
std::optional<StringRef> getBuiltin(StringRef name);
+ /// Helper for getting the builtin names.
+ SmallVector<StringRef> getBuiltinNames();
/// Returns the loaded builtin module for the `path`. If the module has
/// already been loaded in the past, returns the memoized module without
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
index 2ff03ee..fbe6581 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp
@@ -19,6 +19,7 @@
#include "iree/compiler/Utils/ShapeUtils.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Regex.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
@@ -203,6 +204,14 @@
namespace {
+SmallVector<StringRef> filterUkernelPatternsByTarget(ArrayRef<StringRef> names,
+ StringRef target) {
+ std::string pattern =
+ "ukernel_patterns(_.*)*" + target.str() + "(_.*)*\\.mlir";
+ llvm::Regex regex(pattern);
+ return llvm::filter_to_vector(
+      names, [&regex](StringRef name) { return regex.match(name); });
+}
class ApplyBuiltinPDLPatternsPass
: public iree_compiler::IREE::ROCM::impl::ApplyBuiltinPDLPatternsPassBase<
ApplyBuiltinPDLPatternsPass> {
@@ -232,17 +241,21 @@
}
if (enableTensorUKernels) {
for (std::string target : targets) {
- std::string builtinName =
- llvm::formatv("ukernel_patterns_{}.mlir", target);
- std::optional<StringRef> maybeBuiltin =
- rocmDialect->getBuiltin(builtinName);
- if (!maybeBuiltin) {
- // Skip when no patterns are present.
- continue;
- }
- if (failed(populatePDLModuleFromBuiltin(context, tmpPatterns,
- maybeBuiltin.value()))) {
- return failure();
+ SmallVector<StringRef> allBuiltinNames = rocmDialect->getBuiltinNames();
+ SmallVector<StringRef> builtinNames =
+ filterUkernelPatternsByTarget(allBuiltinNames, target);
+ std::string builtinSrc;
+ for (StringRef builtinName : builtinNames) {
+ std::optional<StringRef> maybeBuiltin =
+ rocmDialect->getBuiltin(builtinName);
+ if (!maybeBuiltin) {
+ // Skip when no patterns are present.
+ continue;
+ }
+ if (failed(populatePDLModuleFromBuiltin(context, tmpPatterns,
+ maybeBuiltin.value()))) {
+ return failure();
+ }
}
}
}
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
index 74a2b84..8bfe040 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/BUILD.bazel
@@ -12,16 +12,14 @@
licenses = ["notice"], # Apache 2.0
)
+ukernel_pdl_patterns_files = [
+ "apply_builtin_ukernel_pdl_patterns.mlir",
+ "apply_builtin_ukernel_pdl_patterns_driver.mlir",
+]
+
iree_lit_test_suite(
name = "lit",
- srcs = enforce_glob(
- [
- "apply_builtin_pdl_patterns.mlir",
- "apply_builtin_ukernel_pdl_patterns.mlir",
- "apply_builtin_ukernel_pdl_patterns_driver.mlir",
- ],
- include = ["*.mlir"],
- ),
+ srcs = ukernel_pdl_patterns_files,
cfg = "//compiler:lit.cfg.py",
tools = [
"//tools:iree-opt",
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
index 45224bc..7e87ce7 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/CMakeLists.txt
@@ -14,7 +14,6 @@
NAME
lit
SRCS
- "apply_builtin_pdl_patterns.mlir"
"apply_builtin_ukernel_pdl_patterns.mlir"
"apply_builtin_ukernel_pdl_patterns_driver.mlir"
TOOLS
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
index 6ddcba2..47794f8 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir
@@ -1,10 +1,12 @@
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-rocm-apply-builtin-pdl-patterns{targets=gfx942 enable-tensor-ukernels=true}))' \
-// RUN: --mlir-print-local-scope --split-input-file %s | FileCheck %s
+// RUN: --mlir-print-local-scope --split-input-file %s | FileCheck %s --check-prefix=GFX942
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-rocm-apply-builtin-pdl-patterns{targets=gfx950 enable-tensor-ukernels=true}))' \
+// RUN: --mlir-print-local-scope --split-input-file %s | FileCheck %s --check-prefix=GFX950
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @matmul_f8(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @matmul_f8_gfx942(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<1x128x1024xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -18,19 +20,22 @@
} -> tensor<1x128x1024xf32>
return %2 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @matmul_f8
-// CHECK: linalg.generic
-// CHECK-SAME: compilation_info = #iree_codegen.compilation_info
-// CHECK-SAME: lowering_config =
-// CHECK-SAME: translation_info =
-// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
-
+// GFX942-LABEL: @matmul_f8_gfx942
+// GFX942: linalg.generic
+// GFX942-SAME: compilation_info = #iree_codegen.compilation_info
+// GFX942-SAME: lowering_config =
+// GFX942-SAME: translation_info =
+// GFX942-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @matmul_f8_gfx942
+// GFX950: linalg.generic
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// -----
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8(%arg0: tensor<1x128x256xf8E4M3FNUZ>, %arg1: tensor<1024x256xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_gfx942(%arg0: tensor<1x128x256xf8E4M3FNUZ>, %arg1: tensor<1024x256xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<1x128x1024xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -44,16 +49,19 @@
} -> tensor<1x128x1024xf32>
return %2 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f8
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_gfx942
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_gfx942
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// -----
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_medium_no_zero_fill(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_medium_no_zero_fill_gfx942(%arg0: tensor<1x128x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x128x1024xf32> {
%cst = arith.constant 1.000000e+00 : f32
%0 = tensor.empty() : tensor<1x128x1024xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
@@ -67,9 +75,12 @@
} -> tensor<1x128x1024xf32>
return %2 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f8_medium_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx942
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx942
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -78,7 +89,7 @@
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @matmul_f8_dynamic(%arg0: index) -> tensor<1x128x1024xf32> {
+func.func @matmul_f8_dynamic_gfx942(%arg0: index) -> tensor<1x128x1024xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = util.assume.int %arg0<umin = 512, udiv = 128> : index
%1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FNUZ>
@@ -95,12 +106,16 @@
} -> tensor<1x128x1024xf32>
return %5 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @matmul_f8_dynamic
-// CHECK: linalg.generic
-// CHECK-SAME: compilation_info = #iree_codegen.compilation_info
-// CHECK-SAME: lowering_config =
-// CHECK-SAME: translation_info =
-// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_dynamic_gfx942
+// GFX942: linalg.generic
+// GFX942-SAME: compilation_info = #iree_codegen.compilation_info
+// GFX942-SAME: lowering_config =
+// GFX942-SAME: translation_info =
+// GFX942-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @matmul_f8_dynamic_gfx942
+// GFX950: linalg.generic
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// -----
@@ -109,7 +124,7 @@
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_dynamic_multiple_of(%arg0: tensor<1024x512xf8E4M3FNUZ>, %arg1: index) -> tensor<1x?x1024xf32> {
+func.func @negative_matmul_f8_dynamic_multiple_of_gfx942(%arg0: tensor<1024x512xf8E4M3FNUZ>, %arg1: index) -> tensor<1x?x1024xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = util.assume.int %arg1<udiv = 64> : index
%1 = tensor.empty(%0) : tensor<1x?x512xf8E4M3FNUZ>
@@ -125,9 +140,12 @@
} -> tensor<1x?x1024xf32>
return %4 : tensor<1x?x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f8_dynamic_multiple_of
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx942
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx942
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// -----
@@ -136,7 +154,7 @@
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_dynamic_lower_bound(%arg0: index) -> tensor<1x128x1024xf32> {
+func.func @negative_matmul_f8_dynamic_lower_bound_gfx942(%arg0: index) -> tensor<1x128x1024xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = util.assume.int %arg0<umin = 256, udiv = 128> : index
%1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FNUZ>
@@ -153,16 +171,19 @@
} -> tensor<1x128x1024xf32>
return %5 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f8_dynamic_lower_bound
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx942
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx942
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// -----
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_matmul_f8_large_no_zero_fill(%arg0: tensor<1x256x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x256x1024xf32> {
+func.func @negative_matmul_f8_large_no_zero_fill_gfx942(%arg0: tensor<1x256x4096xf8E4M3FNUZ>, %arg1: tensor<1024x4096xf8E4M3FNUZ>) -> tensor<1x256x1024xf32> {
%cst = arith.constant 1.000000e+00 : f32
%0 = tensor.empty() : tensor<1x256x1024xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32>
@@ -176,11 +197,212 @@
} -> tensor<1x256x1024xf32>
return %2 : tensor<1x256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f8_large_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx942
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx942
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @matmul_f8_gfx950(%arg0: tensor<1x128x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = tensor.empty() : tensor<1x128x1024xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+ %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+ %12 = arith.extf %in : f8E4M3FN to f32
+ %13 = arith.extf %in_4 : f8E4M3FN to f32
+ %14 = arith.mulf %12, %13 : f32
+ %15 = arith.addf %out, %14 : f32
+ linalg.yield %15 : f32
+ } -> tensor<1x128x1024xf32>
+ return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @matmul_f8_gfx950
+// GFX950: linalg.generic
+// GFX950-SAME: compilation_info = #iree_codegen.compilation_info
+// GFX950-SAME: lowering_config =
+// GFX950-SAME: translation_info =
+// GFX950-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_gfx950
+// GFX942: linalg.generic
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_gfx950(%arg0: tensor<1x128x256xf8E4M3FN>, %arg1: tensor<1024x256xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = tensor.empty() : tensor<1x128x1024xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+ %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x256xf8E4M3FN>, tensor<1024x256xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+ %12 = arith.extf %in : f8E4M3FN to f32
+ %13 = arith.extf %in_4 : f8E4M3FN to f32
+ %14 = arith.mulf %12, %13 : f32
+ %15 = arith.addf %out, %14 : f32
+ linalg.yield %15 : f32
+ } -> tensor<1x128x1024xf32>
+ return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_gfx950
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_gfx950
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_medium_no_zero_fill_gfx950(%arg0: tensor<1x128x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x128x1024xf32> {
+ %cst = arith.constant 1.000000e+00 : f32
+ %0 = tensor.empty() : tensor<1x128x1024xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+ %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x128x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x128x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+ %12 = arith.extf %in : f8E4M3FN to f32
+ %13 = arith.extf %in_4 : f8E4M3FN to f32
+ %14 = arith.mulf %12, %13 : f32
+ %15 = arith.addf %out, %14 : f32
+ linalg.yield %15 : f32
+ } -> tensor<1x128x1024xf32>
+ return %2 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx950
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_medium_no_zero_fill_gfx950
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// -----
+
+// Through a constraint, the inner dimension is known to be a multiple of 128 and has a lower bound of 512, so should be matched.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @matmul_f8_dynamic_gfx950(%arg0: index) -> tensor<1x128x1024xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = util.assume.int %arg0<umin = 512, udiv = 128> : index
+ %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FN>
+ %2 = tensor.empty(%0) : tensor<1024x?xf8E4M3FN>
+ %3 = tensor.empty() : tensor<1x128x1024xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+ %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<1x128x?xf8E4M3FN>, tensor<1024x?xf8E4M3FN>) outs(%4 : tensor<1x128x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+ %6 = arith.extf %in : f8E4M3FN to f32
+ %7 = arith.extf %in_0 : f8E4M3FN to f32
+ %8 = arith.mulf %6, %7 : f32
+ %9 = arith.addf %out, %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x128x1024xf32>
+ return %5 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @matmul_f8_dynamic_gfx950
+// GFX950: linalg.generic
+// GFX950-SAME: compilation_info = #iree_codegen.compilation_info
+// GFX950-SAME: lowering_config =
+// GFX950-SAME: translation_info =
+// GFX950-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @matmul_f8_dynamic_gfx950
+// GFX942: linalg.generic
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+// The dynamic dimension is not a multiple of 128.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_dynamic_multiple_of_gfx950(%arg0: tensor<1024x512xf8E4M3FN>, %arg1: index) -> tensor<1x?x1024xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = util.assume.int %arg1<udiv = 64> : index
+ %1 = tensor.empty(%0) : tensor<1x?x512xf8E4M3FN>
+ %2 = tensor.empty(%0) : tensor<1x?x1024xf32>
+ %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1x?x1024xf32>) -> tensor<1x?x1024xf32>
+ %4 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %arg0 : tensor<1x?x512xf8E4M3FN>, tensor<1024x512xf8E4M3FN>) outs(%3 : tensor<1x?x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+ %5 = arith.extf %in : f8E4M3FN to f32
+ %6 = arith.extf %in_0 : f8E4M3FN to f32
+ %7 = arith.mulf %5, %6 : f32
+ %8 = arith.addf %out, %7 : f32
+ linalg.yield %8 : f32
+ } -> tensor<1x?x1024xf32>
+ return %4 : tensor<1x?x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx950
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_multiple_of_gfx950
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+// The dynamic dimension is a multiple of 128, but doesn't have a lower bound of 512.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_dynamic_lower_bound_gfx950(%arg0: index) -> tensor<1x128x1024xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = util.assume.int %arg0<umin = 256, udiv = 128> : index
+ %1 = tensor.empty(%0) : tensor<1x128x?xf8E4M3FN>
+ %2 = tensor.empty(%0) : tensor<1024x?xf8E4M3FN>
+ %3 = tensor.empty() : tensor<1x128x1024xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32>
+ %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<1x128x?xf8E4M3FN>, tensor<1024x?xf8E4M3FN>) outs(%4 : tensor<1x128x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_0: f8E4M3FN, %out: f32):
+ %6 = arith.extf %in : f8E4M3FN to f32
+ %7 = arith.extf %in_0 : f8E4M3FN to f32
+ %8 = arith.mulf %6, %7 : f32
+ %9 = arith.addf %out, %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x128x1024xf32>
+ return %5 : tensor<1x128x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx950
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_f8_dynamic_lower_bound_gfx950
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+// -----
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_f8_large_no_zero_fill_gfx950(%arg0: tensor<1x256x4096xf8E4M3FN>, %arg1: tensor<1024x4096xf8E4M3FN>) -> tensor<1x256x1024xf32> {
+ %cst = arith.constant 1.000000e+00 : f32
+ %0 = tensor.empty() : tensor<1x256x1024xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32>
+ %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<1x256x4096xf8E4M3FN>, tensor<1024x4096xf8E4M3FN>) outs(%1 : tensor<1x256x1024xf32>) {
+ ^bb0(%in: f8E4M3FN, %in_4: f8E4M3FN, %out: f32):
+ %12 = arith.extf %in : f8E4M3FN to f32
+ %13 = arith.extf %in_4 : f8E4M3FN to f32
+ %14 = arith.mulf %12, %13 : f32
+ %15 = arith.addf %out, %14 : f32
+ linalg.yield %15 : f32
+ } -> tensor<1x256x1024xf32>
+ return %2 : tensor<1x256x1024xf32>
+}
+// GFX950-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx950
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f8_large_no_zero_fill_gfx950
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// -----
+
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
@@ -199,9 +421,12 @@
} -> tensor<256x1024xf32>
return %2 : tensor<256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f16
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -222,9 +447,12 @@
} -> tensor<1024x1024xf32>
return %2 : tensor<1024x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f16_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -245,9 +473,12 @@
} -> tensor<1x128x1024xf32>
return %2 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_medium_expanded_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -268,9 +499,12 @@
} -> tensor<1x256x1024xf32>
return %2 : tensor<1x256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_f16_large_expanded_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -291,9 +525,12 @@
} -> tensor<256x1024xf32>
return %2 : tensor<256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_bf16
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -314,9 +551,12 @@
} -> tensor<1024x1024xf32>
return %2 : tensor<1024x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_bf16_large_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_large_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_large_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -342,9 +582,12 @@
} -> tensor<1x256x1024xf32>
return %5 : tensor<1x256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_bf16_dynamic_lower_bound
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+// GFX942-LABEL: @negative_matmul_bf16_dynamic_lower_bound
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+// GFX950-LABEL: @negative_matmul_bf16_dynamic_lower_bound
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
// -----
@@ -365,9 +608,12 @@
} -> tensor<1x128x1024xf32>
return %2 : tensor<1x128x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_expanded_medium_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
// -----
@@ -388,6 +634,9 @@
} -> tensor<1x256x1024xf32>
return %2 : tensor<1x256x1024xf32>
}
-// CHECK-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
-// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
-// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX942-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
+// GFX942-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX942-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+// GFX950-LABEL: @negative_matmul_bf16_expanded_large_no_zero_fill
+// GFX950-NOT: compilation_info = #iree_codegen.compilation_info
+// GFX950-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
index 30a8340..e3197ef 100644
--- a/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
+++ b/compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns_driver.mlir
@@ -47,7 +47,7 @@
return %2 : tensor<1x128x1024xf32>
}
}
-// CHECK-LABEL: util.func private @pingpong_medium_f8_expanded
+// CHECK-LABEL: util.func private @pingpong_medium_f8E4M3FNUZ_expanded
// CHECK: iree_codegen.inner_tiled
// -----
@@ -88,8 +88,8 @@
// CHECK-SAME: compilation_info = #iree_codegen.compilation_info
// CHECK-SAME: lowering_config =
// CHECK-SAME: translation_info =
-// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f8_expanded", tensor>
-// CHECK-LABEL: util.func private @pingpong_large_f8_expanded
+// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>
+// CHECK-LABEL: util.func private @pingpong_large_f8E4M3FNUZ_expanded
// CHECK: iree_codegen.inner_tiled
// -----
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
index a5c4e18..43e205a 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/BUILD.bazel
@@ -23,15 +23,7 @@
inline = True,
)
-# Target archs for specialization patternsets. https://llvm.org/docs/AMDGPUUsage.html#processors
-gpu_archs = [
- "gfx942",
-]
-
-ukernel_patterns_mlir_files = [
- "ukernel_patterns_%s.mlir" % gpu_arch
- for gpu_arch in gpu_archs
-]
+ukernel_patterns_mlir_files = glob(["ukernel_patterns_*.mlir"])
iree_c_embed_data(
name = "iree_mlir_ukernel_patterns_amdgpu",
@@ -57,7 +49,8 @@
srcs = [
"iree_uk_amdgpu_matmul_bf16.mlir",
"iree_uk_amdgpu_matmul_f16.mlir",
- "iree_uk_amdgpu_matmul_f8.mlir",
+ "iree_uk_amdgpu_matmul_f8E4M3FN.mlir",
+ "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir",
],
c_file_output = "iree_mlir_ukernels_amdgpu.c",
flatten = True,
@@ -71,7 +64,8 @@
srcs = [
"iree_uk_amdgpu_matmul_bf16.mlir",
"iree_uk_amdgpu_matmul_f16.mlir",
- "iree_uk_amdgpu_matmul_f8.mlir",
+ "iree_uk_amdgpu_matmul_f8E4M3FN.mlir",
+ "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir",
],
cfg = "//compiler:lit.cfg.py",
tools = [
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
index 73815c9..057162f 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/CMakeLists.txt
@@ -14,11 +14,12 @@
return()
endif()
+file(GLOB _GLOB_UKERNEL_PATTERNS_X_MLIR LIST_DIRECTORIES false RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS ukernel_patterns_*.mlir)
iree_c_embed_data(
NAME
iree_mlir_ukernel_patterns_amdgpu
SRCS
- "ukernel_patterns_gfx942.mlir"
+ "${_GLOB_UKERNEL_PATTERNS_X_MLIR}"
C_FILE_OUTPUT
"iree_mlir_ukernel_patterns_amdgpu.c"
H_FILE_OUTPUT
@@ -31,7 +32,7 @@
NAME
verify_mlir_ukernel_patterns_amdgpu
SRCS
- "ukernel_patterns_gfx942.mlir"
+ "${_GLOB_UKERNEL_PATTERNS_X_MLIR}"
TOOLS
iree-opt
)
@@ -42,7 +43,8 @@
SRCS
"iree_uk_amdgpu_matmul_bf16.mlir"
"iree_uk_amdgpu_matmul_f16.mlir"
- "iree_uk_amdgpu_matmul_f8.mlir"
+ "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+ "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
C_FILE_OUTPUT
"iree_mlir_ukernels_amdgpu.c"
H_FILE_OUTPUT
@@ -57,7 +59,8 @@
SRCS
"iree_uk_amdgpu_matmul_bf16.mlir"
"iree_uk_amdgpu_matmul_f16.mlir"
- "iree_uk_amdgpu_matmul_f8.mlir"
+ "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+ "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
TOOLS
iree-opt
)
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
similarity index 73%
copy from compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
copy to compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
index d808b97..1f4bcdf 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FN.mlir
@@ -1,18 +1,18 @@
// RUN: iree-opt %s
-!in_ty_f8 = tensor<256x?xf8E4M3FNUZ>
-!exp_in_ty_f8 = tensor<1x256x?xf8E4M3FNUZ>
-!block_in_f8 = tensor<256x128xf8E4M3FNUZ>
-!exp_block_in_f8 = tensor<1x256x128xf8E4M3FNUZ>
-!flat_shared_f8 = memref<32768xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!shared_f8 = memref<256x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!shared_exp_f8 = memref<16x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!in_ty_f8 = tensor<256x?xf8E4M3FN>
+!exp_in_ty_f8 = tensor<1x256x?xf8E4M3FN>
+!block_in_f8 = tensor<256x128xf8E4M3FN>
+!exp_block_in_f8 = tensor<1x256x128xf8E4M3FN>
+!flat_shared_f8 = memref<32768xf8E4M3FN, #gpu.address_space<workgroup>>
+!shared_f8 = memref<256x128xf8E4M3FN, #gpu.address_space<workgroup>>
+!shared_exp_f8 = memref<16x16x4x32xf8E4M3FN, #gpu.address_space<workgroup>>
-!mexp_in_ty_f8 = tensor<1x128x?xf8E4M3FNUZ>
-!mexp_block_in_f8 = tensor<1x128x128xf8E4M3FNUZ>
-!mflat_shared_f8 = memref<16384xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
-!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!mexp_in_ty_f8 = tensor<1x128x?xf8E4M3FN>
+!mexp_block_in_f8 = tensor<1x128x128xf8E4M3FN>
+!mflat_shared_f8 = memref<16384xf8E4M3FN, #gpu.address_space<workgroup>>
+!mshared_f8 = memref<128x128xf8E4M3FN, #gpu.address_space<workgroup>>
+!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FN, #gpu.address_space<workgroup>>
#contraction_accesses = [
affine_map<(i, j, k) -> (i, k)>,
@@ -20,7 +20,7 @@
affine_map<(i, j, k) -> (i, j)>
]
-util.func @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
+util.func @pingpong_medium_f8E4M3FN_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
@@ -32,7 +32,7 @@
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
- %cst = arith.constant 0.0 : f8E4M3FNUZ
+ %cst = arith.constant 0.0 : f8E4M3FN
%lhs_shared_base = memref.alloc() : !mflat_shared_f8
%rhs_shared_base = memref.alloc() : !flat_shared_f8
@@ -52,16 +52,16 @@
scf.forall (%id) in (1024) {
%delin:2 = affine.delinearize_index %id into (128, 8) : index, index
%vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
- %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
+ %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.forall (%id) in (2048) {
%delin:2 = affine.delinearize_index %id into (256, 8) : index, index
%vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
- %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
} {mapping = [#gpu.thread<linear_dim_0>]}
%lhs_shared_expand = memref.expand_shape %lhs_shared [[0, 1], [2, 3]] output_shape [8, 16, 4, 32] : !mshared_f8 into !mshared_exp_f8
@@ -100,39 +100,39 @@
}
%3 = scf.for %i = %c128 to %dim step %c128 iter_args(%iter = %2) -> vector<4x4x1x4xf32> {
- %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+ %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+ %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
rocdl.sched.barrier 0
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
- %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+ %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
rocdl.sched.barrier 0
- %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+ %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+ %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
rocdl.sched.barrier 0
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 128] [1, 1, 1] : !mexp_in_ty_f8 to !mexp_block_in_f8
- %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+ %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
gpu.barrier
rocdl.sched.barrier 0
@@ -141,20 +141,20 @@
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%iter) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
- vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
- vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
- vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
+ vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
+ vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !mshared_f8
gpu.barrier
rocdl.sched.barrier 0
@@ -163,8 +163,8 @@
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
@@ -177,27 +177,27 @@
}
// Epilogue
- %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+ %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+ %rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0_t, %rhs_vec_0_t) outs(%3) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
- %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
- %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
+ %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FN>
+ %lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
+ %rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FN> to vector<4x2x1x8xf8E4M3FN>
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2_t, %rhs_vec_2_t) outs(%dot0) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<4x2x1x8xf8E4M3FN>, vector<4x2x1x8xf8E4M3FN> into vector<4x4x1x4xf32>
%tp = vector.transpose %dot2, [0, 2, 1, 3] : vector<4x4x1x4xf32> to vector<4x1x4x4xf32>
%empty = tensor.empty() : tensor<1x4x1x4x4xf32>
@@ -210,7 +210,7 @@
util.return %collapse : tensor<1x128x256xf32>
}
-util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
+util.func private @pingpong_large_f8E4M3FN_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
@@ -222,7 +222,7 @@
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
- %cst = arith.constant 0.0 : f8E4M3FNUZ
+ %cst = arith.constant 0.0 : f8E4M3FN
%lhs_shared_base = memref.alloc() : !flat_shared_f8
%rhs_shared_base = memref.alloc() : !flat_shared_f8
@@ -242,16 +242,16 @@
scf.forall (%id) in (2048) {
%delin:2 = affine.delinearize_index %id into (256, 8) : index, index
%vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
- %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ %lhs_thread_local = tensor.extract_slice %lhs_init [0, %delin#0, %vec] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local = vector.transfer_read %lhs_thread_local [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ vector.transfer_write %lhs_vec_local, %lhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.forall (%id) in (2048) {
%delin:2 = affine.delinearize_index %id into (256, 8) : index, index
%vec = arith.muli %delin#1, %c16 overflow<nsw, nuw> : index
- %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ %rhs_thread_local = tensor.extract_slice %rhs_init [%delin#0, %vec] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local = vector.transfer_read %rhs_thread_local [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ vector.transfer_write %rhs_vec_local, %rhs_shared[%delin#0, %vec] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
} {mapping = [#gpu.thread<linear_dim_0>]}
%lhs_shared_expand = memref.expand_shape %lhs_shared [[0, 1], [2, 3]] output_shape [16, 16, 4, 32] : !shared_f8 into !shared_exp_f8
@@ -288,17 +288,17 @@
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 256, 128] [1, 1, 1] : !exp_in_ty_f8 to !exp_block_in_f8
- %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %lhs_thread_2 = tensor.extract_slice %lhs_block [0, %glb2, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_2 = vector.transfer_read %lhs_thread_2 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %lhs_thread_3 = tensor.extract_slice %lhs_block [0, %glb3, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
- %lhs_vec_local_3 = vector.transfer_read %lhs_thread_3 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+ %lhs_thread_0 = tensor.extract_slice %lhs_block [0, %glb0, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_0 = vector.transfer_read %lhs_thread_0 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %lhs_thread_2 = tensor.extract_slice %lhs_block [0, %glb2, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_2 = vector.transfer_read %lhs_thread_2 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %lhs_thread_3 = tensor.extract_slice %lhs_block [0, %glb3, %gko] [1, 1, 16] [1, 1, 1] : !exp_block_in_f8 to tensor<1x1x16xf8E4M3FN>
+ %lhs_vec_local_3 = vector.transfer_read %lhs_thread_3 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
- %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
gpu.barrier
rocdl.sched.barrier 0
@@ -307,8 +307,8 @@
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0, %rhs_vec_0) outs(%iter) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
@@ -316,17 +316,17 @@
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
- %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
- %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
+ %rhs_thread_0 = tensor.extract_slice %rhs_block [%glb0, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_0 = vector.transfer_read %rhs_thread_0 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_1 = tensor.extract_slice %rhs_block [%glb1, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_1 = vector.transfer_read %rhs_thread_1 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_2 = tensor.extract_slice %rhs_block [%glb2, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_2 = vector.transfer_read %rhs_thread_2 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
+ %rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FN>
+ %rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FN>, vector<1x16xf8E4M3FN>
- %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
gpu.barrier
rocdl.sched.barrier 0
@@ -335,18 +335,18 @@
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1, %rhs_vec_1) outs(%dot0) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
- %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
- %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
gpu.barrier
rocdl.sched.barrier 0
@@ -355,22 +355,22 @@
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2, %rhs_vec_2) outs(%dot1) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
rocdl.sched.barrier 0
- vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %lhs_vec_local_2, %lhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %lhs_vec_local_3, %lhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %lhs_vec_local_2, %lhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %lhs_vec_local_3, %lhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
- vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
+ vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
+ vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FN>, !shared_f8
gpu.barrier
rocdl.sched.barrier 0
@@ -379,8 +379,8 @@
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3, %rhs_vec_3) outs(%dot2) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
gpu.barrier
@@ -393,34 +393,34 @@
}
// Epilogue
- %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ %lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
%dot0 = iree_codegen.inner_tiled ins(%lhs_vec_0, %rhs_vec_0) outs(%3) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
- %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+ %lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
%dot1 = iree_codegen.inner_tiled ins(%lhs_vec_1, %rhs_vec_1) outs(%dot0) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
- %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+ %lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
%dot2 = iree_codegen.inner_tiled ins(%lhs_vec_2, %rhs_vec_2) outs(%dot1) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
- %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
- %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
+ %lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FN>
+ %rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FN>
%dot3 = iree_codegen.inner_tiled ins(%lhs_vec_3, %rhs_vec_3) outs(%dot2) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
- kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ, col_major = true>
- } : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
+ kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FN, col_major = true>
+ } : vector<8x1x1x8xf8E4M3FN>, vector<4x1x1x8xf8E4M3FN> into vector<8x4x1x4xf32>
%tp = vector.transpose %dot3, [0, 2, 1, 3] : vector<8x4x1x4xf32> to vector<8x1x4x4xf32>
%empty = tensor.empty() : tensor<1x8x1x4x4xf32>
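The hunks above retarget the FP8 ping-pong kernels to the OCP f8E4M3FN element type and the matching MFMA_F32_16x16x32_F8E4M3FN intrinsic, which is the FP8 flavor gfx950 exposes; the original f8E4M3FNUZ kernels stay behind for gfx942 via the rename below. For orientation, the kind of source-level contraction these expanded FP8 kernels are meant to intercept looks roughly like the sketch below. It is illustrative only: the function name and the 1x2048x2048x512 problem size are invented to satisfy the multiple-of-256/64 and minimum-size constraints the annotation patterns check, and the gfx950 pattern file (not shown in this hunk) is assumed to match [f8E4M3FN, f8E4M3FN, f32] element types analogously to the FNUZ patterns.

func.func @example_f8_expanded_matmul(%lhs: tensor<1x2048x512xf8E4M3FN>,
                                      %rhs: tensor<2048x512xf8E4M3FN>) -> tensor<1x2048x2048xf32> {
  // Zero-filled f32 accumulator, matching the linalg.fill the patterns look for.
  %zero = arith.constant 0.0 : f32
  %init = tensor.empty() : tensor<1x2048x2048xf32>
  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<1x2048x2048xf32>) -> tensor<1x2048x2048xf32>
  // Expanded matmul-like generic: lhs is BxMxK, rhs is NxK, result is BxMxN.
  %result = linalg.generic {
      indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
                       affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
                       affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
      iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
      ins(%lhs, %rhs : tensor<1x2048x512xf8E4M3FN>, tensor<2048x512xf8E4M3FN>)
      outs(%fill : tensor<1x2048x2048xf32>) {
  ^bb0(%a: f8E4M3FN, %b: f8E4M3FN, %acc: f32):
    %ae = arith.extf %a : f8E4M3FN to f32
    %be = arith.extf %b : f8E4M3FN to f32
    %mul = arith.mulf %ae, %be : f32
    %sum = arith.addf %acc, %mul : f32
    linalg.yield %sum : f32
  } -> tensor<1x2048x2048xf32>
  return %result : tensor<1x2048x2048xf32>
}

On gfx942 the same contraction would instead carry f8E4M3FNUZ operands and be routed to the renamed FNUZ kernels.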
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
similarity index 98%
rename from compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
rename to compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
index d808b97..c383d6e 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir
@@ -20,7 +20,7 @@
affine_map<(i, j, k) -> (i, j)>
]
-util.func @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
+util.func @pingpong_medium_f8E4M3FNUZ_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x128x256xf32>) -> tensor<1x128x256xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
@@ -210,7 +210,7 @@
util.return %collapse : tensor<1x128x256xf32>
}
-util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
+util.func private @pingpong_large_f8E4M3FNUZ_expanded(%lhs_base: !exp_in_ty_f8, %rhs_base: !in_ty_f8, %unused_acc: tensor<1x256x256xf32>) -> tensor<1x256x256xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
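The rename above makes the FP8 flavor explicit in both the file name and the util.func symbols, so the gfx942 FNUZ kernels cannot be confused with the new f8E4M3FN ones. The pattern file updated below has to follow suit, because each pattern's ukernel descriptor string must name a util.func inside the builtin module that its rocm.builtin_name annotation resolves to. Stripped of all matching constraints, the wiring each pattern establishes is the pair of annotations in this sketch (hypothetical pattern name, attribute names taken from the file below):

pdl.pattern @example_descriptor_wiring : benefit(1) {
  %generic_op = pdl.operation "linalg.generic"
  pdl.rewrite %generic_op {
    // The descriptor string names a util.func inside the builtin module...
    %attr_name = pdl.attribute = "iree_codegen.ukernel"
    %descriptor = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>
    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %descriptor : !pdl.operation, !pdl.attribute, !pdl.attribute)
    // ...that this attribute points at, so the two renames must move together.
    %builtin_attr = pdl.attribute = "rocm.builtin_name"
    %builtin_name = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
    pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_name : !pdl.operation, !pdl.attribute, !pdl.attribute)
  }
}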
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
index ce24d04..9a3afdc 100644
--- a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942.mlir
@@ -1,10 +1,10 @@
// RUN: iree-opt -allow-unregistered-dialect %s
-// F8 Patterns
+// f8E4M3FNUZ Patterns
// This pattern matches a medium-sized expanded matmul-like operation and
// annotates it with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f8_medium_expanded : benefit(1) {
+pdl.pattern @annotate_matmul_like_f8E4M3FNUZ_medium_expanded : benefit(1) {
%elemtypes = pdl.attribute = [f8E4M3FNUZ, f8E4M3FNUZ, f32]
%imaps = pdl.attribute = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
@@ -58,7 +58,7 @@
// Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
// This modifies the operation in-place.
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
%config_name = pdl.attribute = "compilation_info"
@@ -82,7 +82,7 @@
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
%builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8.mlir"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
}
}
@@ -90,7 +90,7 @@
// This pattern matches a large expanded f8 matmul-like operation and annotates it
// with ukernel descriptor and configuration attributes. This is preferred over the
// medium-sized ukernel.
-pdl.pattern @annotate_matmul_like_f8_large_expanded : benefit(2) {
+pdl.pattern @annotate_matmul_like_f8E4M3FNUZ_large_expanded : benefit(2) {
%elemtypes = pdl.attribute = [f8E4M3FNUZ, f8E4M3FNUZ, f32]
%imaps = pdl.attribute = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
@@ -150,7 +150,7 @@
// Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
// This modifies the operation in-place.
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8_expanded", tensor>
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FNUZ_expanded", tensor>
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
%config_name = pdl.attribute = "compilation_info"
@@ -174,529 +174,7 @@
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
%builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// F16 Patterns
-
-// This pattern matches a large f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f16_large : benefit(1) {
- %elemtypes = pdl.attribute = [f16, f16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 256 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c1024 = pdl.attribute = 1024
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// This pattern matches a medium-sized f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_f16_medium_expanded : benefit(1) {
- %elemtypes = pdl.attribute = [f16, f16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 128 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c128 = pdl.attribute = 128
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c1024 = pdl.attribute = 1024
-
- // TODO: Kernel specialization is needed to apply this strategy selectively at
- // runtime. Additionally model exports don't specify lower bounds so it is
- // impossible to use this strategy with this check.
- // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f16_expanded", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// This pattern matches a medium-sized f16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes. This is preferred over the
-// medium-sized ukernel.
-pdl.pattern @annotate_matmul_like_f16_large_expanded : benefit(2) {
- %elemtypes = pdl.attribute = [f16, f16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 256 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c1024 = pdl.attribute = 1024
-
- // TODO: Kernel specialization is needed to apply this strategy selectively at
- // runtime. Additionally model exports don't specify lower bounds so it is
- // impossible to use this strategy with this check.
- // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16_expanded", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// BF16 Patterns
-
-// This pattern matches a bf16 matmul-like operation and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_bf16_large : benefit(1) {
- %elemtypes = pdl.attribute = [bf16, bf16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 256 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c1024 = pdl.attribute = 1024
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// This pattern matches an expanded bf16 matmul-like operation of medium size and annotates it
-// with ukernel descriptor and configuration attributes.
-pdl.pattern @annotate_matmul_like_bf16_medium_expanded : benefit(1) {
- %elemtypes = pdl.attribute = [bf16, bf16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 128 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c128 = pdl.attribute = 128
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c4 = pdl.attribute = 4
- %c512 = pdl.attribute = 512
- %c1024 = pdl.attribute = 1024
-
- // TODO: Kernel specialization is needed to apply this strategy selectively at
- // runtime. Additionally model exports don't specify lower bounds so it is
- // impossible to use this strategy with this check.
- // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_bf16_expanded", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
- }
-}
-
-// This pattern matches an expanded bf16 matmul-like operation of large size and annotates it
-// with ukernel descriptor and configuration attributes. This is preferred over the medium
-// strategy.
-pdl.pattern @annotate_matmul_like_bf16_large_expanded : benefit(2) {
- %elemtypes = pdl.attribute = [bf16, bf16, f32]
- %imaps = pdl.attribute = [
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
- ]
-
- %lhs_type = pdl.type
- %rhs_type = pdl.type
- %out_type = pdl.type
- %zero_type = pdl.type : f32
-
- %lhs = pdl.operand : %lhs_type
- %rhs = pdl.operand : %rhs_type
- %out_init = pdl.operand : %out_type
-
- %zero_val = pdl.attribute = 0. : f32
- %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
- %zero = pdl.result 0 of %zero_op
- %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- %fill = pdl.result 0 of %fill_op
-
- // Match the a matmul-like generic with above indexing maps.
- %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
- pdl.apply_native_constraint "matchContraction"(
- %generic_op, %elemtypes, %imaps
- : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %attr_name = pdl.attribute = "iree_codegen.ukernel"
- pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
-
- // M % 256 == 0, K % 64 == 0, N % 256 == 0
- %empty = pdl.attribute = {}
- %c0 = pdl.attribute = 0
- %c1 = pdl.attribute = 1
- %c2 = pdl.attribute = 2
- %c64 = pdl.attribute = 64
- %c256 = pdl.attribute = 256
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
-
- // M, N >= 1024, K >= 256
- %c1024 = pdl.attribute = 1024
- pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
- pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
-
- pdl.rewrite {
- // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
- // This modifies the operation in-place.
-
- %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %config_name = pdl.attribute = "compilation_info"
- %config = pdl.attribute = #iree_codegen.compilation_info<
- lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
- translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
- workgroup_size = [512, 1, 1] subgroup_size = 64,
- // This strategy uses the maximum amount of possible shared memory on
- // all gfx942 architectures so shared memory padding to reduce bank
- // conflicts must be disabled. Also prefetching is done manually in the
- // above and is disabled here as well.
- {gpu_pipeline_options =
- #iree_gpu.pipeline_options<
- prefetch_shared_memory = false,
- no_reduce_shared_memory_bank_conflicts = true>,
- // This strategy requires 2 waves per SIMD.
- llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
- >
- pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
-
- %builtin_attr = pdl.attribute = "rocm.builtin_name"
- %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FNUZ.mlir"
pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
}
}
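The f16 and bf16 patterns deleted from the gfx942-only file reappear essentially unchanged in the shared gfx942/gfx950 file added below (the comments now say gfx9 rather than gfx942), so both targets pick up the same annotations. As a rough illustration of what @annotate_matmul_like_f16_large is looking for, the sketch below shows a contraction that satisfies its element-type, indexing-map, and shape constraints; the function name and the 2048x2048x512 problem size are invented for this example and do not come from the PR.

func.func @example_f16_matmul(%lhs: tensor<2048x512xf16>,
                              %rhs: tensor<2048x512xf16>) -> tensor<2048x2048xf32> {
  // Zero-filled f32 accumulator, matching the linalg.fill the pattern requires.
  %zero = arith.constant 0.0 : f32
  %init = tensor.empty() : tensor<2048x2048xf32>
  %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<2048x2048xf32>) -> tensor<2048x2048xf32>
  // Matmul-like generic with the (d0, d2) / (d1, d2) / (d0, d1) indexing maps:
  // lhs is MxK, rhs is NxK, result is MxN.
  %result = linalg.generic {
      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                       affine_map<(d0, d1, d2) -> (d1, d2)>,
                       affine_map<(d0, d1, d2) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel", "reduction"]}
      ins(%lhs, %rhs : tensor<2048x512xf16>, tensor<2048x512xf16>)
      outs(%fill : tensor<2048x2048xf32>) {
  ^bb0(%a: f16, %b: f16, %acc: f32):
    %ae = arith.extf %a : f16 to f32
    %be = arith.extf %b : f16 to f32
    %mul = arith.mulf %ae, %be : f32
    %sum = arith.addf %acc, %mul : f32
    linalg.yield %sum : f32
  } -> tensor<2048x2048xf32>
  return %result : tensor<2048x2048xf32>
}

The bf16 patterns in the same file differ mainly in the input element type and in the ukernel names and module (iree_uk_amdgpu_matmul_bf16.mlir) they reference.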
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir
new file mode 100644
index 0000000..d45660d
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx942_gfx950.mlir
@@ -0,0 +1,523 @@
+// RUN: iree-opt -allow-unregistered-dialect %s
+
+// F16 Patterns
+
+// This pattern matches a large f16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f16_large : benefit(1) {
+ %elemtypes = pdl.attribute = [f16, f16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2) -> (d0, d2)>,
+ affine_map<(d0, d1, d2) -> (d1, d2)>,
+ affine_map<(d0, d1, d2) -> (d0, d1)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 256 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // M, N >= 1024, K >= 256
+ %c1024 = pdl.attribute = 1024
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Also prefetching is done manually in the
+ // above and is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
+
+// This pattern matches a medium-sized f16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f16_medium_expanded : benefit(1) {
+ %elemtypes = pdl.attribute = [f16, f16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 128 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c128 = pdl.attribute = 128
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // M, N >= 1024, K >= 256
+ %c1024 = pdl.attribute = 1024
+
+ // TODO: Kernel specialization is needed to apply this strategy selectively at
+ // runtime. Additionally model exports don't specify lower bounds so it is
+ // impossible to use this strategy with this check.
+ // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f16_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
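+
+// For illustration only: a rough sketch of the kind of payload IR the medium
+// expanded f16 pattern above is intended to match. Shapes are hypothetical;
+// any sizes satisfying the divisibility and bound constraints above
+// (M % 128 == 0, K % 64 == 0, N % 256 == 0, N >= 1024, K >= 256) would work
+// equally well.
+//
+//   %cst = arith.constant 0.000000e+00 : f32
+//   %init = tensor.empty() : tensor<2x384x4096xf32>
+//   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<2x384x4096xf32>) -> tensor<2x384x4096xf32>
+//   %result = linalg.generic {
+//       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+//                        affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+//                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
+//       iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+//       ins(%lhs, %rhs : tensor<2x384x4096xf16>, tensor<4096x4096xf16>)
+//       outs(%fill : tensor<2x384x4096xf32>) {
+//     ^bb0(%a: f16, %b: f16, %acc: f32):
+//       %0 = arith.extf %a : f16 to f32
+//       %1 = arith.extf %b : f16 to f32
+//       %2 = arith.mulf %0, %1 : f32
+//       %3 = arith.addf %acc, %2 : f32
+//       linalg.yield %3 : f32
+//   } -> tensor<2x384x4096xf32>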
+
+// This pattern matches a large f16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the
+// medium-sized ukernel.
+pdl.pattern @annotate_matmul_like_f16_large_expanded : benefit(2) {
+ %elemtypes = pdl.attribute = [f16, f16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 256 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // N >= 1024, K >= 256 (the M lower-bound check is disabled; see the TODO below).
+ %c1024 = pdl.attribute = 1024
+
+ // TODO: Kernel specialization is needed to apply this strategy selectively at
+ // runtime. Additionally model exports don't specify lower bounds so it is
+ // impossible to use this strategy with this check.
+ // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f16_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
+
+// BF16 Patterns
+
+// This pattern matches a large bf16 matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_bf16_large : benefit(1) {
+ %elemtypes = pdl.attribute = [bf16, bf16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2) -> (d0, d2)>,
+ affine_map<(d0, d1, d2) -> (d1, d2)>,
+ affine_map<(d0, d1, d2) -> (d0, d1)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 256 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // M, N >= 1024, K >= 256
+ %c1024 = pdl.attribute = 1024
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c1, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [256, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
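+
+// For illustration only: a rough sketch of the non-expanded, transposed-B bf16
+// contraction the pattern above is intended to match. Shapes are hypothetical;
+// any sizes satisfying M % 256 == 0, K % 64 == 0, N % 256 == 0 and
+// M, N >= 1024, K >= 256 would work equally well.
+//
+//   %cst = arith.constant 0.000000e+00 : f32
+//   %init = tensor.empty() : tensor<2048x4096xf32>
+//   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<2048x4096xf32>) -> tensor<2048x4096xf32>
+//   %result = linalg.generic {
+//       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
+//                        affine_map<(d0, d1, d2) -> (d1, d2)>,
+//                        affine_map<(d0, d1, d2) -> (d0, d1)>],
+//       iterator_types = ["parallel", "parallel", "reduction"]}
+//       ins(%lhs, %rhs : tensor<2048x4096xbf16>, tensor<4096x4096xbf16>)
+//       outs(%fill : tensor<2048x4096xf32>) {
+//     ^bb0(%a: bf16, %b: bf16, %acc: f32):
+//       %0 = arith.extf %a : bf16 to f32
+//       %1 = arith.extf %b : bf16 to f32
+//       %2 = arith.mulf %0, %1 : f32
+//       %3 = arith.addf %acc, %2 : f32
+//       linalg.yield %3 : f32
+//   } -> tensor<2048x4096xf32>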
+
+// This pattern matches an expanded bf16 matmul-like operation of medium size and annotates it
+// with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_bf16_medium_expanded : benefit(1) {
+ %elemtypes = pdl.attribute = [bf16, bf16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 128 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c128 = pdl.attribute = 128
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // N >= 1024, K >= 256 (the M lower-bound check is disabled; see the TODO below).
+ %c4 = pdl.attribute = 4
+ %c512 = pdl.attribute = 512
+ %c1024 = pdl.attribute = 1024
+
+ // TODO: Kernel specialization is needed to apply this strategy selectively at
+ // runtime. Additionally model exports don't specify lower bounds so it is
+ // impossible to use this strategy with this check.
+ // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_bf16_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 128, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
+
+// This pattern matches an expanded bf16 matmul-like operation of large size and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the medium
+// strategy.
+pdl.pattern @annotate_matmul_like_bf16_large_expanded : benefit(2) {
+ %elemtypes = pdl.attribute = [bf16, bf16, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 256 == 0, K % 64 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c64 = pdl.attribute = 64
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c64 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // N >= 1024, K >= 256
+ %c1024 = pdl.attribute = 1024
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c256, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 256, 256, 0]}>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_bf16.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
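+
+// For illustration only: roughly what a matched generic looks like after the
+// rewrite of @annotate_matmul_like_bf16_large_expanded above has run. Attribute
+// values are taken from that pattern; the exact printed form may differ.
+//
+//   %result = linalg.generic {indexing_maps = [...], iterator_types = [...]}
+//       ins(%lhs, %rhs : ...) outs(%fill : ...)
+//       attrs = {compilation_info = #iree_codegen.compilation_info<...>,
+//                iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>,
+//                rocm.builtin_name = "iree_uk_amdgpu_matmul_bf16.mlir"} {
+//     ...
+//   } -> tensor<...>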
diff --git a/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir
new file mode 100644
index 0000000..c140d97
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/mlir_ukernel/ukernel_patterns_gfx950.mlir
@@ -0,0 +1,180 @@
+// RUN: iree-opt -allow-unregistered-dialect %s
+
+// f8E4M3FN Patterns
+
+// This pattern matches a medium-sized expanded f8E4M3FN matmul-like operation and
+// annotates it with ukernel descriptor and configuration attributes.
+pdl.pattern @annotate_matmul_like_f8E4M3FN_medium_expanded : benefit(1) {
+ %elemtypes = pdl.attribute = [f8E4M3FN, f8E4M3FN, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 128 == 0, K % 128 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c128 = pdl.attribute = 128
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // N >= 1024, K >= 512
+ %c512 = pdl.attribute = 512
+ %c1024 = pdl.attribute = 1024
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c512, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FN_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{
+ workgroup = [1, 128, 256, 0]
+ }>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
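+
+// For illustration only: a rough sketch of the expanded f8E4M3FN contraction the
+// pattern above is intended to match. Shapes are hypothetical; any sizes
+// satisfying M % 128 == 0, K % 128 == 0, N % 256 == 0, N >= 1024, K >= 512
+// would work equally well.
+//
+//   %cst = arith.constant 0.000000e+00 : f32
+//   %init = tensor.empty() : tensor<2x384x4096xf32>
+//   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<2x384x4096xf32>) -> tensor<2x384x4096xf32>
+//   %result = linalg.generic {
+//       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+//                        affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+//                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
+//       iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+//       ins(%lhs, %rhs : tensor<2x384x4096xf8E4M3FN>, tensor<4096x4096xf8E4M3FN>)
+//       outs(%fill : tensor<2x384x4096xf32>) {
+//     ^bb0(%a: f8E4M3FN, %b: f8E4M3FN, %acc: f32):
+//       %0 = arith.extf %a : f8E4M3FN to f32
+//       %1 = arith.extf %b : f8E4M3FN to f32
+//       %2 = arith.mulf %0, %1 : f32
+//       %3 = arith.addf %acc, %2 : f32
+//       linalg.yield %3 : f32
+//   } -> tensor<2x384x4096xf32>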
+
+// This pattern matches a large expanded f8E4M3FN matmul-like operation and annotates it
+// with ukernel descriptor and configuration attributes. This is preferred over the
+// medium-sized ukernel.
+pdl.pattern @annotate_matmul_like_f8E4M3FN_large_expanded : benefit(2) {
+ %elemtypes = pdl.attribute = [f8E4M3FN, f8E4M3FN, f32]
+ %imaps = pdl.attribute = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+ ]
+
+ %lhs_type = pdl.type
+ %rhs_type = pdl.type
+ %out_type = pdl.type
+ %zero_type = pdl.type : f32
+
+ %lhs = pdl.operand : %lhs_type
+ %rhs = pdl.operand : %rhs_type
+ %out_init = pdl.operand : %out_type
+
+ %zero_val = pdl.attribute = 0. : f32
+ %zero_op = pdl.operation "arith.constant" {"value" = %zero_val} -> (%zero_type : !pdl.type)
+ %zero = pdl.result 0 of %zero_op
+ %fill_op = pdl.operation "linalg.fill" (%zero, %out_init : !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ %fill = pdl.result 0 of %fill_op
+
+ // Match a matmul-like generic with the above indexing maps.
+ %generic_op = pdl.operation (%lhs, %rhs, %fill : !pdl.value, !pdl.value, !pdl.value) -> (%out_type : !pdl.type)
+ pdl.apply_native_constraint "matchContraction"(
+ %generic_op, %elemtypes, %imaps
+ : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %attr_name = pdl.attribute = "iree_codegen.ukernel"
+ pdl.apply_native_constraint "hasAttr"(%generic_op, %attr_name : !pdl.operation, !pdl.attribute) {isNegated = true}
+
+ // M % 256 == 0, K % 128 == 0, N % 256 == 0
+ %empty = pdl.attribute = {}
+ %c0 = pdl.attribute = 0
+ %c1 = pdl.attribute = 1
+ %c2 = pdl.attribute = 2
+ %c128 = pdl.attribute = 128
+ %c256 = pdl.attribute = 256
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c1, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%lhs, %c2, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c0, %c256 : !pdl.value, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsMultipleOf"(%rhs, %c1, %c128 : !pdl.value, !pdl.attribute, !pdl.attribute)
+
+ // N >= 1024, K >= 512
+ %c512 = pdl.attribute = 512
+ %c1024 = pdl.attribute = 1024
+
+ // TODO: Kernel specialization is needed to apply this strategy selectively at
+ // runtime. Additionally model exports don't specify lower bounds so it is
+ // impossible to use this strategy with this check.
+ // pdl.apply_native_constraint "dimIsBound"(%lhs, %c0, %c4, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.apply_native_constraint "dimIsBound"(%rhs, %c0, %c1024, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+ pdl.apply_native_constraint "dimIsBound"(%lhs, %c2, %c512, %empty : !pdl.value, !pdl.attribute, !pdl.attribute, !pdl.attribute)
+
+ pdl.rewrite {
+ // Call the C++ "annotateOperation" utility to add the attributes to the matched linalg.generic op.
+ // This modifies the operation in-place.
+
+ %annotation = pdl.attribute = #iree_codegen.ukernel_descriptor<"pingpong_large_f8E4M3FN_expanded", tensor>
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %attr_name, %annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %config_name = pdl.attribute = "compilation_info"
+ %config = pdl.attribute = #iree_codegen.compilation_info<
+ lowering_config = #iree_gpu.lowering_config<{
+ workgroup = [1, 256, 256, 0]
+ }>,
+ translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse
+ workgroup_size = [512, 1, 1] subgroup_size = 64,
+ // This strategy uses the maximum amount of possible shared memory on
+ // all gfx9 architectures so shared memory padding to reduce bank
+ // conflicts must be disabled. Prefetching is likewise done manually by
+ // this strategy, so it is disabled here as well.
+ {gpu_pipeline_options =
+ #iree_gpu.pipeline_options<
+ prefetch_shared_memory = false,
+ no_reduce_shared_memory_bank_conflicts = true>,
+ // This strategy requires 2 waves per SIMD.
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>
+ >
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %config_name, %config : !pdl.operation, !pdl.attribute, !pdl.attribute)
+
+ %builtin_attr = pdl.attribute = "rocm.builtin_name"
+ %builtin_annotation = pdl.attribute = "iree_uk_amdgpu_matmul_f8E4M3FN.mlir"
+ pdl.apply_native_rewrite "annotateOperation"(%generic_op, %builtin_attr, %builtin_annotation : !pdl.operation, !pdl.attribute, !pdl.attribute)
+ }
+}
diff --git a/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir b/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
index 7b5b30c..54adc75 100644
--- a/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
+++ b/compiler/plugins/target/ROCM/test/enable_tensor_ukernels.mlir
@@ -48,7 +48,7 @@
// CHECK: func.func @matmul_f8
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8E4M3FNUZ_expanded", tensor>
// CHECK-SAME: lowering_config = #iree_gpu.lowering_config
-// CHECK: util.func private @pingpong_medium_f8_expanded
+// CHECK: util.func private @pingpong_medium_f8E4M3FNUZ_expanded
// CHECK: iree_codegen.inner_tiled