[GPU] Follow the official naming convention for WMMA attributes. (#18147)
https://github.com/iree-org/iree/commit/82012e69efa433e151bceec06dd5941534cbfbb2
missed the `WMMA_F32_16x16x16_F16` case. The `WMMA_F16_16x16x16_F16` case is
fine because both its input type and output type are F16.
The revision addresses the failure on main branch:
https://github.com/iree-org/iree/actions/runs/10289449633/job/28478608054
The change was generated by the commands below.
```
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.h
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.td
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.cpp
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.mlir
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.py
```
ci-extra:
build_packages,test_amd_mi250,test_amd_mi300,test_amd_w7900,test_nvidia_t4
---------
Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
index 55098bf..d21ca0a 100644
--- a/compiler/plugins/target/ROCM/test/target_device_features.mlir
+++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -15,7 +15,7 @@
// GFX940-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
// GFX1100: target = #iree_gpu.target<arch = "gfx1100",
-// GFX1100-SAME: mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>]
+// GFX1100-SAME: mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>]
// GFX1100-SAME: subgroup_size_choices = [32, 64]
// GFX941: target = #iree_gpu.target<arch = "gfx941",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
index e229952..6f13d17 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
@@ -558,7 +558,7 @@
indexing_maps = [#map1, #map2, #map3],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
- iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+ iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
%O = iree_vector_ext.to_layout %output to #layout_c : vector<16x16xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
index adb278c..032028c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
@@ -582,7 +582,7 @@
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
- iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
+ iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
%A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
%TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16>
@@ -596,7 +596,7 @@
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
- iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
+ iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
%A2, %B2, %C2 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
func.return %M2 : vector<16x16xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 42e5487..89ac360 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -2689,7 +2689,7 @@
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
+ func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f16
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 7ccfccb..4b6cd7b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -231,7 +231,7 @@
case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
return OpaqueMmaLayout{32, 32, 16, i8, i8, i32};
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
return OpaqueMmaLayout{16, 16, 16, f16, f16, f32};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
@@ -353,7 +353,7 @@
return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
bNLayout, cMLayout, cNLayout};
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
// #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]>
// #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [1, 16]>
@@ -463,7 +463,7 @@
auto cType = VectorType::get({16}, getCType());
return std::make_tuple(aType, bType, cType);
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
auto aType = VectorType::get({16}, getAType());
auto bType = VectorType::get({16}, getBType());
@@ -492,7 +492,7 @@
case MMAIntrinsic::MFMA_I32_16x16x32_I8:
case MMAIntrinsic::MFMA_I32_32x32x16_I8:
case MMAIntrinsic::WMMA_F16_16x16x16_F16:
- case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
return 1;
}
}
@@ -510,7 +510,7 @@
case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
return 64;
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return 32;
}
@@ -542,7 +542,7 @@
return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32},
/*element=*/{1, 8}};
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 16},
/*element=*/{1, 16}};
@@ -574,7 +574,7 @@
return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
/*element=*/{8, 1}};
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1},
/*element=*/{16, 1}};
@@ -597,7 +597,7 @@
return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
/*element=*/{4, 1}};
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*strides=*/{16, 1},
/*element=*/{1, 1}};
@@ -644,7 +644,7 @@
rhs, acc)
.getResult();
}
- case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+ case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return builder.create<amdgpu::WMMAOp>(loc, resultType, lhs, rhs, acc)
.getResult();
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
index a68a8a3..d9f45e1 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -106,7 +106,7 @@
def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 4>;
def MFMA_I32_32x32x16_I8 : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 5>;
// TODO: Create separate WMMA ops for AMD and NVIDIA GPUs
-def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 6>;
+def WMMA_F32_16x16x16_F16 : I32EnumAttrCase<"WMMA_F32_16x16x16_F16", 6>;
def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>;
def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic",
@@ -117,7 +117,7 @@
MFMA_F32_16x16x32_F8E4M3FNUZ,
MFMA_I32_16x16x32_I8,
MFMA_I32_32x32x16_I8,
- WMMA_F16_16x16x16_F32,
+ WMMA_F32_16x16x16_F16,
WMMA_F16_16x16x16_F16
]>;
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
index 0766350..d100306 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
@@ -20,12 +20,12 @@
module {
func.func @test_wmma_f16_16x16x16_f32() attributes {
- mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>} {
+ mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>} {
return
}
}
// CHECK-LABEL: func @test_wmma_f16_16x16x16_f32
-// CHECK-SAME: mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+// CHECK-SAME: mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
module {
func.func @test_any_lowering_config() attributes {
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
index 6c50a8a..6f32054 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -165,7 +165,7 @@
const WgpDetails *getRDNA3WgpDetails() {
static const MMAIntrinsic rdna3MMAOps[] = {
- MMAIntrinsic::WMMA_F16_16x16x16_F32,
+ MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails rdna3Wgp = {
@@ -355,7 +355,7 @@
const WgpDetails *getAmpereWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
- MMAIntrinsic::WMMA_F16_16x16x16_F32,
+ MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails ampereWgp = {
@@ -368,7 +368,7 @@
const WgpDetails *getTuringWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
- MMAIntrinsic::WMMA_F16_16x16x16_F32,
+ MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails turingWgp = {
@@ -381,7 +381,7 @@
const WgpDetails *getVoltaWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
- MMAIntrinsic::WMMA_F16_16x16x16_F32,
+ MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
// clang-format off
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
index c870015..8939941 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
@@ -77,7 +77,7 @@
%0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
indexing_maps = #contraction_accesses,
iterator_types = [],
- kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+ kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
} : vector<16xf16>, vector<16xf16> into vector<8xf32>
return %0 : vector<8xf32>
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index d6bea0c..9f0e6cc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -203,7 +203,7 @@
// WMMA: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 64]{{\]}}
// WMMA: #iree_codegen.translation_info<LLVMGPUVectorDistribute
// WMMA-SAME: mma_schedule = #iree_gpu.mma_schedule
-// WMMA-SAME: intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+// WMMA-SAME: intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
// WMMA-SAME: subgroup_m_count = 2, subgroup_n_count = 2
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index d79d76d..9074458 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -480,7 +480,7 @@
}
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
-// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
+// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// RDNA3-SAME: prefetch_shared_memory
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
index 76e9f9b..cc0688a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
@@ -218,7 +218,7 @@
// -----
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
@@ -271,7 +271,7 @@
// -----
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
index 54fd9b1..9721435 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
@@ -65,7 +65,7 @@
// -----
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
index d69b8e5..f8c4341 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
@@ -67,7 +67,7 @@
func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf16>) -> vector<48x32xf16> attributes {
mma_schedule = #iree_gpu.mma_schedule<
- intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
+ intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
subgroup_m_count = 1, subgroup_n_count = 1>,
workgroup_size = [32, 1, 1]} {
%0 = vector.contract {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
index 9885583..2f12b15 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
@@ -258,7 +258,7 @@
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 1, 1]
subgroup_size = 32,
- {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>
// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [1, 16], elements_per_thread = [16, 1], subgroup_strides = [0, 0], thread_strides = [0, 1]>
@@ -283,7 +283,7 @@
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 1, 1]
subgroup_size = 32,
- {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
+ {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>
// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
@@ -383,7 +383,7 @@
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 4, 1]
subgroup_size = 32,
- {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>
+ {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>
// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [4, 1], outers_per_batch = [1, 1], threads_per_outer = [32, 4], elements_per_thread = [1, 32], subgroup_strides = [0, 0], thread_strides = [4, 1]>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
index d8d3770..b1da56d 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
@@ -3,7 +3,7 @@
hal.executable @dispatch {
hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "rdna3", features = "spirv:v1.6,cap:Shader",
- wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>}>) {
hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
index bb76688..ba502da 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
@@ -237,7 +237,7 @@
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
- mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
@@ -272,7 +272,7 @@
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
- mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
@@ -307,7 +307,7 @@
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
- mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
@@ -342,7 +342,7 @@
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
- mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
@@ -377,7 +377,7 @@
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
- mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+ mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py
index c57af2b..b4d371d 100644
--- a/tests/e2e/matmul/generate_e2e_matmul_tests.py
+++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py
@@ -292,13 +292,13 @@
]
elif intrinsic == "WMMA":
schedules = [
- MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 1),
- MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 2),
- MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 2, 1),
- MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 2, 1, 1),
- MMASchedule("WMMA_F16_16x16x16_F32", 2, 2, 1, 1, 1),
- MMASchedule("WMMA_F16_16x16x16_F32", 2, 4, 2, 1, 2),
- MMASchedule("WMMA_F16_16x16x16_F32", 4, 2, 4, 2, 2),
+ MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 1),
+ MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 2),
+ MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 2, 1),
+ MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 2, 1, 1),
+ MMASchedule("WMMA_F32_16x16x16_F16", 2, 2, 1, 1, 1),
+ MMASchedule("WMMA_F32_16x16x16_F16", 2, 4, 2, 1, 2),
+ MMASchedule("WMMA_F32_16x16x16_F16", 4, 2, 4, 2, 2),
]
else:
raise NotImplementedError("unhandled intrinsic case")
@@ -338,7 +338,7 @@
wg_tile_m = schedule.m_count * schedule.m_tile_count * 32
wg_tile_n = schedule.n_count * schedule.n_tile_count * 32
wg_tile_k = schedule.k_tile_count * 16
- elif schedule.intrinsic == "WMMA_F16_16x16x16_F32":
+ elif schedule.intrinsic == "WMMA_F32_16x16x16_F16":
wg_tile_m = schedule.m_count * schedule.m_tile_count * 16
wg_tile_n = schedule.n_count * schedule.n_tile_count * 16
wg_tile_k = schedule.k_tile_count * 16