[GPU] Follow the official naming convention for WMMA attributes. (#18147)

https://github.com/iree-org/iree/commit/82012e69efa433e151bceec06dd5941534cbfbb2
missed the `WMMA_F32_16x16x16_F16` case. The `WMMA_F16_16x16x16_F16` case is
fine because the input and output types are both F16.

The revision addresses the failure on main branch:
https://github.com/iree-org/iree/actions/runs/10289449633/job/28478608054

The change is generated by the commands below.

```
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.h
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.td
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.cpp
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.mlir
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.py
```

ci-extra:
build_packages,test_amd_mi250,test_amd_mi300,test_amd_w7900,test_nvidia_t4

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
index 55098bf..d21ca0a 100644
--- a/compiler/plugins/target/ROCM/test/target_device_features.mlir
+++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -15,7 +15,7 @@
 // GFX940-SAME:         mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
 
 // GFX1100: target = #iree_gpu.target<arch = "gfx1100",
-// GFX1100-SAME:        mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>]
+// GFX1100-SAME:        mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>]
 // GFX1100-SAME:        subgroup_size_choices = [32, 64]
 
 // GFX941: target = #iree_gpu.target<arch = "gfx941",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
index e229952..6f13d17 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
@@ -558,7 +558,7 @@
     indexing_maps = [#map1, #map2, #map3],
     iterator_types = ["parallel", "parallel", "reduction"],
     kind = #vector.kind<add>,
-    iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+    iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
   } %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
 
   %O = iree_vector_ext.to_layout %output to #layout_c : vector<16x16xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
index adb278c..032028c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
@@ -582,7 +582,7 @@
                                           affine_map<(d0, d1, d2) -> (d0, d1)>],
                          iterator_types = ["parallel", "parallel", "reduction"],
                          kind = #vector.kind<add>,
-                         iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
+                         iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
         %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
 
   %TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16>
@@ -596,7 +596,7 @@
                                           affine_map<(d0, d1, d2) -> (d0, d1)>],
                          iterator_types = ["parallel", "parallel", "reduction"],
                          kind = #vector.kind<add>,
-                         iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
+                         iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
        %A2, %B2, %C2  : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
 
   func.return %M2 : vector<16x16xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 42e5487..89ac360 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -2689,7 +2689,7 @@
       hal.return %x, %y, %z : index, index, index
     }
     builtin.module {
-      func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
+      func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
         %c0 = arith.constant 0 : index
         %c32_i64 = arith.constant 32 : i64
         %cst = arith.constant 0.000000e+00 : f16
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 7ccfccb..4b6cd7b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -231,7 +231,7 @@
   case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
     return OpaqueMmaLayout{32, 32, 16, i8, i8, i32};
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
     return OpaqueMmaLayout{16, 16, 16, f16, f16, f32};
   }
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
@@ -353,7 +353,7 @@
     return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
                              bNLayout,     cMLayout, cNLayout};
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]>
     // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [1, 16]>
@@ -463,7 +463,7 @@
     auto cType = VectorType::get({16}, getCType());
     return std::make_tuple(aType, bType, cType);
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     auto aType = VectorType::get({16}, getAType());
     auto bType = VectorType::get({16}, getBType());
@@ -492,7 +492,7 @@
   case MMAIntrinsic::MFMA_I32_16x16x32_I8:
   case MMAIntrinsic::MFMA_I32_32x32x16_I8:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16:
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
     return 1;
   }
   }
@@ -510,7 +510,7 @@
   case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
     return 64;
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     return 32;
   }
@@ -542,7 +542,7 @@
     return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32},
             /*element=*/{1, 8}};
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 16},
             /*element=*/{1, 16}};
@@ -574,7 +574,7 @@
     return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
             /*element=*/{8, 1}};
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1},
             /*element=*/{16, 1}};
@@ -597,7 +597,7 @@
     return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
             /*element=*/{4, 1}};
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*strides=*/{16, 1},
             /*element=*/{1, 1}};
@@ -644,7 +644,7 @@
                                 rhs, acc)
         .getResult();
   }
-  case MMAIntrinsic::WMMA_F16_16x16x16_F32:
+  case MMAIntrinsic::WMMA_F32_16x16x16_F16:
   case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
     return builder.create<amdgpu::WMMAOp>(loc, resultType, lhs, rhs, acc)
         .getResult();
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
index a68a8a3..d9f45e1 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -106,7 +106,7 @@
 def MFMA_I32_16x16x32_I8  : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 4>;
 def MFMA_I32_32x32x16_I8  : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 5>;
 // TODO: Create separate WMMA ops for AMD and NVIDIA GPUs
-def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 6>;
+def WMMA_F32_16x16x16_F16 : I32EnumAttrCase<"WMMA_F32_16x16x16_F16", 6>;
 def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>;
 
 def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic",
@@ -117,7 +117,7 @@
       MFMA_F32_16x16x32_F8E4M3FNUZ,
       MFMA_I32_16x16x32_I8,
       MFMA_I32_32x32x16_I8,
-      WMMA_F16_16x16x16_F32,
+      WMMA_F32_16x16x16_F16,
       WMMA_F16_16x16x16_F16
     ]>;
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
index 0766350..d100306 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
@@ -20,12 +20,12 @@
 
 module {
   func.func @test_wmma_f16_16x16x16_f32() attributes {
-      mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>} {
+      mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>} {
     return
   }
 }
 // CHECK-LABEL: func @test_wmma_f16_16x16x16_f32
-//  CHECK-SAME:   mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+//  CHECK-SAME:   mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 
 module {
   func.func @test_any_lowering_config() attributes {
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
index 6c50a8a..6f32054 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -165,7 +165,7 @@
 
 const WgpDetails *getRDNA3WgpDetails() {
   static const MMAIntrinsic rdna3MMAOps[] = {
-      MMAIntrinsic::WMMA_F16_16x16x16_F32,
+      MMAIntrinsic::WMMA_F32_16x16x16_F16,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
   static const WgpDetails rdna3Wgp = {
@@ -355,7 +355,7 @@
 
 const WgpDetails *getAmpereWgpDetails() {
   static const MMAIntrinsic mmaOps[] = {
-      MMAIntrinsic::WMMA_F16_16x16x16_F32,
+      MMAIntrinsic::WMMA_F32_16x16x16_F16,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
   static const WgpDetails ampereWgp = {
@@ -368,7 +368,7 @@
 
 const WgpDetails *getTuringWgpDetails() {
   static const MMAIntrinsic mmaOps[] = {
-      MMAIntrinsic::WMMA_F16_16x16x16_F32,
+      MMAIntrinsic::WMMA_F32_16x16x16_F16,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
   static const WgpDetails turingWgp = {
@@ -381,7 +381,7 @@
 
 const WgpDetails *getVoltaWgpDetails() {
   static const MMAIntrinsic mmaOps[] = {
-      MMAIntrinsic::WMMA_F16_16x16x16_F32,
+      MMAIntrinsic::WMMA_F32_16x16x16_F16,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
   // clang-format off
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
index c870015..8939941 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir
@@ -77,7 +77,7 @@
   %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
     indexing_maps = #contraction_accesses,
     iterator_types = [],
-    kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+    kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
   } : vector<16xf16>, vector<16xf16> into vector<8xf32>
   return %0 : vector<8xf32>
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index d6bea0c..9f0e6cc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -203,7 +203,7 @@
 // WMMA:      #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes =  {{\[}}[64, 128, 64]{{\]}}
 // WMMA:      #iree_codegen.translation_info<LLVMGPUVectorDistribute
 // WMMA-SAME: mma_schedule = #iree_gpu.mma_schedule
-// WMMA-SAME:   intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+// WMMA-SAME:   intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 // WMMA-SAME:   subgroup_m_count = 2, subgroup_n_count = 2
 
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index d79d76d..9074458 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -480,7 +480,7 @@
 }
 
 //       RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
-//  RDNA3-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
+//  RDNA3-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
 //  RDNA3-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
 //  RDNA3-SAME:   prefetch_shared_memory
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
index 76e9f9b..cc0688a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
@@ -218,7 +218,7 @@
 
 // -----
 
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
@@ -271,7 +271,7 @@
 
 // -----
 
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
index 54fd9b1..9721435 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
@@ -65,7 +65,7 @@
 
 // -----
 
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
index d69b8e5..f8c4341 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir
@@ -67,7 +67,7 @@
 
 func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf16>) -> vector<48x32xf16> attributes {
     mma_schedule = #iree_gpu.mma_schedule<
-      intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
+      intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
       subgroup_m_count = 1, subgroup_n_count = 1>,
     workgroup_size = [32, 1, 1]} {
     %0 = vector.contract {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
index 9885583..2f12b15 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir
@@ -258,7 +258,7 @@
 #translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                               workgroup_size = [32, 1, 1]
                                               subgroup_size = 32,
-      {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
+      {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>
 
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [1, 16], elements_per_thread = [16, 1], subgroup_strides = [0, 0], thread_strides = [0, 1]>
@@ -283,7 +283,7 @@
 #translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                               workgroup_size = [32, 1, 1]
                                               subgroup_size = 32,
-      {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
+      {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>
 
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
@@ -383,7 +383,7 @@
 #translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                               workgroup_size = [32, 4, 1]
                                               subgroup_size = 32,
-      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>
+      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>
 
 // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [4, 1], outers_per_batch = [1, 1], threads_per_outer = [32, 4], elements_per_thread = [1, 32], subgroup_strides = [0, 0], thread_strides = [4, 1]>
 
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
index d8d3770..b1da56d 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
@@ -3,7 +3,7 @@
 hal.executable @dispatch {
 hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
     iree.gpu.target = #iree_gpu.target<arch = "rdna3", features = "spirv:v1.6,cap:Shader",
-      wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+      wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
       subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>}>) {
   hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>) {
   ^bb0(%arg0: !hal.device):
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
index bb76688..ba502da 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
@@ -237,7 +237,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
   iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
     compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
-    mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+    mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
     subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
 }>
@@ -272,7 +272,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
   iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
     compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
-    mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+    mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
     subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
 }>
@@ -307,7 +307,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
   iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
     compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
-    mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+    mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
     subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
 }>
@@ -342,7 +342,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
   iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
     compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
-    mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+    mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
     subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
 }>
@@ -377,7 +377,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
   iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
     compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
-    mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
+    mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
     subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
 }>
diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py
index c57af2b..b4d371d 100644
--- a/tests/e2e/matmul/generate_e2e_matmul_tests.py
+++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py
@@ -292,13 +292,13 @@
         ]
     elif intrinsic == "WMMA":
         schedules = [
-            MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 1),
-            MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 2),
-            MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 2, 1),
-            MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 2, 1, 1),
-            MMASchedule("WMMA_F16_16x16x16_F32", 2, 2, 1, 1, 1),
-            MMASchedule("WMMA_F16_16x16x16_F32", 2, 4, 2, 1, 2),
-            MMASchedule("WMMA_F16_16x16x16_F32", 4, 2, 4, 2, 2),
+            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 1),
+            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 2),
+            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 2, 1),
+            MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 2, 1, 1),
+            MMASchedule("WMMA_F32_16x16x16_F16", 2, 2, 1, 1, 1),
+            MMASchedule("WMMA_F32_16x16x16_F16", 2, 4, 2, 1, 2),
+            MMASchedule("WMMA_F32_16x16x16_F16", 4, 2, 4, 2, 2),
         ]
     else:
         raise NotImplementedError("unhandled intrinsic case")
@@ -338,7 +338,7 @@
             wg_tile_m = schedule.m_count * schedule.m_tile_count * 32
             wg_tile_n = schedule.n_count * schedule.n_tile_count * 32
             wg_tile_k = schedule.k_tile_count * 16
-        elif schedule.intrinsic == "WMMA_F16_16x16x16_F32":
+        elif schedule.intrinsic == "WMMA_F32_16x16x16_F16":
             wg_tile_m = schedule.m_count * schedule.m_tile_count * 16
             wg_tile_n = schedule.n_count * schedule.n_tile_count * 16
             wg_tile_k = schedule.k_tile_count * 16