[cuda] Fix verifier for tensorcore pipeline. (#8622)

The second level of tiling didn't account for the fact that the N
dimension is distributed across x threads and the M dimension across y threads.
diff --git a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
index fda1c08..c4854ea 100644
--- a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -160,15 +160,11 @@
            << pipelineName << ", got " << workgroupSize[2];
   }
 
-  // The second level of tiling = first level tile size divided by the
-  // warps per workgroup size
-  SmallVector<int64_t, 3> warpsPerWorkgroup = {
-      workgroupSize[0] / kWarpSize, workgroupSize[1], workgroupSize[2]};
-  SmallVector<int64_t, 3> secondLevelTileSizes;
-  for (int i = 0; i < 3; ++i) {
-    secondLevelTileSizes.push_back(firstLevelTileSizes[i] /
-                                   warpsPerWorkgroup[i]);
-  }
+  // The second level of tiling = [M / numWarp.y, N / numWarp.x, K].
+  SmallVector<int64_t, 3> secondLevelTileSizes = {
+      firstLevelTileSizes[0] / workgroupSize[1],
+      firstLevelTileSizes[1] / (workgroupSize[0] / kWarpSize),
+      firstLevelTileSizes[2]};
 
   // Verify the TensorCore size divides the second level tile size
   SmallVector<int64_t, 3> tensorCoreSize({16, 16, 8});
diff --git a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 58e9859..37d03b3 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -192,6 +192,38 @@
 
 // -----
 
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 16]]>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_tensors {
+  hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
+    hal.executable.entry_point @illegal layout(#executable_layout) {
+      translation_info = #translation,
+      workgroup_size = [128 : index, 1 : index, 1 : index]
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
+        %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
+        %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
+        // expected-error @+1 {{tensorcore size doesn't factor into second level tile size for LLVMGPUMatmulTensorCore}}
+        linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>)
+          outs(%result: memref<1024x256xf32>)
+        return
+      }
+    }
+  }
+}
+
+// -----
+
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
 #translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
 #executable_layout = #hal.executable.layout<push_constants = 0, sets = [