[cuda] Fix verifier for tensorcore pipeline. (#8622)
The second level of tiling didn't account for the fact that we distribute the N
dimension across x threads and the M dimension across y threads.
diff --git a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
index fda1c08..c4854ea 100644
--- a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -160,15 +160,11 @@
<< pipelineName << ", got " << workgroupSize[2];
}
- // The second level of tiling = first level tile size divided by the
- // warps per workgroup size
- SmallVector<int64_t, 3> warpsPerWorkgroup = {
- workgroupSize[0] / kWarpSize, workgroupSize[1], workgroupSize[2]};
- SmallVector<int64_t, 3> secondLevelTileSizes;
- for (int i = 0; i < 3; ++i) {
- secondLevelTileSizes.push_back(firstLevelTileSizes[i] /
- warpsPerWorkgroup[i]);
- }
+ // The second level of tiling = [M / numWarp.y, N / numWarp.x, K].
+ SmallVector<int64_t, 3> secondLevelTileSizes = {
+ firstLevelTileSizes[0] / workgroupSize[1],
+ firstLevelTileSizes[1] / (workgroupSize[0] / kWarpSize),
+ firstLevelTileSizes[2]};
// Verify the TensorCore size divides the second level tile size
SmallVector<int64_t, 3> tensorCoreSize({16, 16, 8});
diff --git a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 58e9859..37d03b3 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -192,6 +192,38 @@
// -----
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 16]]>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable private @matmul_tensors {
+ hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
+ hal.executable.entry_point @illegal layout(#executable_layout) {
+ translation_info = #translation,
+ workgroup_size = [128 : index, 1 : index, 1 : index]
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
+ %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
+ %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
+ // expected-error @+1 {{tensorcore size doesn't factor into second level tile size for LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>)
+ outs(%result: memref<1024x256xf32>)
+ return
+ }
+ }
+ }
+}
+
+// -----
+
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [