[cuda] Fix verifier for tensorcore pipeline. (#8622) The second level of tiling didn't take into account that we distribute the N dimension across x threads and the M dimension across y threads.
diff --git a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp index fda1c08..c4854ea 100644 --- a/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp +++ b/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -160,15 +160,11 @@ << pipelineName << ", got " << workgroupSize[2]; } - // The second level of tiling = first level tile size divided by the - // warps per workgroup size - SmallVector<int64_t, 3> warpsPerWorkgroup = { - workgroupSize[0] / kWarpSize, workgroupSize[1], workgroupSize[2]}; - SmallVector<int64_t, 3> secondLevelTileSizes; - for (int i = 0; i < 3; ++i) { - secondLevelTileSizes.push_back(firstLevelTileSizes[i] / - warpsPerWorkgroup[i]); - } + // The second level of tiling = [M / numWarp.y, N / numWarp.x, K]. + SmallVector<int64_t, 3> secondLevelTileSizes = { + firstLevelTileSizes[0] / workgroupSize[1], + firstLevelTileSizes[1] / (workgroupSize[0] / kWarpSize), + firstLevelTileSizes[2]}; // Verify the TensorCore size divides the second level tile size SmallVector<int64_t, 3> tensorCoreSize({16, 16, 8});
diff --git a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir index 58e9859..37d03b3 100644 --- a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir +++ b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -192,6 +192,38 @@ // ----- +#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 16]]> +#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore> +#executable_layout = #hal.executable.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +hal.executable private @matmul_tensors { + hal.executable.variant @cuda, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> { + hal.executable.entry_point @illegal layout(#executable_layout) { + translation_info = #translation, + workgroup_size = [128 : index, 1 : index, 1 : index] + } + builtin.module { + func @illegal() { + %c0 = arith.constant 0 : index + %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32> + %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32> + %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32> + // expected-error @+1 {{tensorcore size doesn't factor into second level tile size for LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>) + outs(%result: memref<1024x256xf32>) + return + } + } + } +} + +// ----- + #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]> #translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore> #executable_layout = #hal.executable.layout<push_constants = 0, sets = [