[LLVMGPU] Improve mfma tile sizes for TileAndFuse pipeline (#18459)
This change attempts to select better tile sizes for the TileAndFuse
pipeline based on the number of elements loaded along the K dimension of
a tile. A new field, `bestKElementCountPerSubgroup`, is added to
`GPUMMAHeuristicSeeds`; it represents the best number of elements to
have along the K dimension of a subgroup tile. This can be more useful
than the existing `bestKTileCountPerSubgroup`, since it takes into
account the size of the intrinsic being selected: when the new field is
nonzero, the K tile count is computed as
`ceil(bestKElementCountPerSubgroup / intrinsic.kSize)`; when it is zero
(the default), `bestKTileCountPerSubgroup` is used as before. Controlling
the specific number of elements loaded gives finer control over overall
VGPR utilization and load queue saturation.
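The tile size selection logic in `setMatmulLoweringConfig` has been
tuned using this new field: the seed is set to the number of input
elements that fit in `kCacheLineSizeBits` (128 * 8 bits), i.e.
`kCacheLineSizeBits / inBitWidth`. The new tile sizes improve overall
performance on SDXL for the TileAndFuse pipeline compared to the old
tile size selection logic.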
---------
Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index 008e7bc..dc30783 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -244,8 +244,13 @@
const uint64_t kTotalTileCount =
llvm::divideCeil(problem.kSize, intrinsic.kSize);
- APInt kGCD = GreatestCommonDivisor(
- APInt(64, kTotalTileCount), APInt(64, seeds.bestKTileCountPerSubgroup));
+ int64_t bestKTileCountPerSubgroup =
+ seeds.bestKElementCountPerSubgroup
+ ? llvm::divideCeil(seeds.bestKElementCountPerSubgroup,
+ intrinsic.kSize)
+ : seeds.bestKTileCountPerSubgroup;
+ APInt kGCD = GreatestCommonDivisor(APInt(64, kTotalTileCount),
+ APInt(64, bestKTileCountPerSubgroup));
int64_t kTileCount = kGCD.getSExtValue();
return GPUMMASchedule{intrinsicIndex, intrinsic.mSize, intrinsic.nSize,
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a05afe6..8211443 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -29,6 +29,10 @@
int64_t bestMNTileCountPerSubgroup;
// The best number of tiles along K dimension per subgroup
int64_t bestKTileCountPerSubgroup;
+ // The best number of elements along K dimension per subgroup. This is
+ // equivalent to `bestKTileCountPerSubgroup * bestIntrinsic.kSize`, for
+ // some chosen intrinsic `bestIntrinsic`.
+ int64_t bestKElementCountPerSubgroup = 0;
};
struct GPUMMASchedule {
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 4fc2b67..cfe13f6 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -27,6 +27,8 @@
namespace mlir::iree_compiler::IREE::GPU {
+constexpr int64_t kCacheLineSizeBits = 128 * 8;
+
LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op) {
@@ -86,6 +88,7 @@
return failure();
GPUMMAHeuristicSeeds seeds;
+ int64_t inBitWidth = lhsElemType.getIntOrFloatBitWidth();
// Note that the following heuristic seeds are just placeholder values.
// We need to clean it up and make it adjusting to different targets.
@@ -96,11 +99,13 @@
// and a larger bestKTileCountPerSubgroup.
seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
/*bestMNTileCountPerSubgroup=*/4,
- /*bestKTileCountPerSubgroup=*/8};
+ /*bestKTileCountPerSubgroup=*/8,
+ /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
} else {
seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
/*bestMNTileCountPerSubgroup=*/8,
- /*bestKTileCountPerSubgroup=*/4};
+ /*bestKTileCountPerSubgroup=*/4,
+ /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
}
int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index cc57259..2abc60a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -34,7 +34,7 @@
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
-// CHECK-SAME: reduction = [0 : index, 0 : index, 0 : index, 0 : index, 8 : index]
+// CHECK-SAME: reduction = [0 : index, 0 : index, 0 : index, 0 : index, 4 : index]
// CHECK-SAME: subgroup = [0 : index, 0 : index, 4 : index, 1 : index, 0 : index]
// CHECK-SAME: workgroup = [1 : index, 1 : index, 64 : index, 64 : index, 0 : index]