[LLVMGPU] Improve mfma tile sizes for TileAndFuse pipeline (#18459)

This attempts to select better tile sizes on the TileAndFuse pipeline based on the number of elements loaded along the K dimension tile.

A new field is added to `GPUMMAHeuristicSeeds` called `bestKElementCountPerSubgroup`, which represents the best number of elements to have along the K dimension of a subgroup tile. This can be more useful than the existing `bestKTileCountPerSubgroup`, since it takes into account the size of the intrinsic being selected. Having control over the specific number of elements loaded helps give better control over overall VGPR utilization and load queue saturation.

The tile size selection logic in `setMatmulLoweringConfig` has been tuned using this new field. The current tile sizes improve overall performance on SDXL for the TileAndFuse pipeline over the old tile size selection logic.

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>

commit: d5c4ef12db0d24204a208c45a881b852a3e56a42 [log] [tgz]
author: Max191 <44243577+Max191@users.noreply.github.com> Mon Sep 09 13:06:20 2024 -0700
committer: GitHub <noreply@github.com> Mon Sep 09 16:06:20 2024 -0400
tree: 5d117a2795cfbd4e698d1ac0599947033bb88ee0
parent: 69ca7dfa042141602f0ff748c456c31be86e3db6 [diff]
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index 008e7bc..dc30783 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp

@@ -244,8 +244,13 @@
 
   const uint64_t kTotalTileCount =
       llvm::divideCeil(problem.kSize, intrinsic.kSize);
-  APInt kGCD = GreatestCommonDivisor(
-      APInt(64, kTotalTileCount), APInt(64, seeds.bestKTileCountPerSubgroup));
+  int64_t bestKTileCountPerSubgroup =
+      seeds.bestKElementCountPerSubgroup
+          ? llvm::divideCeil(seeds.bestKElementCountPerSubgroup,
+                             intrinsic.kSize)
+          : seeds.bestKTileCountPerSubgroup;
+  APInt kGCD = GreatestCommonDivisor(APInt(64, kTotalTileCount),
+                                     APInt(64, bestKTileCountPerSubgroup));
   int64_t kTileCount = kGCD.getSExtValue();
 
   return GPUMMASchedule{intrinsicIndex,  intrinsic.mSize, intrinsic.nSize,

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a05afe6..8211443 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h

@@ -29,6 +29,10 @@
   int64_t bestMNTileCountPerSubgroup;
   // The best number of tiles along K dimension per subgroup
   int64_t bestKTileCountPerSubgroup;
+  // The best number of elements along K dimension per subgroup. This is
+  // equivalent to `bestKTileCountPerSubgroup * bestIntrinsic.kSize`, for
+  // some chosen intrinsic `bestIntrinsic`.
+  int64_t bestKElementCountPerSubgroup = 0;
 };
 
 struct GPUMMASchedule {

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 4fc2b67..cfe13f6 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

@@ -27,6 +27,8 @@
 
 namespace mlir::iree_compiler::IREE::GPU {
 
+constexpr int64_t kCacheLineSizeBits = 128 * 8;
+
 LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
                                       mlir::FunctionOpInterface entryPoint,
                                       Operation *op) {
@@ -86,6 +88,7 @@
     return failure();
 
   GPUMMAHeuristicSeeds seeds;
+  int64_t inBitWidth = lhsElemType.getIntOrFloatBitWidth();
 
   // Note that the following heuristic seeds are just placeholder values.
   // We need to clean it up and make it adjusting to different targets.
@@ -96,11 +99,13 @@
     // and a larger bestKTileCountPerSubgroup.
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/4,
-             /*bestKTileCountPerSubgroup=*/8};
+             /*bestKTileCountPerSubgroup=*/8,
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
   } else {
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/8,
-             /*bestKTileCountPerSubgroup=*/4};
+             /*bestKTileCountPerSubgroup=*/4,
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
   }
 
   int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index cc57259..2abc60a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

@@ -34,7 +34,7 @@
 
 //       CHECK:   linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
-//  CHECK-SAME:     reduction = [0 : index, 0 : index, 0 : index, 0 : index, 8 : index]
+//  CHECK-SAME:     reduction = [0 : index, 0 : index, 0 : index, 0 : index, 4 : index]
 //  CHECK-SAME:     subgroup = [0 : index, 0 : index, 4 : index, 1 : index, 0 : index]
 //  CHECK-SAME:     workgroup = [1 : index, 1 : index, 64 : index, 64 : index, 0 : index]
commit	d5c4ef12db0d24204a208c45a881b852a3e56a42	[log] [tgz]
author	Max191 <44243577+Max191@users.noreply.github.com>	Mon Sep 09 13:06:20 2024 -0700
committer	GitHub <noreply@github.com>	Mon Sep 09 16:06:20 2024 -0400
tree	5d117a2795cfbd4e698d1ac0599947033bb88ee0
parent	69ca7dfa042141602f0ff748c456c31be86e3db6 [diff]