[GPU] Disable unaligned-to-intrinsic batch matmul codegen with vector distribute (#18935)

This path doesn't support all batch matmul shapes, but it currently tries to
handle them anyway and fails, e.g. https://github.com/iree-org/iree/issues/18601
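
For illustration, a batch matmul of the kind that gets routed down this path might look like the snippet below. The shapes here are hypothetical, not taken from the issue; they are chosen so the K dimension is not a multiple of any MFMA intrinsic K and therefore needs padding.

```mlir
// Hypothetical unaligned batch matmul: K = 999 is not a multiple of the
// f16 MFMA intrinsic K (8 or 16), so the operands require padding.
func.func @unaligned_bmm(%lhs: tensor<4x128x999xf16>,
                         %rhs: tensor<4x999x256xf16>) -> tensor<4x128x256xf32> {
  %c0 = arith.constant 0.0 : f32
  %empty = tensor.empty() : tensor<4x128x256xf32>
  %fill = linalg.fill ins(%c0 : f32)
      outs(%empty : tensor<4x128x256xf32>) -> tensor<4x128x256xf32>
  %result = linalg.batch_matmul
      ins(%lhs, %rhs : tensor<4x128x999xf16>, tensor<4x999x256xf16>)
      outs(%fill : tensor<4x128x256xf32>) -> tensor<4x128x256xf32>
  return %result : tensor<4x128x256xf32>
}
```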

Since by default we should favor functional support over performance, this
PR keeps the path behind a flag that is off by default.

Fixes: https://github.com/iree-org/iree/issues/18601

If we bail out here, we fall back to SIMT for now (note that we already do
so for non-batch-matmul GEMMs with such shapes), with TileAndFuse pipeline
support planned for the future. In the meantime, models whose shapes are
supported by this path can opt back in with the provided flag, and tuners
can always select this pipeline when it works for a given shape. We can also
turn it back on by default once we have reliable heuristics for when it is
safe to use this path. A sample invocation with the flag is sketched below.
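
Opting back in mirrors the updated RUN line from the test in this PR; the input file name is a placeholder:

```mlir
// Re-enabling the pad-and-vector-distribute path (input.mlir is a placeholder):
// iree-opt --split-input-file --iree-gpu-test-target=gfx940 \
//   --iree-codegen-llvmgpu-use-vector-distribution \
//   --iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution \
//   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" input.mlir
```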

---------

Signed-off-by: Nirvedh <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index b4567e3..344565b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -62,6 +62,15 @@
     llvm::cl::desc("enable the usage of the vector distribution pipeline"),
     llvm::cl::init(true));
 
+// TODO (nirvedhmeshram): Drop this whole path after we have support with
+// TileAndFuse pipeline from completion of
+// https://github.com/iree-org/iree/issues/18858
+llvm::cl::opt<bool> clGPUUnalignedGEMMVectorDistribution(
+    "iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution",
+    llvm::cl::desc("enable the usage of the vector distribution pipeline for "
+                   "unaligned GEMMs when supported"),
+    llvm::cl::init(false));
+
 /// Flag to force using WMMA tensorcore operations.
 llvm::cl::opt<bool>
     clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma",
@@ -598,7 +607,8 @@
   // Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
   // pipeline.
   // TODO(hanchung): Support cases that there are fused producers.
-  if (!schedule && !contractionDims->batch.empty() && !hasFusedLeadingOp(op)) {
+  if (!schedule && !contractionDims->batch.empty() && !hasFusedLeadingOp(op) &&
+      clGPUUnalignedGEMMVectorDistribution) {
     LDBG("Matmul Pad and Vector Distribute");
     pipeline = CodeGenPipeline::LLVMGPUPadAndVectorDistribute;
     bool mustBeAligned = false;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir
index a15177d..da6a563 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx940.mlir
@@ -1,4 +1,5 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --iree-codegen-llvmgpu-use-vector-distribution \
+// RUN:   --iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution \
 // RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
 
 // TODO: This test is still using the legacy LLVMGPU kernel config. This needs