[LLVMGPU][TD] Don't apply the unaligned strategy for unsupported cases (#13450)

The current transform dialect strategy has limitations with respect to the
optionality of pads. As a result, make sure we don't apply this strategy
when some of the pads may be folded away.

The existing code was already checking for that, but the condition was
slightly off.

Fix that, i.e., only apply the unaligned strategy when both M and K, or both
N and K, are unaligned.

This fixes https://github.com/openxla/iree/issues/13448
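
For illustration, here is the corrected check evaluated on the shape exercised
by the new test below. This is a minimal standalone sketch, not the compiler
code itself; `matmulSize` holds {M, N, K} as in `Common.cpp`, and 64/16 are
the conservative tile sizes mentioned in the comment there:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // {M, N, K} for the new test: 2048x2044 * 2044x1024 -> 2048x1024.
  std::array<int64_t, 3> matmulSize = {2048, 1024, 2044};

  // The strategy only supports the cases where both M and K, or both N and
  // K, are unaligned to the conservative tile sizes (64 for M/N, 16 for K).
  bool supportedUnalignedCases =
      (matmulSize[0] % 64 != 0 && matmulSize[2] % 16 != 0) ||
      (matmulSize[1] % 64 != 0 && matmulSize[2] % 16 != 0);

  // Here M (2048) and N (1024) are both aligned to 64, so even though K
  // (2044) is unaligned to 16, neither clause holds and we must bail.
  assert(!supportedUnalignedCases);
  return 0;
}
```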
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
index 9c04dfc..60f59db 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
@@ -260,3 +260,38 @@
 // generalization along this axis.
 // CHECK-NOT: transform.sequence
 
+// -----
+hal.executable @matmul_partially_unaligned {
+hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}> {
+  hal.executable.export public @matmul ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
+  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
+    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+    hal.return %x, %y, %z : index, index, index
+  }
+  builtin.module {
+    func.func @matmul_partially_unaligned() {
+      %c0 = arith.constant 0 : index
+      %cst = arith.constant 0.000000e+00 : f32
+      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>>
+      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>>
+      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
+      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2044], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>> -> tensor<2048x2044xf32>
+      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>> -> tensor<2044x1024xf32>
+      %5 = tensor.empty() : tensor<2048x1024xf32>
+      %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
+      %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
+      flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
+      return
+    }
+  }
+}
+}
+
+// CHECK-LABEL: func @matmul_partially_unaligned
+
+// "Enough" of this matmul's dimensions are divisible by 64/64/16.
+// We currently bail on such cases because at least one of the paddings involved
+// in the strategy fold away and result in the strategy failing to apply.
+// In the future we should also support this case but for now we are missing the
+// generalization along this axis.
+// CHECK-NOT: transform.sequence
diff --git a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
index be42749..0041111 100644
--- a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
@@ -525,10 +525,11 @@
   //   - n and k are not aligned to the tile sizes (conservatively, take 64, 16)
   // Other cases currently result in folding and fall back to the default
   // unaligned IREE strategy.
-  bool unsupportedAlignedCases =
-      (matmulSize[0] % 64 == 0 && matmulSize[2] % 16 == 0) ||
-      (matmulSize[1] % 64 == 0 && matmulSize[2] % 16 == 0);
-  if (unsupportedAlignedCases) {
+  bool supportedUnalignedCases =
+      (matmulSize[0] % 64 != 0 && matmulSize[2] % 16 != 0) ||
+      (matmulSize[1] % 64 != 0 && matmulSize[2] % 16 != 0);
+
+  if (!supportedUnalignedCases) {
     LDBG("--Matmul strategy alignment check failed\n");
     return failure();
   }