[Codegen][GPU] Fuse into destinations for parallel tiling (#18666)

Currently we disable fusion into destination operands for all tiling levels,
but that is only required for reduction tiling. Enable fusion into
destinations for parallel tiling levels.
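
To illustrate the intended effect, below is a hand-written sketch (not actual compiler output) of the IR shape this enables after thread-level tiling of a matmul: the `linalg.fill` that produces the matmul's destination can now be fused into the `scf.forall` body, so each thread only initializes its own tile. The function name, tile sizes, and slice shapes are illustrative only.

```mlir
func.func @fused_fill_after_thread_tiling(%lhs: tensor<64x64xf32>,
                                          %rhs: tensor<64x64xf32>) -> tensor<64x64xf32> {
  %cst = arith.constant 0.0 : f32
  %empty = tensor.empty() : tensor<64x64xf32>
  %result = scf.forall (%i, %j) = (0, 0) to (64, 64) step (8, 8)
      shared_outs(%init = %empty) -> (tensor<64x64xf32>) {
    %lhs_slice = tensor.extract_slice %lhs[%i, 0] [8, 64] [1, 1]
        : tensor<64x64xf32> to tensor<8x64xf32>
    %rhs_slice = tensor.extract_slice %rhs[0, %j] [64, 8] [1, 1]
        : tensor<64x64xf32> to tensor<64x8xf32>
    %out_slice = tensor.extract_slice %init[%i, %j] [8, 8] [1, 1]
        : tensor<64x64xf32> to tensor<8x8xf32>
    // The fill is fused into the loop and only initializes this thread's tile.
    %fill = linalg.fill ins(%cst : f32) outs(%out_slice : tensor<8x8xf32>) -> tensor<8x8xf32>
    %mm = linalg.matmul
        ins(%lhs_slice, %rhs_slice : tensor<8x64xf32>, tensor<64x8xf32>)
        outs(%fill : tensor<8x8xf32>) -> tensor<8x8xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %mm into %init[%i, %j] [8, 8] [1, 1]
          : tensor<8x8xf32> into tensor<64x64xf32>
    }
  }
  return %result : tensor<64x64xf32>
}
```

For reduction tiling the old behavior is kept: the fill stays outside the `scf.for` loop and feeds it through `iter_args`, as exercised by the new test below.
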
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
index 4b029d0..a6b6bf8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
@@ -148,8 +148,11 @@
       if (auto tilingOwner = dyn_cast<TilingInterface>(owner)) {
         shouldFuse = !payloadOps.contains(tilingOwner);
       }
-      // Do not fuse destination operands.
-      shouldFuse &= !isDestinationOperand;
+      // Do not fuse destination operands for reduction tiling.
+      if (isDestinationOperand &&
+          tilingLevel == IREE::GPU::TilingLevel::Reduction) {
+        shouldFuse = false;
+      }
       if (shouldFuse) {
         return scf::SCFTileAndFuseOptions::ControlFnResult{
             yieldProducerReplacement};
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
index 22001c7..18b620f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
@@ -155,6 +155,30 @@
 
 // -----
 
+#config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [8, 8, 0]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+func.func @matmul_fuse_destination(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %empty = tensor.empty() : tensor<64x64xf32>
+  %cst = arith.constant 0.0 : f32
+  %5 = linalg.fill ins(%cst : f32) outs(%empty : tensor<64x64xf32>) -> tensor<64x64xf32>
+  %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %7 : tensor<64x64xf32>
+}
+
+// Verify that destinations are not fused for reduction tiling.
+// CHECK-LABEL: func.func @matmul_fuse_destination
+//       CHECK:   %[[FILL:.+]] = linalg.fill ins(%{{.*}} : tensor<64x64xf32>)
+//       CHECK:   scf.for %{{.*}} = %c0 to %c64 step %c8 iter_args(%[[ITER:.+]] = %[[FILL]]
+//       CHECK:     linalg.matmul
+
+// THREAD-LABEL: func.func @matmul_fuse_destination
+//       THREAD:   %[[EMPTY:.+]] = tensor.empty() : tensor<64x64xf32>
+//       THREAD:   scf.forall {{.*}} shared_outs(%[[INIT:.+]] = %[[EMPTY]]
+//       THREAD:     linalg.fill
+//       THREAD:     linalg.matmul
+
+// -----
+
 #config = #iree_gpu.lowering_config<{thread = [8, 8]}>
 func.func @matmul_cleanup(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32> {
   %c8 = arith.constant 8 : index