[Codegen][GPU] Avoid generic TileAndFuse promotion for convolutions. (#24435)

Generic TileAndFuse promotion can attach promote_operands to convolution
ops that did not go through a convolution-specific MMA config path.
Those paths are responsible for selecting an MMA schedule and attaching
mma_kind.

Skip generic contraction promotion for convolution ops by making
isNonMatvecContraction also require linalg::isaContractionOpInterface,
so direct conv and IGEMM configs remain the only promoted convolution
paths.

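This fixes NaN issues for ResNet ONNX models: resnet50-v1-12 and
resnet50-v2-7 change from "fail-run" to "pass" in the RDNA3 and RDNA4
HIP expectation files.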

Fixes https://github.com/iree-org/iree/issues/24414

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 151f542..fba5c36 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -1180,10 +1180,10 @@
                             workgroupSize, targetSubgroupSize, pipelineConfig));
 }
 
-/// Helper to identify contraction like operations for operand promotiong.
+/// Helper to identify contraction-like operations for operand promotion.
 static bool isNonMatvecContraction(Operation *op) {
   auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
-  if (!linalgOp) {
+  if (!linalgOp || !linalg::isaContractionOpInterface(linalgOp)) {
     return false;
   }
   SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
index 541a9b4..5fd7192 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
@@ -26,6 +26,7 @@
             "config_igemm_tile_and_fuse.mlir",
             "config_reduction_transposed_output.mlir",
             "config_tile_and_fuse.mlir",
+            "config_tile_and_fuse_convolution.mlir",
             "config_tile_and_fuse_gfx1201.mlir",
             "config_tile_and_fuse_gfx950.mlir",
             "config_tile_and_fuse_outer_reduction.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
index 7086d09..38171f6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
@@ -21,6 +21,7 @@
     "config_igemm_tile_and_fuse.mlir"
     "config_reduction_transposed_output.mlir"
     "config_tile_and_fuse.mlir"
+    "config_tile_and_fuse_convolution.mlir"
     "config_tile_and_fuse_gfx1201.mlir"
     "config_tile_and_fuse_gfx950.mlir"
     "config_tile_and_fuse_outer_reduction.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_convolution.mlir
new file mode 100644
index 0000000..e3b1301
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_convolution.mlir
@@ -0,0 +1,28 @@
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx1100 \
+// RUN:   --iree-codegen-llvmgpu-use-igemm=false --iree-codegen-llvmgpu-use-direct-convolution=false \
+// RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
+
+#input_map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2 * 2 + d5, d3 * 2 + d6)>  // Input access: d2/d3 are scaled by 2 and offset by d5/d6.
+#filter_map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d4, d5, d6)>  // Filter access.
+#output_map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>  // Output access: only d0..d3 appear.
+
+func.func @generic_dynamic_nchw_fchw_conv(  // Convolution expressed as a plain linalg.generic.
+    %input: tensor<?x3x230x230xf32>, %filter: tensor<64x3x7x7xf32>,  // Leading input dimension is dynamic.
+    %init: tensor<?x64x112x112xf32>) -> tensor<?x64x112x112xf32> {
+  // CHECK-LABEL: func.func @generic_dynamic_nchw_fchw_conv
+  // CHECK:       #iree_codegen.translation_info<pipeline = #iree_gpu.pipeline<TileAndFuse>
+  // CHECK:       linalg.generic
+  // CHECK-NOT:   promote_operands
+  // CHECK:       ^bb0
+  %result = linalg.generic {  // The directives above require that no promote_operands appears between this op and its body block.
+      indexing_maps = [#input_map, #filter_map, #output_map],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]}  // d0..d3 parallel, d4..d6 reduction.
+      ins(%input, %filter : tensor<?x3x230x230xf32>, tensor<64x3x7x7xf32>)
+      outs(%init : tensor<?x64x112x112xf32>) {
+  ^bb0(%in: f32, %weight: f32, %out: f32):
+    %product = arith.mulf %in, %weight : f32  // Multiply input element by filter element.
+    %sum = arith.addf %out, %product : f32  // Accumulate into the output element.
+    linalg.yield %sum : f32
+  } -> tensor<?x64x112x112xf32>
+  return %result : tensor<?x64x112x112xf32>
+}
diff --git a/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna3.json b/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna3.json
index f57f6f3..ce93a0b 100644
--- a/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna3.json
+++ b/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna3.json
@@ -19,8 +19,8 @@
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[mnist/model/mnist-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[mobilenet/model/mobilenetv2-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[rcnn_ilsvrc13/model/rcnn-ilsvrc13-9.onnx]": "pass",
-    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v1-12.onnx]": "fail-run",
-    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v2-7.onnx]": "fail-run",
+    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v1-12.onnx]": "pass",
+    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v2-7.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[shufflenet/model/shufflenet-9.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[shufflenet/model/shufflenet-v2-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/object_detection_segmentation_models_test.py::test_models[tiny-yolov2/model/tinyyolov2-8.onnx]": "pass",
diff --git a/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna4.json b/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna4.json
index db32204..982b771 100644
--- a/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna4.json
+++ b/tests/external/iree-test-suites/onnx_models/onnx_models_gpu_hip_rdna4.json
@@ -19,8 +19,8 @@
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[mnist/model/mnist-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[mobilenet/model/mobilenetv2-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[rcnn_ilsvrc13/model/rcnn-ilsvrc13-9.onnx]": "pass",
-    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v1-12.onnx]": "fail-run",
-    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v2-7.onnx]": "fail-run",
+    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v1-12.onnx]": "pass",
+    "tests/model_zoo/validated/vision/classification_models_test.py::test_models[resnet/model/resnet50-v2-7.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[shufflenet/model/shufflenet-9.onnx]": "pass",
     "tests/model_zoo/validated/vision/classification_models_test.py::test_models[shufflenet/model/shufflenet-v2-12.onnx]": "pass",
     "tests/model_zoo/validated/vision/object_detection_segmentation_models_test.py::test_models[tiny-yolov2/model/tinyyolov2-8.onnx]": "pass",