[GPU] Do not treat pad as a tilable producer for operand promotion (#18918)

PadOp has no implementation for deriving a thread configuration from
derived_thread_config, so skip marking it for promotion until one is
added. Padded operands are still promoted, but through the generic path
that inserts a linalg.copy carrying the derived thread config.
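
A sketch of the resulting IR (value names, the elided pad body, and the
surrounding ops are illustrative, not actual pass output; the attribute
spelling follows the test below):

  %padded = tensor.pad %a low[0, 0] high[0, 1] { ... }
      : tensor<4x127xf32> to tensor<4x128xf32>
  // No lowering config on the pad itself; the inserted copy carries the
  // derived thread config instead.
  %promoted = linalg.copy {lowering_config = #iree_gpu.derived_thread_config}
      ins(%padded : tensor<4x128xf32>) outs(%init : tensor<4x128xf32>)
      -> tensor<4x128xf32>
  %mm = linalg.matmul ins(%promoted, %b : tensor<4x128xf32>, tensor<128x128xf32>)
      outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
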
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index dd498fa..5e50a95 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -53,9 +53,15 @@
         return;
       }
     }
-    setLoweringConfig(producer, IREE::GPU::DerivedThreadConfigAttr::get(
-                                    builder.getContext()));
-    return;
+
+    // Thread tile size derivation is only implemented for LinalgOp and
+    // Im2colOp for now.
+    if (isa<linalg::LinalgOp, IREE::LinalgExt::Im2colOp>(
+            producer.getOperation())) {
+      setLoweringConfig(producer, IREE::GPU::DerivedThreadConfigAttr::get(
+                                      builder.getContext()));
+      return;
+    }
   }
 
   auto tensorType = dyn_cast<RankedTensorType>(operand.getType());
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index f05cf7b..643b12c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -82,3 +82,27 @@
 // CHECK-LABEL: func.func @no_promote_fill
 //   CHECK-NOT:   iree_gpu.derived_thread_config
 //       CHECK: return
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}>
+
+func.func @promote_pad(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tensor<4x128xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<4x128xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4x128xf32>) -> tensor<4x128xf32>
+  %padded = tensor.pad %a low[0, 0] high[0, 1] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f32
+  } : tensor<4x127xf32> to tensor<4x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%padded, %b : tensor<4x128xf32>, tensor<128x128xf32>) outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
+  return %mm : tensor<4x128xf32>
+}
+
+// Verify that the padded operand is promoted with a linalg.copy.
+// CHECK-LABEL: func.func @promote_pad
+//       CHECK:   tensor.pad
+//       CHECK:   linalg.copy
+//  CHECK-SAME:     derived_thread_config
+//       CHECK: return