[Codegen][GPU] Make operand promotion pattern work with generics (#17650)
The pattern previously relied on the `isMatmulOrBatchMatmul` helper, which
only matches named ops. Change the logic to infer the contraction dimensions
of the op and inspect its static loop bounds so that matvec-like cases are
filtered out.
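For illustration only (this snippet is not part of the patch), a matvec written
as a `linalg.generic` should still be skipped by the new check, since
`inferContractionDims` reports no `n` dimension for it:

  #map_a = affine_map<(d0, d1) -> (d0, d1)>
  #map_x = affine_map<(d0, d1) -> (d1)>
  #map_y = affine_map<(d0, d1) -> (d0)>
  func.func @generic_matvec(%a: tensor<32x1024xf32>, %x: tensor<1024xf32>) -> tensor<32xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %empty = tensor.empty() : tensor<32xf32>
    %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32xf32>) -> tensor<32xf32>
    %mv = linalg.generic {indexing_maps = [#map_a, #map_x, #map_y], iterator_types = ["parallel", "reduction"]}
      ins(%a, %x : tensor<32x1024xf32>, tensor<1024xf32>) outs(%fill : tensor<32xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %0 = arith.mulf %in, %in_0 : f32
      %1 = arith.addf %out, %0 : f32
      linalg.yield %1 : f32
    } -> tensor<32xf32>
    return %mv : tensor<32xf32>
  }

A degenerate matmul whose static m or n loop bounds multiply out to 1 is
rejected the same way by the element-count check in the helper below.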
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index cf16ccf..5024d9c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -64,6 +64,34 @@
op->setOperand(index, copy.getResult(0));
}
+bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
+ SmallVector<int64_t, 4> bounds = linalgOp.getStaticLoopRanges();
+ FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
+ mlir::linalg::inferContractionDims(linalgOp);
+ if (failed(contractionDims)) {
+ return false;
+ }
+
+ if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
+ contractionDims->n.size() < 1) {
+ return false;
+ }
+
+ auto getElementCount = [&](ArrayRef<unsigned> dims) {
+ int64_t acc = 1;
+ for (auto mDim : dims) {
+ int64_t size = bounds[mDim];
+ if (ShapedType::isDynamic(size)) {
+ return size;
+ }
+ acc *= size;
+ }
+ return acc;
+ };
+ return getElementCount(contractionDims->m) != 1 &&
+ getElementCount(contractionDims->n) != 1;
+}
+
struct GPUPromoteMatmulOperandsPass final
: impl::GPUPromoteMatmulOperandsPassBase<GPUPromoteMatmulOperandsPass> {
void runOnOperation() override {
@@ -71,7 +99,7 @@
OpBuilder builder(funcOp);
funcOp.walk([&](linalg::LinalgOp linalgOp) {
- if (!isMatmulOrBatchMatmul(linalgOp)) {
+ if (!isNonMatvecContraction(linalgOp)) {
return;
}
builder.setInsertionPoint(linalgOp);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index 3150e7d..2873140 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -29,3 +29,29 @@
// CHECK-LABEL: func.func @matvec
// CHECK-NOT: linalg.copy
// CHECK: return
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @generic_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %empty = tensor.empty() : tensor<32x128xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32>
+ %mm = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %7 = arith.mulf %in, %in_0 : f32
+ %8 = arith.addf %out, %7 : f32
+ linalg.yield %8 : f32
+ } -> tensor<32x128xf32>
+ return %mm : tensor<32x128xf32>
+}
+
+// CHECK-LABEL: func.func @generic_matmul
+// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<32x1024xf32>
+// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
+// CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
+// CHECK-DAG: %[[PB:.+]] = linalg.copy {{.*}} ins(%[[B]] : tensor<1024x128xf32>)
+// CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)