Categorize dispatch names better for linalg.generic cases (#16677)
This could be quite useful for viewing trace captures.
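
When a dispatch's key op is a plain `linalg.generic` that no earlier heuristic claimed, the name prefix is now derived from the op's structure: all-identity indexing maps give `elementwise`, all minor-identity maps (but not all identity) give `broadcast`, and ops matching the contraction or convolution interfaces give `contract` or `conv`; anything else keeps falling back to the op name. A rough standalone sketch of the heuristic is below (the helper name and includes are illustrative, not the exact code in the pass; the real change is in the diff that follows):

```c++
// Illustrative sketch only; the actual change lives in AnnotateDispatches.cpp.
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/IR/AffineMap.h"
#include "llvm/ADT/STLExtras.h"

// Pick a human-readable prefix for a linalg.generic based on its structure.
static llvm::StringRef classifyGenericOp(mlir::linalg::LinalgOp op) {
  auto maps = op.getIndexingMapsArray();
  // All-identity maps: every operand is indexed the same way -> elementwise.
  if (llvm::all_of(maps, [](mlir::AffineMap m) { return m.isIdentity(); }))
    return "elementwise";
  // Minor-identity maps (but not all identity): some operands are broadcast.
  if (llvm::all_of(maps, [](mlir::AffineMap m) { return m.isMinorIdentity(); }))
    return "broadcast";
  // Matmul-like and convolution-like generics get their own buckets.
  if (mlir::linalg::isaContractionOpInterface(op))
    return "contract";
  if (mlir::linalg::isaConvolutionOpInterface(op))
    return "conv";
  // Otherwise keep the existing fallback to the op name ("generic").
  return "generic";
}
```

This is why the test expectations below move from `dispatch_generic_*` to `dispatch_elementwise_*`, `dispatch_broadcast_*`, `dispatch_contract_*`, and `dispatch_conv_*`.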
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
index 0e2da3a..bbd9d3e 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -182,6 +182,22 @@
}
}
+  // Categorize linalg.generic ops based on their indexing maps and structure.
+ if (prefix.empty() && isa<linalg::GenericOp>(op)) {
+ if (llvm::all_of(op.getIndexingMapsArray(),
+ [](AffineMap m) { return m.isIdentity(); })) {
+ prefix = "elementwise";
+ } else if (llvm::all_of(op.getIndexingMapsArray(),
+ [](AffineMap m) { return m.isMinorIdentity(); })) {
+      // The check above already rules out the pure elementwise case.
+ prefix = "broadcast";
+ } else if (linalg::isaContractionOpInterface(op)) {
+ prefix = "contract";
+ } else if (linalg::isaConvolutionOpInterface(op)) {
+ prefix = "conv";
+ }
+ }
+
if (prefix.empty()) {
// By default, use the op name as prefix.
auto opName = op->getName().getStringRef();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
index 7ba9196..1936b66 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s
// Dispatches containing some ops get a heuristics-driven summary in their name.
// This also tests symbol reference renaming.
@@ -88,7 +88,7 @@
// Dispatch key op with multiple datatypes should be reflected in summary.
flow.executable private @ex {
- // CHECK: flow.executable.export public @dispatch_generic_4x8_i32xf32
+ // CHECK: flow.executable.export public @dispatch_elementwise_4x8_i32xf32
flow.executable.export public @dispatch
builtin.module {
func.func @dispatch(%arg0: !flow.dispatch.tensor<writeonly:tensor<4x8xf32>>) {
@@ -211,7 +211,7 @@
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
flow.executable private @ex {
- // CHECK: flow.executable.export public @ex_unpack_generic_384x512_f32_pack
+ // CHECK: flow.executable.export public @ex_unpack_broadcast_384x512_f32_pack
flow.executable.export public @ex
builtin.module {
func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>) {
@@ -266,3 +266,80 @@
}
}
}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @dispatch_contract_16x32x8_f32
+ flow.executable.export public @dispatch
+ builtin.module {
+ func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<16x32xf32>>) {
+ %0 = tensor.empty() : tensor<16x8xf32>
+ %1 = tensor.empty() : tensor<8x32xf32>
+ %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x32xf32>> -> tensor<16x32xf32>
+ %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%0, %1 : tensor<16x8xf32>, tensor<8x32xf32>) outs(%init : tensor<16x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %3 = arith.mulf %in, %in_0 : f32
+ %4 = arith.addf %out, %3 : f32
+ linalg.yield %4 : f32
+ } -> tensor<16x32xf32>
+ flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : tensor<16x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x32xf32>>
+ return
+ }
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d5, d2 + d6, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d5, d6, d3, d4)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4)>
+
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @dispatch_conv_2x3x4x2x3x2x2_f32
+ flow.executable.export public @dispatch
+ builtin.module {
+ func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>>) {
+ %0 = tensor.empty() : tensor<2x4x5x2xf32>
+ %1 = tensor.empty() : tensor<2x2x2x3xf32>
+ %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 0], sizes = [2, 3, 4, 2, 3], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>> -> tensor<2x3x4x2x3xf32>
+ %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
+ ins(%0, %1 : tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>) outs(%init : tensor<2x3x4x2x3xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %3 = arith.mulf %in, %in_0 : f32
+ %4 = arith.addf %out, %3 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x3x4x2x3xf32>
+ flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0, 0, 0, 0], sizes = [2, 3, 4, 2, 3], strides = [1, 1, 1, 1, 1] : tensor<2x3x4x2x3xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>>
+ return
+ }
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @dispatch_elementwise_8x16x32_f32
+ flow.executable.export public @dispatch
+ builtin.module {
+ func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>>) {
+ %0 = tensor.empty() : tensor<8x16x32xf32>
+ %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [8, 16, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>> -> tensor<8x16x32xf32>
+ %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]}
+ ins(%0 : tensor<8x16x32xf32>) outs(%init : tensor<8x16x32xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %3 = arith.maximumf %in, %out : f32
+ linalg.yield %3 : f32
+ } -> tensor<8x16x32xf32>
+ flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0, 0], sizes = [8, 16, 32], strides = [1, 1, 1] : tensor<8x16x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>>
+ return
+ }
+ }
+}
diff --git a/tools/test/executable_benchmarks.mlir b/tools/test/executable_benchmarks.mlir
index 48199a0..0e5fc61 100644
--- a/tools/test/executable_benchmarks.mlir
+++ b/tools/test/executable_benchmarks.mlir
@@ -26,7 +26,7 @@
// reduced/simplified. Dynamic shapes, for example, will usually stop a dispatch
// from being benchmarkable without explicit shape arguments.
-// CHECK: BM_abs_dispatch_0_vmvx_bytecode_fb_abs_dispatch_0_generic
+// CHECK: BM_abs_dispatch_0_vmvx_bytecode_fb_abs_dispatch_0_elementwise
func.func @abs(%input : tensor<f32>) -> (tensor<f32>) {
%result = math.absf %input : tensor<f32>
return %result : tensor<f32>
diff --git a/tools/test/executable_configurations.mlir b/tools/test/executable_configurations.mlir
index 98554fc..35235c9 100644
--- a/tools/test/executable_configurations.mlir
+++ b/tools/test/executable_configurations.mlir
@@ -41,4 +41,4 @@
// CHECK: IR Dump Before SerializeExecutablesPass
// CHECK: hal.executable public @abs_dispatch_0
// CHECK: hal.executable.variant public @vmvx_bytecode_fb
-// CHECK: vm.func private @abs_dispatch_0_generic
+// CHECK: vm.func private @abs_dispatch_0_elementwise
diff --git a/tools/test/executable_sources.mlir b/tools/test/executable_sources.mlir
index 30e305e..df5fa94 100644
--- a/tools/test/executable_sources.mlir
+++ b/tools/test/executable_sources.mlir
@@ -39,4 +39,4 @@
// CHECK: IR Dump Before SerializeExecutablesPass
// CHECK: hal.executable public @abs_dispatch_0
// CHECK: hal.executable.variant public @vmvx_bytecode_fb
-// CHECK: vm.func private @abs_dispatch_0_generic
+// CHECK: vm.func private @abs_dispatch_0_elementwise