Categorize dispatch names better for linalg.generic cases (#16677)

This gives common linalg.generic patterns a descriptive prefix instead of the catch-all "generic" op name, which is quite useful for viewing trace captures.
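
The prefix is derived from the generic op's structure: "elementwise"
when all indexing maps are identities, "broadcast" when they are all
minor identities (the pure elementwise case having been ruled out
first), and "contract"/"conv" when the op implements the contraction
or convolution interface. For example, the elementwise dispatch in the
annotation test changes from

  @dispatch_generic_4x8_i32xf32

to

  @dispatch_elementwise_4x8_i32xf32

and the bias broadcast in the unpack test, whose maps
(d0, d1) -> (d1) and (d0, d1) -> (d0, d1) are both minor identities,
changes from @ex_unpack_generic_384x512_f32_pack to
@ex_unpack_broadcast_384x512_f32_pack.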
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
index 0e2da3a..bbd9d3e 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -182,6 +182,22 @@
     }
   }
 
+  // Categorize linalg.generic ops by their indexing maps and structural interfaces.
+  if (prefix.empty() && isa<linalg::GenericOp>(op)) {
+    if (llvm::all_of(op.getIndexingMapsArray(),
+                     [](AffineMap m) { return m.isIdentity(); })) {
+      prefix = "elementwise";
+    } else if (llvm::all_of(op.getIndexingMapsArray(),
+                            [](AffineMap m) { return m.isMinorIdentity(); })) {
+      // The pure elementwise case was ruled out above, so this op broadcasts at least one operand.
+      prefix = "broadcast";
+    } else if (linalg::isaContractionOpInterface(op)) {
+      prefix = "contract";
+    } else if (linalg::isaConvolutionOpInterface(op)) {
+      prefix = "conv";
+    }
+  }
+
   if (prefix.empty()) {
     // By default, use the op name as prefix.
     auto opName = op->getName().getStringRef();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
index 7ba9196..1936b66 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s
+// RUN: iree-opt --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s
 
 // Dispatches containing some ops get a heuristics-driven summary in their name.
 // This also tests symbol reference renaming.
@@ -88,7 +88,7 @@
 // Dispatch key op with multiple datatypes should be reflected in summary.
 
 flow.executable private @ex {
-  // CHECK: flow.executable.export public @dispatch_generic_4x8_i32xf32
+  // CHECK: flow.executable.export public @dispatch_elementwise_4x8_i32xf32
   flow.executable.export public @dispatch
   builtin.module {
     func.func @dispatch(%arg0: !flow.dispatch.tensor<writeonly:tensor<4x8xf32>>) {
@@ -211,7 +211,7 @@
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 flow.executable private @ex {
-  // CHECK: flow.executable.export public @ex_unpack_generic_384x512_f32_pack
+  // CHECK: flow.executable.export public @ex_unpack_broadcast_384x512_f32_pack
   flow.executable.export public @ex
   builtin.module {
     func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>) {
@@ -266,3 +266,80 @@
     }
   }
 }
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+flow.executable private @ex {
+  // CHECK: flow.executable.export public @dispatch_contract_16x32x8_f32
+  flow.executable.export public @dispatch
+  builtin.module {
+    func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<16x32xf32>>) {
+      %0 = tensor.empty() : tensor<16x8xf32>
+      %1 = tensor.empty() : tensor<8x32xf32>
+      %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x32xf32>> -> tensor<16x32xf32>
+      %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+              ins(%0, %1 : tensor<16x8xf32>, tensor<8x32xf32>) outs(%init : tensor<16x32xf32>) {
+      ^bb0(%in: f32, %in_0: f32, %out: f32):
+        %3 = arith.mulf %in, %in_0 : f32
+        %4 = arith.addf %out, %3 : f32
+        linalg.yield %4 : f32
+      } -> tensor<16x32xf32>
+      flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0], sizes = [16, 32], strides = [1, 1] : tensor<16x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x32xf32>>
+      return
+    }
+  }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d5, d2 + d6, d3)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d5, d6, d3, d4)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4)>
+
+flow.executable private @ex {
+  // CHECK: flow.executable.export public @dispatch_conv_2x3x4x2x3x2x2_f32
+  flow.executable.export public @dispatch
+  builtin.module {
+    func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>>) {
+      %0 = tensor.empty() : tensor<2x4x5x2xf32>
+      %1 = tensor.empty() : tensor<2x2x2x3xf32>
+      %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 0], sizes = [2, 3, 4, 2, 3], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>> -> tensor<2x3x4x2x3xf32>
+      %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
+              ins(%0, %1 : tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>) outs(%init : tensor<2x3x4x2x3xf32>) {
+      ^bb0(%in: f32, %in_0: f32, %out: f32):
+        %3 = arith.mulf %in, %in_0 : f32
+        %4 = arith.addf %out, %3 : f32
+        linalg.yield %4 : f32
+      } -> tensor<2x3x4x2x3xf32>
+      flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0, 0, 0, 0], sizes = [2, 3, 4, 2, 3], strides = [1, 1, 1, 1, 1] : tensor<2x3x4x2x3xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x3x4x2x3xf32>>
+      return
+    }
+  }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+flow.executable private @ex {
+  // CHECK: flow.executable.export public @dispatch_elementwise_8x16x32_f32
+  flow.executable.export public @dispatch
+  builtin.module {
+    func.func @dispatch(%arg0: !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>>) {
+      %0 = tensor.empty() : tensor<8x16x32xf32>
+      %init = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [8, 16, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>> -> tensor<8x16x32xf32>
+      %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]}
+              ins(%0 : tensor<8x16x32xf32>) outs(%init : tensor<8x16x32xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %3 = arith.maximumf %in, %out : f32
+        linalg.yield %3 : f32
+      } -> tensor<8x16x32xf32>
+      flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0, 0], sizes = [8, 16, 32], strides = [1, 1, 1] : tensor<8x16x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<8x16x32xf32>>
+      return
+    }
+  }
+}
diff --git a/tools/test/executable_benchmarks.mlir b/tools/test/executable_benchmarks.mlir
index 48199a0..0e5fc61 100644
--- a/tools/test/executable_benchmarks.mlir
+++ b/tools/test/executable_benchmarks.mlir
@@ -26,7 +26,7 @@
 // reduced/simplified. Dynamic shapes, for example, will usually stop a dispatch
 // from being benchmarkable without explicit shape arguments.
 
-// CHECK: BM_abs_dispatch_0_vmvx_bytecode_fb_abs_dispatch_0_generic
+// CHECK: BM_abs_dispatch_0_vmvx_bytecode_fb_abs_dispatch_0_elementwise
 func.func @abs(%input : tensor<f32>) -> (tensor<f32>) {
   %result = math.absf %input : tensor<f32>
   return %result : tensor<f32>
diff --git a/tools/test/executable_configurations.mlir b/tools/test/executable_configurations.mlir
index 98554fc..35235c9 100644
--- a/tools/test/executable_configurations.mlir
+++ b/tools/test/executable_configurations.mlir
@@ -41,4 +41,4 @@
 // CHECK: IR Dump Before SerializeExecutablesPass
 // CHECK: hal.executable public @abs_dispatch_0
 // CHECK:   hal.executable.variant public @vmvx_bytecode_fb
-// CHECK:     vm.func private @abs_dispatch_0_generic
+// CHECK:     vm.func private @abs_dispatch_0_elementwise
diff --git a/tools/test/executable_sources.mlir b/tools/test/executable_sources.mlir
index 30e305e..df5fa94 100644
--- a/tools/test/executable_sources.mlir
+++ b/tools/test/executable_sources.mlir
@@ -39,4 +39,4 @@
 // CHECK: IR Dump Before SerializeExecutablesPass
 // CHECK: hal.executable public @abs_dispatch_0
 // CHECK:   hal.executable.variant public @vmvx_bytecode_fb
-// CHECK:     vm.func private @abs_dispatch_0_generic
+// CHECK:     vm.func private @abs_dispatch_0_elementwise