[Flow] Improve dispatch name categorization around broadcast/transpose (#17890)
The dispatch names are largely to tell us
1) What kind of computation it is and
2) What did fusion come up with
This patch changes the way that broadcast and transpose are labeled to
reflect what we want to know about each dispatch. Essentially, it tries
to categorize dispatches as follows:
Elementwise: Dispatches that are pure elementwise (identity) maps with
potentially some minor transposed/broadcasted operands. This indicates
that the core memory bound operands are pure elementwise.
Transpose: Same as elementwise except either the input or output maps
are permuted. This indicates that there is data movement happening.
Broadcast: Cases where the input maps are all strict projections of the
output maps. This should only ever appear if something in fusion went
off the rails.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
index aef1bf5..e71f856 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -227,13 +227,50 @@
// Categorize linalg.generic ops better. The following checks more specific
// cases before more general ones.
if (prefix.empty() && isa<linalg::GenericOp>(op)) {
- if (llvm::all_of(op.getIndexingMapsArray(),
- [](AffineMap m) { return m.isIdentity(); })) {
+ SmallVector<AffineMap> indexingMaps = op.getIndexingMapsArray();
+ ArrayRef<AffineMap> inputMaps(indexingMaps.begin(),
+ indexingMaps.begin() + op.getNumDpsInputs());
+ ArrayRef<AffineMap> outputMaps(indexingMaps.begin() + op.getNumDpsInputs(),
+ indexingMaps.end());
+ bool isIdentityOuts =
+ llvm::all_of(outputMaps, [](AffineMap m) { return m.isIdentity(); });
+ bool isPermutationOuts =
+ llvm::all_of(outputMaps, [](AffineMap m) { return m.isPermutation(); });
+ bool isProjectedPermIns = llvm::all_of(
+ inputMaps, [](AffineMap m) { return m.isProjectedPermutation(true); });
+ int64_t numIdentityIn =
+ llvm::count_if(inputMaps, [](AffineMap m) { return m.isIdentity(); });
+ int64_t numPermutationIn = llvm::count_if(
+ inputMaps, [](AffineMap m) { return m.isPermutation(); });
+ // We categorize elementwise operations as follows:
+ // 1. All output maps are identity with the iteration space.
+ // 2. There is at least one input with an identity indexing map.
+  // 3. Any input map that is a full permutation of the iteration space is
+  //    the identity (i.e. no transposed full-rank inputs).
+ //
+ // This categorization tells us that the dispatch includes limited or no
+ // non-trivial data movement.
+ bool hasIdentityInputRoot =
+ numIdentityIn > 0 && numIdentityIn == numPermutationIn;
+ if (isIdentityOuts && isProjectedPermIns && hasIdentityInputRoot) {
prefix = "elementwise";
- } else if (llvm::all_of(op.getIndexingMapsArray(),
- [](AffineMap m) { return m.isMinorIdentity(); })) {
- // We have checked that this is not pure elementwise in the above.
- prefix = "broadcast";
+ // We draw a distinction between pure elementwise operations and
+ // elementwise operations that include a transpose. To separate
+ // transposes, there are two cases:
+ // 1. 2) and 3) hold for elementwise, but the output maps are instead
+ // permutations.
+ // 2. The output maps are permutations or identity, and the most major
+ // input indexing map is a permutation.
+ } else if (isPermutationOuts && isProjectedPermIns &&
+ ((hasIdentityInputRoot && !isIdentityOuts) ||
+ numPermutationIn > numIdentityIn)) {
+ prefix = "elementwise_transpose";
+ // Broadcasts are an indication that fusion went off the rails. We treat
+ // anything where all output maps are permutations, but the inputs are all
+ // projected permutations (without full rank) as a broadcast, which could
+ // potentially be fused with other elementwise operations/transposes.
+ } else if (isPermutationOuts && isProjectedPermIns &&
+ numPermutationIn == 0) {
+ prefix = "elementwise_broadcast";
} else if (isMatvecLike(op)) {
prefix = "matvec_like";
} else if (isMatmulLike(op)) {
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
index 87f113a..70b3576 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
@@ -211,7 +211,7 @@
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
flow.executable private @ex {
- // CHECK: flow.executable.export public @ex_unpack_broadcast_384x512_f32_pack
+ // CHECK: flow.executable.export public @ex_unpack_elementwise_384x512_f32_pack
flow.executable.export public @ex
builtin.module {
func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>) {
@@ -509,3 +509,162 @@
}
}
}
+
+// -----
+
+// Test transposing elementwise operation.
+
+#map = affine_map<(d0, d1) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d1, d0)>
+#map2 = affine_map<(d0, d1) -> (d0, d1)>
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @ex_elementwise_transpose_7x5_f32
+ flow.executable.export public @ex
+ builtin.module {
+ func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5x7xf32>>,
+ %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
+ %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
+ %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x7xf32>> -> tensor<5x7xf32>
+ %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
+ %2 = tensor.empty() : tensor<7x5xf32>
+ %3 = linalg.generic {
+ indexing_maps = [#map, #map1, #map2],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%1, %0 : tensor<7xf32>, tensor<5x7xf32>) outs(%2 : tensor<7x5xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %5 = arith.addf %in, %in_0 : f32
+ linalg.yield %5 : f32
+ } -> tensor<7x5xf32>
+ flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+ return
+ }
+ }
+}
+
+// -----
+
+// Same as the above, but with the transpose map represented on the output.
+
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#map2 = affine_map<(d0, d1) -> (d1, d0)>
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @ex_elementwise_transpose_5x7_f32
+ flow.executable.export public @ex
+ builtin.module {
+ func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5x7xf32>>,
+ %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
+ %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
+ %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x7xf32>> -> tensor<5x7xf32>
+ %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
+ %2 = tensor.empty() : tensor<7x5xf32>
+ %3 = linalg.generic {
+ indexing_maps = [#map, #map1, #map2],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%1, %0 : tensor<7xf32>, tensor<5x7xf32>) outs(%2 : tensor<7x5xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %5 = arith.addf %in, %in_0 : f32
+ linalg.yield %5 : f32
+ } -> tensor<7x5xf32>
+ flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+ return
+ }
+ }
+}
+
+// -----
+
+// Test marking a strictly broadcasting elementwise operation as a broadcast.
+
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+flow.executable private @ex {
+  // CHECK: flow.executable.export public @ex_elementwise_broadcast_7x5_f32
+  flow.executable.export public @ex
+  builtin.module {
+    func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<5xf32>>,
+                  %arg1: !flow.dispatch.tensor<readonly:tensor<5xf32>>,
+                  %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
+      %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32>
+      %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32>
+      %2 = tensor.empty() : tensor<7x5xf32>
+      %3 = linalg.generic {
+        indexing_maps = [#map, #map, #map1],
+        iterator_types = ["parallel", "parallel"]
+      } ins(%1, %0 : tensor<5xf32>, tensor<5xf32>) outs(%2 : tensor<7x5xf32>) {
+      ^bb0(%in: f32, %in_0: f32, %out: f32):
+        %5 = arith.addf %in, %in_0 : f32
+        linalg.yield %5 : f32
+      } -> tensor<7x5xf32>
+      flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+      return
+    }
+  }
+}
+
+// -----
+
+// Test a pure elementwise operation with a broadcasted operand.
+
+#map = affine_map<(d0, d1) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @ex_elementwise_7x5_f32
+ flow.executable.export public @ex
+ builtin.module {
+ func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<7x5xf32>>,
+ %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
+ %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>,
+ %arg3: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>) {
+ %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x5xf32>> -> tensor<7x5xf32>
+ %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
+ %2 = tensor.empty() : tensor<7x5xf32>
+ %3:2 = linalg.generic {
+ indexing_maps = [#map, #map1, #map1, #map1],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%1, %0 : tensor<7xf32>, tensor<7x5xf32>) outs(%2, %2 : tensor<7x5xf32>, tensor<7x5xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32, %out_0: f32):
+ %4 = arith.mulf %in, %in_0 : f32
+ %5 = arith.addf %in, %in_0 : f32
+ linalg.yield %4, %5 : f32, f32
+ } -> (tensor<7x5xf32>, tensor<7x5xf32>)
+ flow.dispatch.tensor.store %3#0, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+ flow.dispatch.tensor.store %3#1, %arg3, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+ return
+ }
+ }
+}
+
+// -----
+
+// Test a multi-result elementwise operation where one result is transposed.
+
+#map = affine_map<(d0, d1) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#map2 = affine_map<(d0, d1) -> (d1, d0)>
+flow.executable private @ex {
+ // CHECK: flow.executable.export public @ex_elementwise_transpose_7x5_f32
+ flow.executable.export public @ex
+ builtin.module {
+ func.func @ex(%arg0: !flow.dispatch.tensor<readonly:tensor<7x5xf32>>,
+ %arg1: !flow.dispatch.tensor<readonly:tensor<7xf32>>,
+ %arg2: !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>,
+ %arg3: !flow.dispatch.tensor<writeonly:tensor<5x7xf32>>) {
+ %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x5xf32>> -> tensor<7x5xf32>
+ %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<7xf32>> -> tensor<7xf32>
+ %2 = tensor.empty() : tensor<7x5xf32>
+ %3 = tensor.empty() : tensor<5x7xf32>
+ %4:2 = linalg.generic {
+ indexing_maps = [#map, #map1, #map1, #map2],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%1, %0 : tensor<7xf32>, tensor<7x5xf32>) outs(%2, %3 : tensor<7x5xf32>, tensor<5x7xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32, %out_0: f32):
+ %5 = arith.addf %in, %in_0 : f32
+ linalg.yield %5, %5 : f32, f32
+ } -> (tensor<7x5xf32>, tensor<5x7xf32>)
+ flow.dispatch.tensor.store %4#0, %arg2, offsets = [0, 0], sizes = [7, 5], strides = [1, 1] : tensor<7x5xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x5xf32>>
+ flow.dispatch.tensor.store %4#1, %arg3, offsets = [0, 0], sizes = [5, 7], strides = [1, 1] : tensor<5x7xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x7xf32>>
+ return
+ }
+ }
+}