[Codegen] Add support for vectorizing tensor.unpack ops with masking. (#16664)

The revision implements shape inference for unpack ops and adds the op to
the analysis list. It handles both a single tiled unpack op and fusion
cases.
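
For example, with inner_tiles = [16, 16], inner_dims_pos = [0, 1], and a
source whose bounded shape is 1x2x16x16 (as in the tests below), the
inferred destination and vector sizes are [1*16, 2*16] = [16, 32].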
diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
index 631418b..4e49c80 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -148,6 +148,53 @@
   return result;
 }
 
+/// Returns the result sizes and vector input sizes of the tensor.unpack op.
+/// For dynamic shapes, the inferred bounding sizes are returned. Returns
+/// std::nullopt if shape inference fails.
+static std::optional<VectorizationTileSizes>
+inferSizesFromIR(tensor::UnPackOp op) {
+  LLVM_DEBUG(VEC_DBGS() << "Inferring dest sizes for:\n" << op << "\n");
+
+  if (llvm::any_of(op.getInnerTiles(), [](OpFoldResult v) {
+        return !getConstantIntValue(v).has_value();
+      })) {
+    LLVM_DEBUG(VEC_DBGS()
+               << "failed to infer sizes: inner_tiles are not all constant\n");
+    return std::nullopt;
+  }
+
+  VectorizationTileSizes result;
+  std::optional<VectorizationTileSizes> inferred =
+      inferSizesFromIR(op.getSource());
+  if (!inferred) {
+    return std::nullopt;
+  }
+  result.vectorSizes = inferred.value().destShape;
+
+  result.vectorSizes.resize(op.getDestType().getRank());
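+  // At this point vectorSizes holds the source's outer dims, which follow
+  // outer_dims_perm order; invert the permutation to recover the
+  // destination dim order.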
+  auto outerDimsPerm = op.getOuterDimsPerm();
+  if (!outerDimsPerm.empty()) {
+    applyPermutationToVector(result.vectorSizes,
+                             invertPermutationVector(outerDimsPerm));
+  }
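+  // Scale each tiled destination dim by its inner tile size; the tiles are
+  // all constant (checked above).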
+  for (auto [dimPos, tileSize] :
+       llvm::zip_equal(op.getInnerDimsPos(), op.getStaticInnerTiles())) {
+    result.vectorSizes[dimPos] *= tileSize;
+  }
+
+  LLVM_DEBUG({
+    VEC_DBGS() << "After adjustment with inner tiles and "
+                  "outer_dims_perm:\n";
+    for (auto [idx, val] : llvm::enumerate(result.vectorSizes)) {
+      llvm::dbgs() << "Dim #" << idx << ": " << val << "\n";
+    }
+  });
+  result.destShape = result.vectorSizes;
+
+  return result;
+}
+
 /// See the documentation in the above function declaration.
 static std::optional<VectorizationTileSizes> inferSizesFromIR(Value val) {
   std::optional<VectorizationTileSizes> result;
@@ -204,8 +251,8 @@
           vectorSizes = result->vectorSizes;
         }
       })
-      .Case<tensor::PackOp>([&](tensor::PackOp packOp) {
-        std::optional<VectorizationTileSizes> result = inferSizesFromIR(packOp);
+      .Case<tensor::PackOp, tensor::UnPackOp>([&](auto op) {
+        std::optional<VectorizationTileSizes> result = inferSizesFromIR(op);
         if (result) {
           vectorSizes = result->vectorSizes;
         }
@@ -219,6 +266,7 @@
         vectorSizes = SmallVector<int64_t>(ty.getShape());
       })
       .Default([&](Operation *) {});
+
   if (vectorSizes) {
     // This can't identify scalable flags, so pad them with `false`.
     return std::make_pair(vectorSizes.value(),
@@ -275,7 +323,8 @@
     } else if (vectorizePadding && enableVectorMasking &&
                isa<tensor::PadOp>(op)) {
       candidates.push_back(op);
-    } else if (enableVectorMasking && isa<tensor::PackOp>(op)) {
+    } else if (enableVectorMasking &&
+               isa<tensor::PackOp, tensor::UnPackOp>(op)) {
       candidates.push_back(op);
     }
   });
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
index 704a291..6943348 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -245,3 +245,124 @@
 // CHECK-MASK:           %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ1]], %[[DEST_SZ2]]) : tensor<2x?x?x16x2xbf16>
 // CHECK-MASK:           %[[PACK_WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[DEST_SZ1]], %[[DEST_SZ2]], %[[C16]], %[[C2]] : vector<2x4x6x16x2xi1>
 // CHECK-MASK:           vector.transfer_write %[[PACK_TRANSP]], %[[EMPTY]]{{.+}}, %[[PACK_WRITE_MASK]]
+
+// -----
+
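+// Checks that a single tiled tensor.unpack is vectorized with masked
+// transfers; the vector sizes ([16, 32]) are inferred from the tiling loop
+// bounds and the constant inner tiles.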
+#map = affine_map<(d0)[s0] -> (16, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (32, -d0 + s0)>
+#map2 = affine_map<(d0) -> (d0 floordiv 16)>
+#map3 = affine_map<(d0) -> (d0 ceildiv 16)>
+func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %dim = tensor.dim %arg1, %c0 : tensor<?x?xf32>
+  %c1 = arith.constant 1 : index
+  %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %c0_1 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %0 = scf.for %arg2 = %c0_1 to %dim step %c16 iter_args(%arg3 = %arg1) -> (tensor<?x?xf32>) {
+    %c0_2 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %1 = scf.for %arg4 = %c0_2 to %dim_0 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
+      %2 = affine.min #map(%arg2)[%dim]
+      %3 = affine.min #map1(%arg4)[%dim_0]
+      %4 = affine.apply #map2(%arg2)
+      %5 = affine.apply #map2(%arg4)
+      %6 = affine.apply #map3(%3)
+      %extracted_slice = tensor.extract_slice %arg0[%4, %5, 0, 0] [1, %6, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x?x16x16xf32>
+      %extracted_slice_3 = tensor.extract_slice %arg5[%arg2, %arg4] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+      %unpack = tensor.unpack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %extracted_slice_3 : tensor<1x?x16x16xf32> -> tensor<?x?xf32>
+      %inserted_slice = tensor.insert_slice %unpack into %arg5[%arg2, %arg4] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+      scf.yield %inserted_slice : tensor<?x?xf32>
+    }
+    scf.yield %1 : tensor<?x?xf32>
+  }
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
+// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
+// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
+// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
+// CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size
+// CHECK-MASK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-MASK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-MASK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-MASK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-MASK:         scf.for
+// CHECK-MASK:         scf.for
+// CHECK-MASK-DAG:       %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
+// CHECK-MASK-DAG:       %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
+// CHECK-MASK-DAG:       %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
+// CHECK-MASK:           %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
+// CHECK-MASK:           %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
+// CHECK-MASK:           %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
+// CHECK-MASK:           %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
+// CHECK-MASK:           %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
+// CHECK-MASK:           %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
+// CHECK-MASK:           %[[WRITE_MASK:.+]] = vector.create_mask %[[DEST_SZ0]], %[[DEST_SZ1]] : vector<16x32xi1>
+// CHECK-MASK:           vector.transfer_write %[[SHAPE_CAST]], {{.+}}, %[[WRITE_MASK]]
+
+// -----
+
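+// Checks the fusion case: the vector sizes inferred for the unpack op
+// propagate to the fused linalg.generic consumer, which reads the unpack
+// result with the same mask shape.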
+#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
+#map2 = affine_map<(d0) -> (d0 floordiv 16)>
+#map3 = affine_map<(d0) -> (d0 ceildiv 16)>
+#map4 = affine_map<(d0, d1) -> (d0, d1)>
+func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c32 = arith.constant 32 : index
+  %c16 = arith.constant 16 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %dim = tensor.dim %arg1, %c0 : tensor<?x?xf32>
+  %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+    %1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+      %2 = affine.min #map(%arg3)[%dim]
+      %3 = affine.min #map1(%arg5)[%dim_0]
+      %4 = affine.apply #map2(%arg3)
+      %5 = affine.apply #map2(%arg5)
+      %6 = affine.apply #map3(%3)
+      %extracted_slice = tensor.extract_slice %arg0[%4, %5, 0, 0] [1, %6, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x?x16x16xf32>
+      %extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+      %unpack = tensor.unpack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %extracted_slice_1 : tensor<1x?x16x16xf32> -> tensor<?x?xf32>
+      %extracted_slice_2 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+      %7 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<?x?xf32>) outs(%extracted_slice_2 : tensor<?x?xf32>) {
+      ^bb0(%in: f32, %out: f32):
+        %8 = math.exp %in : f32
+        linalg.yield %8 : f32
+      } -> tensor<?x?xf32>
+      %inserted_slice = tensor.insert_slice %7 into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+      scf.yield %inserted_slice : tensor<?x?xf32>
+    }
+    scf.yield %1 : tensor<?x?xf32>
+  }
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
+// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
+// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
+// CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size
+// CHECK-MASK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-MASK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-MASK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-MASK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-MASK:         scf.for
+// CHECK-MASK:         scf.for
+// CHECK-MASK-DAG:       %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
+// CHECK-MASK-DAG:       %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
+// CHECK-MASK-DAG:       %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
+// CHECK-MASK:           %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
+// CHECK-MASK:           %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
+// CHECK-MASK:           %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
+// CHECK-MASK:           %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
+// CHECK-MASK:           %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
+// CHECK-MASK:           %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
+// CHECK-MASK:           %[[WRITE_MASK:.+]] = vector.create_mask %[[DEST_SZ0]], %[[DEST_SZ1]] : vector<16x32xi1>
+// CHECK-MASK:           %[[UNPACK_WRITE:.+]] = vector.transfer_write %[[SHAPE_CAST]], {{.+}}, %[[WRITE_MASK]]
+// CHECK-MASK:           %[[D0:.+]] = tensor.dim %[[UNPACK_WRITE]], %[[C0]]
+// CHECK-MASK:           %[[D1:.+]] = tensor.dim %[[UNPACK_WRITE]], %[[C1]]
+// CHECK-MASK:           %[[GENERIC_MASK:.+]] = vector.create_mask %[[D0]], %[[D1]] : vector<16x32xi1>
+// CHECK-MASK:           %[[GENERIC_SRC:.+]] = vector.transfer_read %[[UNPACK_WRITE]]{{.+}}, %[[GENERIC_MASK]]
+// CHECK-MASK:           %[[EXP:.+]] = math.exp %[[GENERIC_SRC]]
+// CHECK-MASK:           vector.transfer_write %[[EXP]]{{.+}}, %[[GENERIC_MASK]]