Enable tile+distribute+vectorize for unpack ops. (#11232)

The workgroup tile sizes are set to multiples of the inner tile sizes, so each
tile covers whole inner tiles and no stack buffer allocation is needed to hold
temporary results (a standalone sketch of this fixup follows the test case).

Test case:

```mlir
%result = iree_linalg_ext.unpack %result_4d inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %result_init
    : (tensor<32x32x8x8xi32> tensor<256x256xi32>) -> tensor<256x256xi32>
```

This commit gives a 1.86x improvement (93.63 us -> 49.44 us).
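
As a reference, here is a minimal standalone sketch of the tile-size fixup the
patch adds to `KernelDispatch.cpp` (see the new `setRootConfig` for `UnPackOp`
below). The helper name `alignTileSizes` and the sample values are
hypothetical; the real code uses `llvm::alignDown` and also skips dynamic
inner tile sizes via `ShapedType::isDynamic`.

```cpp
// Sketch only, not the IREE implementation: round each workgroup tile size
// down to a multiple of the matching inner tile size, mirroring the
// llvm::alignDown loop in setRootConfig.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> alignTileSizes(std::vector<int64_t> tileSizes,
                                    const std::vector<int64_t> &dimPos,
                                    const std::vector<int64_t> &innerTiles) {
  for (size_t i = 0; i < dimPos.size(); ++i) {
    int64_t pos = dimPos[i];
    int64_t size = innerTiles[i];
    // A tile size of 0 means "do not tile this dimension".
    if (tileSizes[pos] == 0) continue;
    tileSizes[pos] -= tileSizes[pos] % size; // alignDown(tileSizes[pos], size)
  }
  return tileSizes;
}

int main() {
  // With inner_tiles = [8, 8] as in the test case above, the default
  // workgroup tiles [64, 64] are already aligned, while e.g. [60, 20] would
  // shrink to [56, 16] so every workgroup tile covers whole 8x8 inner tiles.
  assert((alignTileSizes({64, 64}, {0, 1}, {8, 8}) ==
          std::vector<int64_t>{64, 64}));
  assert((alignTileSizes({60, 20}, {0, 1}, {8, 8}) ==
          std::vector<int64_t>{56, 16}));
  return 0;
}
```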
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
index 49b5da3..69b27de 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
@@ -379,6 +379,7 @@
     return tilingResult;
   }
 
+  SmallVector<Operation *> tiledImplementation;
   {
     SmallVector<OpFoldResult> offsets, sizes;
     // If there is an interchange specified, permute the iteration domain and
@@ -458,13 +459,8 @@
     if (!tilingResult.loops.empty())
       rewriter.setInsertionPoint(
           tilingResult.loops.back().getBody()->getTerminator());
-    SmallVector<Operation *> tiledImplementation =
-        op.getTiledImplementation(rewriter, offsets, sizes);
-    if (tiledImplementation.size() != 1) {
-      return rewriter.notifyMatchFailure(
-          op, "expected tiled implementation to return a single op");
-    }
-    tilingResult.tiledOp = tiledImplementation[0];
+    tiledImplementation = op.getTiledImplementation(rewriter, offsets, sizes);
+    tilingResult.tiledOp = tiledImplementation.back();
 
     LLVM_DEBUG({
       if (!tilingResult.loops.empty()) {
@@ -478,7 +474,9 @@
   }
 
   // Update the filter.
-  filter.replaceLinalgTransformationFilter(rewriter, tilingResult.tiledOp);
+  for (auto tiledOp : tiledImplementation) {
+    filter.replaceLinalgTransformationFilter(rewriter, tiledOp);
+  }
 
   if (op->getNumResults() == 0) {
     rewriter.eraseOp(op);
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 63e238e..3740539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -1872,3 +1872,46 @@
 //       CHECK:       %[[SIZE_X:.+]] = affine.min #[[MAP8]](%{{.+}})[%{{.+}}]
 //       CHECK:       flow.dispatch.tensor.store
 //  CHECK-SAME:           sizes = [%[[SIZE_Y]], %[[SIZE_X]], 8, 4]
+
+// -----
+
+hal.executable private @dynamic_unpack {
+  hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", {}> {
+    hal.executable.export public @dynamic_unpack ordinal(0) layout(
+        #hal.pipeline.layout<push_constants = 4, sets = [
+            <0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
+        attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+      %c1 = arith.constant 1 : index
+      %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
+      %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
+      hal.return %1, %0, %c1 : index, index, index
+    }
+    builtin.module {
+      func.func @dynamic_unpack() {
+        %c131072 = arith.constant 131072 : index
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %2 = hal.interface.constant.load[2] : i32
+        %3 = hal.interface.constant.load[3] : i32
+        %4 = arith.index_castui %0 : i32 to index
+        %5 = arith.index_castui %1 : i32 to index
+        %6 = arith.index_castui %2 : i32 to index
+        %7 = arith.index_castui %3 : i32 to index
+        %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
+        %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c131072) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+        %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
+        %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
+        %12 = iree_linalg_ext.unpack {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64]]>}
+          %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : (tensor<?x?x32x16xi32> tensor<?x?xi32>) -> tensor<?x?xi32>
+        flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+        return
+      }
+    }
+  }
+}
+// CHECK-LABEL: func.func @dynamic_unpack
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             iree_linalg_ext.unpack
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
index 4844275..9766fcd 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
@@ -210,7 +210,7 @@
     IREE::LinalgExt::PackOp::attachInterface<
         OuterParallelAsPartitionableLoops<IREE::LinalgExt::PackOp>>(*ctx);
     IREE::LinalgExt::UnPackOp::attachInterface<
-        NoPartitionableLoops<IREE::LinalgExt::UnPackOp>>(*ctx);
+        OuterParallelAsPartitionableLoops<IREE::LinalgExt::UnPackOp>>(*ctx);
     IREE::LinalgExt::ScanOp::attachInterface<
         AllParallelAsPartitionableLoops<IREE::LinalgExt::ScanOp>>(*ctx);
     IREE::LinalgExt::ScatterOp::attachInterface<
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index c4927de..7a529b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1047,6 +1047,7 @@
       workgroupTileSizes[dim] = 0;
     }
   }
+
   return workgroupTileSizes;
 }
 
@@ -1057,6 +1058,27 @@
       entryPointFn, op, tileSizes, DispatchLoweringPassPipeline::CPUDataTiling);
 }
 
+static LogicalResult setRootConfig(
+    func::FuncOp entryPointFn, IREE::LinalgExt::UnPackOp op,
+    DispatchLoweringPassPipeline pipeline =
+        DispatchLoweringPassPipeline::CPUDataTiling) {
+  SmallVector<int64_t> tileSizes = getLinalgExtDefaultWorkgroupTileSizes(op);
+
+  // Fix up tileSizes to be multiples of the inner tile sizes.
+  SmallVector<int64_t> innerTiles = op.getStaticTiles();
+  SmallVector<int64_t> dimPos = extractFromI64ArrayAttr(op.getInnerDimsPos());
+  for (auto it : llvm::zip(dimPos, innerTiles)) {
+    int64_t pos = std::get<0>(it);
+    int64_t size = std::get<1>(it);
+    if (tileSizes[pos] == 0 || ShapedType::isDynamic(size)) continue;
+    tileSizes[pos] = llvm::alignDown(tileSizes[pos], size);
+  }
+
+  TileSizesListType tileSizesList = {tileSizes};
+  return setOpConfigAndEntryPointFnTranslation(entryPointFn, op, tileSizesList,
+                                               pipeline);
+}
+
 /// Sets the lowering configuration for dispatch region for linalg_ext.fft
 /// root op.
 static LogicalResult setRootConfig(
@@ -1601,15 +1623,15 @@
           return setRootConfig(entryPointFn, op, LinalgOpInfo(op),
                                targetMLTransInfo);
         })
-        .Case<IREE::LinalgExt::FftOp, linalg::Mmt4DOp, linalg::Conv2DNhwcHwcfOp,
-              linalg::Conv2DNchwFchwOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+        .Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::PackOp,
+              IREE::LinalgExt::UnPackOp, linalg::Mmt4DOp,
+              linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
+              linalg::DepthwiseConv2DNhwcHwcOp>(
             [&](auto op) { return setRootConfig(entryPointFn, op); })
         .Case<linalg::ContractionOpInterface>(
             [&](auto op) { return setRootConfig(entryPointFn, op); })
         .Case<linalg::LinalgOp>(
             [&](auto op) { return setRootConfig(entryPointFn, op); })
-        .Case<IREE::LinalgExt::PackOp>(
-            [&](auto op) { return setRootConfig(entryPointFn, op); })
         .Case<TilingInterface>(
             [&](auto op) { return setRootConfig(entryPointFn, op); })
         .Default([&](Operation *op) { return success(); });
@@ -1626,7 +1648,7 @@
   // Redirect to individual operations.
   auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
     return TypeSwitch<Operation *, LogicalResult>(op)
-        .Case<IREE::LinalgExt::FftOp>([&](auto op) {
+        .Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::UnPackOp>([&](auto op) {
           return setRootConfig(entryPointFn, op,
                                DispatchLoweringPassPipeline::VMVXDefault);
         })