[CPU] Improve tile sizes selection for tensor.pack ops. (#15397)

It disables the special vector sizes for non-f32 cases because that logic
only targets 16x16 transpose cases. The dispatch-size improvements come from
vectorization: named ops with dynamic shapes could not be vectorized, which
was fixed by
https://github.com/llvm/llvm-project/commit/03529b99b36788ca836b7ce238ea9400ce89847b.
This change lets backends vectorize them because tiling with size=1 makes
their shapes static. That is not a hard requirement; follow-up work is
tracked in https://github.com/openxla/iree/issues/15441.

The revision takes the number of threads into account, which gives better
performance on multi-threaded CPUs and reduces runtime overhead.

This is a step toward https://github.com/openxla/iree/issues/15391
and https://github.com/openxla/iree/issues/15349.

It improves the performance of the
[tensor.pack](https://github.com/openxla/iree/issues/15349) op from 420 ms to
170 ms on an 8-thread x86 CPU.
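
As a rough illustration of the thread-aware heuristic, the standalone sketch
below mimics the spirit of `reduceDistributionWorkgroups` rather than its exact
implementation; the workload, tile sizes, vector size hint, and thread count
are hypothetical numbers chosen for the example.

```cpp
// A minimal, self-contained sketch of the thread-aware workgroup reduction
// idea (simplified from reduceDistributionWorkgroups; the workload, tile
// sizes, hints, and thread count below are made-up numbers for illustration).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Grows tile sizes (innermost dimension first) until the total number of
// workgroups drops to at most 2x the thread count, skipping growth steps that
// would break divisibility by the per-dimension vector size hint.
static void reduceWorkgroups(const std::vector<int64_t> &workload,
                             std::vector<int64_t> &tileSizes,
                             const std::vector<int64_t> &vectorSizeHints,
                             int64_t numThreads) {
  const int64_t limit = 2 * numThreads;
  auto countWorkgroups = [&]() {
    int64_t n = 1;
    for (size_t i = 0; i < workload.size(); ++i)
      n *= (workload[i] + tileSizes[i] - 1) / tileSizes[i];
    return n;
  };
  for (int dim = static_cast<int>(workload.size()) - 1; dim >= 0;) {
    if (countWorkgroups() <= limit)
      break;
    int64_t curr = tileSizes[dim];
    int64_t next = std::min(curr * 2, workload[dim]);
    int64_t hint = vectorSizeHints[dim];
    bool currIdeal = hint > 1 && curr % hint == 0 && workload[dim] % curr == 0;
    bool nextIdeal =
        hint <= 1 || (next % hint == 0 && workload[dim] % next == 0);
    if (curr >= workload[dim] || (currIdeal && !nextIdeal)) {
      --dim;  // This dimension cannot grow further; move to the next outer one.
      continue;
    }
    tileSizes[dim] = next;
  }
}

int main() {
  // Hypothetical 2-D workload, 8 threads, vector size hint 16 on the inner dim.
  std::vector<int64_t> workload = {1200, 4096};
  std::vector<int64_t> tileSizes = {8, 64};
  reduceWorkgroups(workload, tileSizes, /*vectorSizeHints=*/{1, 16},
                   /*numThreads=*/8);
  // Tile sizes grow from [8, 64] to [128, 4096]: 10 workgroups instead of 9600.
  std::cout << tileSizes[0] << " x " << tileSizes[1] << "\n";
}
```

With these assumptions the tile sizes grow from `[8, 64]` to `[128, 4096]`,
cutting the workgroup count from 9600 to 10 on 8 threads, which keeps the
per-workgroup work reasonable without over-splitting the dispatch.
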
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index a39d0b8..b7f8571 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -371,6 +371,83 @@
   return referenceTypeLengthInBytes;
 }
 
+// Reduces the number of workgroups in cases where the work is divided too
+// finely. The goal is to over-provision the number of workgroups to twice the
+// number of threads.
+static void reduceDistributionWorkgroups(
+    ArrayRef<int64_t> workload, SmallVectorImpl<int64_t> &distributedTileSizes,
+    std::optional<ArrayRef<int64_t>> maxTileSizes = std::nullopt,
+    std::optional<ArrayRef<int64_t>> vectorSizeHints = std::nullopt) {
+  assert(workload.size() == distributedTileSizes.size());
+  SmallVector<int64_t> numWorkgroupsPerDim(workload.size(), 1);
+  for (auto [idx, value] : llvm::enumerate(workload)) {
+    if (distributedTileSizes[idx] == 0 || ShapedType::isDynamic(value)) {
+      continue;
+    }
+    numWorkgroupsPerDim[idx] =
+        llvm::divideCeil(value, distributedTileSizes[idx]);
+  }
+
+  int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
+  int64_t numWorkgroups =
+      std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
+                      1LL, std::multiplies<int64_t>{});
+  unsigned currDim = workload.size();
+  while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
+    unsigned index = currDim - 1;
+    int64_t currSize = distributedTileSizes[index];
+    if (workload[index] == ShapedType::kDynamic ||
+        (maxTileSizes && currSize >= maxTileSizes.value()[index]) ||
+        currSize >= workload[index]) {
+      currDim--;
+      continue;
+    }
+
+    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[index] : 0;
+
+    // Check if the current size is already ideal for the vector size hint,
+    // and skip if the new size would break it.
+    if (vectorSize > 1 &&
+        (currSize % vectorSize == 0 && workload[index] % currSize == 0) &&
+        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
+      currDim--;
+      continue;
+    }
+
+    distributedTileSizes[index] = newSize;
+    int64_t nwg =
+        llvm::divideCeil(workload[index], distributedTileSizes[index]);
+    if (nwg < numWorkgroupsPerDim[index]) {
+      numWorkgroups /= numWorkgroupsPerDim[index];
+      numWorkgroupsPerDim[index] = nwg;
+      numWorkgroups *= nwg;
+    } else {
+      currDim--;
+    }
+  }
+
+  // Final fixup for dividing workload evenly.
+  for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
+    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i])) {
+      continue;
+    }
+
+    int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
+    int64_t newSize = llvm::divideCeil(workload[i], nwg);
+
+    // Check if the current size is already ideal for the vector size hint,
+    // and skip if the new size would break it.
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[i] : 0;
+    if (vectorSize > 1 &&
+        (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
+      continue;
+    }
+
+    distributedTileSizes[i] = newSize;
+  }
+}
+
 /// Returns the default tile sizes to use for the loops that are distributed.
 static SmallVector<int64_t>
 getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
@@ -389,7 +466,6 @@
   }
 
   SmallVector<int64_t> distributedTileSizes(numDims, 1);
-  SmallVector<int64_t> numWorkgroupsPerDim(numDims, 1);
   SmallVector<int64_t> workload(numDims, 1);
   for (auto i : llvm::seq<size_t>(0, numDims)) {
     if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
@@ -423,69 +499,10 @@
     // work per invocation reasonable.
     distributedTileSizes[i] =
         std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
-    numWorkgroupsPerDim[i] =
-        llvm::divideCeil(workload[i], distributedTileSizes[i]);
   }
 
-  // Reduce the number of workgroups in cases where we are dividing the work too
-  // much. Over-provision the number of workgroups to twice the number of
-  // threads.
-  int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
-  int64_t numWorkgroups =
-      std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
-                      1LL, std::multiplies<int64_t>{});
-  unsigned currDim = numDims;
-  while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
-    unsigned index = currDim - 1;
-    int64_t currSize = distributedTileSizes[index];
-    if (workload[index] == ShapedType::kDynamic ||
-        currSize >= maxTileSizes[index] || currSize >= workload[index]) {
-      currDim--;
-      continue;
-    }
-
-    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
-    int64_t vectorSize = vectorSizeHints[index];
-
-    // Chech if it's the ideal size with vector size hint. And skip if the new
-    // size will break the ideal size.
-    if (vectorSize > 1 &&
-        (currSize % vectorSize == 0 && workload[index] % currSize == 0) &&
-        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
-      currDim--;
-      continue;
-    }
-
-    distributedTileSizes[index] = newSize;
-    int64_t nwg =
-        llvm::divideCeil(workload[index], distributedTileSizes[index]);
-    if (nwg < numWorkgroupsPerDim[index]) {
-      numWorkgroups /= numWorkgroupsPerDim[index];
-      numWorkgroupsPerDim[index] = nwg;
-      numWorkgroups *= nwg;
-    } else {
-      currDim--;
-    }
-  }
-
-  // Final fixup for dividing workload evenly.
-  for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
-    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
-      continue;
-
-    int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
-    int64_t newSize = llvm::divideCeil(workload[i], nwg);
-
-    // Chech if it's the ideal size with vector size hint. And skip if the new
-    // size will break the ideal size.
-    int64_t vectorSize = vectorSizeHints[i];
-    if (vectorSize > 1 &&
-        (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
-      continue;
-    }
-
-    distributedTileSizes[i] = newSize;
-  }
+  reduceDistributionWorkgroups(workload, distributedTileSizes, maxTileSizes,
+                               vectorSizeHints);
 
   return distributedTileSizes;
 }
@@ -1358,7 +1375,9 @@
   SmallVector<int64_t> tileSizes(op.getSourceRank(), 1);
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
   int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
-  if (hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
+  // TODO(#15421): Improve tile size selection for non-f32 cases.
+  if (op.getSourceType().getElementType().isF32() &&
+      hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
     tileSizes.back() = vectorSize;
   }
   return tileSizes;
@@ -1370,6 +1389,16 @@
   SmallVector<int64_t> distTileSizes =
       getDefaultDistributionTileSizes(cast<TilingInterface>(op.getOperation()));
 
+  int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
+  SmallVector<int64_t> vectorSizeHints(op.getSourceRank(), 1);
+  for (auto dim : op.getInnerDimsPos()) {
+    vectorSizeHints[dim] = vectorSize;
+  }
+
+  SmallVector<int64_t> workload(op.getSourceType().getShape());
+  reduceDistributionWorkgroups(workload, distTileSizes,
+                               /*maxTileSizes=*/std::nullopt, vectorSizeHints);
+
   // The default function aims to returns the number of workload per workgroup,
   // but it does not know that it is working on packed domain. We need to take
   // inner tile sizes into account and adjust the distribution tile sizes.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index 020fce6..2d84542 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -498,7 +498,7 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 64], [1, 1]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 40], [1, 1]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
 //      CHECK: hal.executable.export public @pack
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index 6c3d495..834462f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -1575,6 +1575,43 @@
 // -----
 
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @pack_many_elements  {
+  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
+    cpu_features = "+avx512f",
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 64 : index,
+    target_triple = "x86_64-none-elf"
+  }>) {
+  hal.executable.export public @pack_many_elements layout(#pipeline_layout)
+    builtin.module {
+      func.func @pack_many_elements() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+        %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+        %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 31250], [1, 1]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
+//      CHECK: hal.executable.export public @pack_many_elements
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK:   tensor.pack
+// CHECK-SAME:       lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   <0, bindings = [
     <0, storage_buffer, ReadOnly>,
     <1, storage_buffer, ReadOnly>,