[CPU] Improve tile sizes selection for tensor.pack ops. (#15397)

It disables the special vector sizes for non-f32 cases because that logic
only targets 16x16 transpose cases. The dispatch-size improvements come from
vectorization: named ops with dynamic shapes could not be vectorized, which
was fixed by
https://github.com/llvm/llvm-project/commit/03529b99b36788ca836b7ce238ea9400ce89847b.
This change lets backends vectorize them because tiling with size=1 makes
their shapes static. That is not a hard requirement; follow-up work is
tracked in https://github.com/openxla/iree/issues/15441.

The revision takes the number of threads into account, which gives better
performance on multi-threaded CPUs and reduces runtime overhead.

This is a step toward https://github.com/openxla/iree/issues/15391
and https://github.com/openxla/iree/issues/15349.

It improves the performance of the
[tensor.pack](https://github.com/openxla/iree/issues/15349) op from 420 ms to
170 ms on an 8-thread x86 CPU.
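
As a rough illustration of the thread-aware heuristic, the standalone sketch
below mimics the spirit of `reduceDistributionWorkgroups` rather than its exact
implementation; the workload, tile sizes, vector size hint, and thread count
are hypothetical numbers chosen for the example.

```cpp
// A minimal, self-contained sketch of the thread-aware workgroup reduction
// idea (simplified from reduceDistributionWorkgroups; the workload, tile
// sizes, hints, and thread count below are made-up numbers for illustration).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Grows tile sizes (innermost dimension first) until the total number of
// workgroups drops to at most 2x the thread count, skipping growth steps that
// would break divisibility by the per-dimension vector size hint.
static void reduceWorkgroups(const std::vector<int64_t> &workload,
                             std::vector<int64_t> &tileSizes,
                             const std::vector<int64_t> &vectorSizeHints,
                             int64_t numThreads) {
  const int64_t limit = 2 * numThreads;
  auto countWorkgroups = [&]() {
    int64_t n = 1;
    for (size_t i = 0; i < workload.size(); ++i)
      n *= (workload[i] + tileSizes[i] - 1) / tileSizes[i];
    return n;
  };
  for (int dim = static_cast<int>(workload.size()) - 1; dim >= 0;) {
    if (countWorkgroups() <= limit)
      break;
    int64_t curr = tileSizes[dim];
    int64_t next = std::min(curr * 2, workload[dim]);
    int64_t hint = vectorSizeHints[dim];
    bool currIdeal = hint > 1 && curr % hint == 0 && workload[dim] % curr == 0;
    bool nextIdeal =
        hint <= 1 || (next % hint == 0 && workload[dim] % next == 0);
    if (curr >= workload[dim] || (currIdeal && !nextIdeal)) {
      --dim;  // This dimension cannot grow further; move to the next outer one.
      continue;
    }
    tileSizes[dim] = next;
  }
}

int main() {
  // Hypothetical 2-D workload, 8 threads, vector size hint 16 on the inner dim.
  std::vector<int64_t> workload = {1200, 4096};
  std::vector<int64_t> tileSizes = {8, 64};
  reduceWorkgroups(workload, tileSizes, /*vectorSizeHints=*/{1, 16},
                   /*numThreads=*/8);
  // Tile sizes grow from [8, 64] to [128, 4096]: 10 workgroups instead of 9600.
  std::cout << tileSizes[0] << " x " << tileSizes[1] << "\n";
}
```

With these assumptions the tile sizes grow from `[8, 64]` to `[128, 4096]`,
cutting the workgroup count from 9600 to 10 on 8 threads, which keeps the
per-workgroup work reasonable without over-splitting the dispatch.
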
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index a39d0b8..b7f8571 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -371,6 +371,83 @@
   return referenceTypeLengthInBytes;
 }
 
+// Reduces the number of workgroups in cases where the work is divided too
+// finely. The goal is to over-provision the number of workgroups to twice the
+// number of threads.
+static void reduceDistributionWorkgroups(
+    ArrayRef<int64_t> workload, SmallVectorImpl<int64_t> &distributedTileSizes,
+    std::optional<ArrayRef<int64_t>> maxTileSizes = std::nullopt,
+    std::optional<ArrayRef<int64_t>> vectorSizeHints = std::nullopt) {
+  assert(workload.size() == distributedTileSizes.size());
+  SmallVector<int64_t> numWorkgroupsPerDim(workload.size(), 1);
+  for (auto [idx, value] : llvm::enumerate(workload)) {
+    if (distributedTileSizes[idx] == 0 || ShapedType::isDynamic(value)) {
+      continue;
+    }
+    numWorkgroupsPerDim[idx] =
+        llvm::divideCeil(value, distributedTileSizes[idx]);
+  }
+
+  int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
+  int64_t numWorkgroups =
+      std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
+                      1LL, std::multiplies<int64_t>{});
+  unsigned currDim = workload.size();
+  while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
+    unsigned index = currDim - 1;
+    int64_t currSize = distributedTileSizes[index];
+    if (workload[index] == ShapedType::kDynamic ||
+        (maxTileSizes && currSize >= maxTileSizes.value()[index]) ||
+        currSize >= workload[index]) {
+      currDim--;
+      continue;
+    }
+
+    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[index] : 0;
+
+    // Check if the current size is already ideal for the vector size hint,
+    // and skip if the new size would break it.
+    if (vectorSize > 1 &&
+        (currSize % vectorSize == 0 && workload[index] % currSize == 0) &&
+        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
+      currDim--;
+      continue;
+    }
+
+    distributedTileSizes[index] = newSize;
+    int64_t nwg =
+        llvm::divideCeil(workload[index], distributedTileSizes[index]);
+    if (nwg < numWorkgroupsPerDim[index]) {
+      numWorkgroups /= numWorkgroupsPerDim[index];
+      numWorkgroupsPerDim[index] = nwg;
+      numWorkgroups *= nwg;
+    } else {
+      currDim--;
+    }
+  }
+
+  // Final fixup for dividing workload evenly.
+  for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
+    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i])) {
+      continue;
+    }
+
+    int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
+    int64_t newSize = llvm::divideCeil(workload[i], nwg);
+
+    // Check if the current size is already ideal for the vector size hint,
+    // and skip if the new size would break it.
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[i] : 0;
+    if (vectorSize > 1 &&
+        (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
+      continue;
+    }
+
+    distributedTileSizes[i] = newSize;
+  }
+}
+
 /// Returns the default tile sizes to use for the loops that are distributed.
 static SmallVector<int64_t>
 getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
@@ -389,7 +466,6 @@
   }
 
   SmallVector<int64_t> distributedTileSizes(numDims, 1);
-  SmallVector<int64_t> numWorkgroupsPerDim(numDims, 1);
   SmallVector<int64_t> workload(numDims, 1);
   for (auto i : llvm::seq<size_t>(0, numDims)) {
     if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
@@ -423,69 +499,10 @@
     // work per invocation reasonable.
     distributedTileSizes[i] =
         std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
-    numWorkgroupsPerDim[i] =
-        llvm::divideCeil(workload[i], distributedTileSizes[i]);
   }
 
-  // Reduce the number of workgroups in cases where we are dividing the work too
-  // much. Over-provision the number of workgroups to twice the number of
-  // threads.
-  int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
-  int64_t numWorkgroups =
-      std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
-                      1LL, std::multiplies<int64_t>{});
-  unsigned currDim = numDims;
-  while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
-    unsigned index = currDim - 1;
-    int64_t currSize = distributedTileSizes[index];
-    if (workload[index] == ShapedType::kDynamic ||
-        currSize >= maxTileSizes[index] || currSize >= workload[index]) {
-      currDim--;
-      continue;
-    }
-
-    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
-    int64_t vectorSize = vectorSizeHints[index];
-
-    // Chech if it's the ideal size with vector size hint. And skip if the new
-    // size will break the ideal size.
-    if (vectorSize > 1 &&
-        (currSize % vectorSize == 0 && workload[index] % currSize == 0) &&
-        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
-      currDim--;
-      continue;
-    }
-
-    distributedTileSizes[index] = newSize;
-    int64_t nwg =
-        llvm::divideCeil(workload[index], distributedTileSizes[index]);
-    if (nwg < numWorkgroupsPerDim[index]) {
-      numWorkgroups /= numWorkgroupsPerDim[index];
-      numWorkgroupsPerDim[index] = nwg;
-      numWorkgroups *= nwg;
-    } else {
-      currDim--;
-    }
-  }
-
-  // Final fixup for dividing workload evenly.
-  for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
-    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
-      continue;
-
-    int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
-    int64_t newSize = llvm::divideCeil(workload[i], nwg);
-
-    // Chech if it's the ideal size with vector size hint. And skip if the new
-    // size will break the ideal size.
-    int64_t vectorSize = vectorSizeHints[i];
-    if (vectorSize > 1 &&
-        (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
-      continue;
-    }
-
-    distributedTileSizes[i] = newSize;
-  }
+  reduceDistributionWorkgroups(workload, distributedTileSizes, maxTileSizes,
+                               vectorSizeHints);
 
   return distributedTileSizes;
 }
@@ -1358,7 +1375,9 @@
   SmallVector<int64_t> tileSizes(op.getSourceRank(), 1);
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
   int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
-  if (hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
+  // TODO(#15421): Improve tile size selection for non-f32 cases.
+  if (op.getSourceType().getElementType().isF32() &&
+      hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
     tileSizes.back() = vectorSize;
   }
   return tileSizes;
@@ -1370,6 +1389,16 @@
   SmallVector<int64_t> distTileSizes =
       getDefaultDistributionTileSizes(cast<TilingInterface>(op.getOperation()));
 
+  int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
+  SmallVector<int64_t> vectorSizeHints(op.getSourceRank(), 1);
+  for (auto dim : op.getInnerDimsPos()) {
+    vectorSizeHints[dim] = vectorSize;
+  }
+
+  SmallVector<int64_t> workload(op.getSourceType().getShape());
+  reduceDistributionWorkgroups(workload, distTileSizes,
+                               /*maxTileSizes=*/std::nullopt, vectorSizeHints);
+
   // The default function aims to returns the number of workload per workgroup,
   // but it does not know that it is working on packed domain. We need to take
   // inner tile sizes into account and adjust the distribution tile sizes.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index 020fce6..2d84542 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -498,7 +498,7 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 64], [1, 1]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 40], [1, 1]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
 //      CHECK: hal.executable.export public @pack
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index 6c3d495..834462f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -1575,6 +1575,43 @@
 // -----
 
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @pack_many_elements  {
+  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
+    cpu_features = "+avx512f",
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 64 : index,
+    target_triple = "x86_64-none-elf"
+  }>) {
+  hal.executable.export public @pack_many_elements layout(#pipeline_layout)
+    builtin.module {
+      func.func @pack_many_elements() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+        %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+        %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 31250], [1, 1]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
+//      CHECK: hal.executable.export public @pack_many_elements
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK:   tensor.pack
+// CHECK-SAME:       lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   <0, bindings = [
     <0, storage_buffer, ReadOnly>,
     <1, storage_buffer, ReadOnly>,