Heuristically pick tiling sizes to vectorize Linalg operations. (#8517)
Configuration: taskset 80 + dylib-sync on a local Pixel 4
| Model | Before | After |
| ---------------- | ------- | ------- |
| DeepLabV3 | 711 ms | 188 ms |
| MobileBertSquad | 672 ms | 673 ms |
| MobileNetV2 | 103 ms | 46.4 ms |
| MobileNetV3Small | 24.6 ms | 14.6 ms |
| MobileSSD | 204 ms | 166 ms |
| PoseNet | 545 ms | 241 ms |
Configuration: taskset f0 + dylib on a local Pixel 4
| Model | Before | After |
| ---------------- | ------- | ------- |
| DeepLabV3 | 358 ms | 161 ms |
| MobileBertSquad | 376 ms | 376 ms |
| MobileNetV2 | 65.6 ms | 29.8 ms |
| MobileNetV3Small | 20.7 ms | 15.7 ms |
| MobileSSD | 94 ms | 79.9 ms |
| PoseNet | 183 ms | 121 ms |
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 2899681..7dcf9d3 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -198,8 +198,7 @@
/// Adjusts the workload per workgroup to be a multiple of vector size to ensure
/// that the op vectorizes.
static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
- int64_t vectorSizeVal,
- Optional<int64_t> fallbackSizeVal = llvm::None) {
+ int64_t vectorSizeVal) {
if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
return maxSize;
}
@@ -210,7 +209,16 @@
return i;
}
}
- return fallbackSizeVal ? fallbackSizeVal.getValue() : vectorSizeVal;
+ // Heuristic: if it can't be a multiple of vectorSizeVal, pick the largest
+ // factor of dim that is at most min(maxSize, 2 * vectorSizeVal).
+ int64_t start = std::min(maxSize, dim);
+ start = std::min(start, vectorSizeVal * 2);
+ for (int64_t i = start; i > 0; --i) {
+ if (dim % i == 0) {
+ return i;
+ }
+ }
+ return 1;
}
/// Returns the tile size to use for the Flow level of an operation that
@@ -340,15 +348,17 @@
SmallVector<int64_t> l1TileSizes;
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
- l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, 8));
- l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 2], 32, 32));
+ l1TileSizes.push_back(
+ getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, vectorSize));
+ l1TileSizes.push_back(
+ getMaxTileSize(0, flowTileSizes[nLoops - 2], 32, vectorSize));
l1TileSizes.push_back(0);
auto lhsShapedType = op.lhs().getType().cast<ShapedType>();
int64_t K = lhsShapedType.getShape().back();
SmallVector<int64_t> vectorTileSizes;
vectorTileSizes.append(nLoops - 1, 0);
- vectorTileSizes.push_back(getMaxTileSize(0, K, 16, 16));
+ vectorTileSizes.push_back(getMaxTileSize(0, K, 16, vectorSize));
TileSizesListType tileSizes;
tileSizes.emplace_back(flowTileSizes.begin(), flowTileSizes.end());
@@ -668,8 +678,7 @@
// If the tile size is intended to be 1, do not adjust it to `vectorSize`.
// The ops will be decomposed to lower-rank named ops.
if (l1TileSizes[i] != 1) {
- l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize,
- /*fallbackTileSize=*/1);
+ l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize);
}
}
SmallVector<int64_t> vectorTileSizes;
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 5896aaf..f2150cf 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -872,7 +872,7 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 0, 0], [4, 0, 0], [0, 1, 4]{{\]}}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 0, 0], [1, 0, 0], [0, 1, 4]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.entry_point public @predict_dispatch_86
// CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -1033,6 +1033,52 @@
// -----
+#executable_layout = #hal.executable.layout<push_constants = 4, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
+ "llvm", "embedded-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"
+ }
+>
+hal.executable private @matmul_odd {
+ hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+ hal.executable.entry_point public @matmul_odd ordinal(0) layout(#executable_layout)
+ builtin.module {
+ func @matmul_odd() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:33x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:16x49xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:33x49xf32>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:33x49xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x16xf32> -> tensor<33x16xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x49xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x49xf32> -> tensor<33x49xf32>
+ %7 = linalg.init_tensor [33, 49] : tensor<33x49xf32>
+ %8 = linalg.fill(%cst, %7) : f32, tensor<33x49xf32> -> tensor<33x49xf32>
+ %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
+ flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:33x49xf32>
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[3, 7, 0], [3, 7, 0], [0, 0, 16]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+// CHECK: hal.executable.entry_point public @matmul_odd
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,