Make default distribution logic divide work evenly. (#15414)
Also fix a bug in the tile size adjustment, which reduces the number of
workgroups in cases where we are dividing the work too much:
`numWorkgroupsPerDim[index]` was not updated after the dimension was
adjusted to a new tile size.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 3c6f93e..a39d0b8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -461,11 +461,32 @@
llvm::divideCeil(workload[index], distributedTileSizes[index]);
if (nwg < numWorkgroupsPerDim[index]) {
numWorkgroups /= numWorkgroupsPerDim[index];
+ numWorkgroupsPerDim[index] = nwg;
numWorkgroups *= nwg;
} else {
currDim--;
}
}
+
+ // Final fixup for dividing workload evenly.
+ for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
+ if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
+ continue;
+
+ int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
+ int64_t newSize = llvm::divideCeil(workload[i], nwg);
+
+ // Check whether the new size is still ideal with respect to the vector size
+ // hint, and skip the update if the new size would break the ideal size.
+ int64_t vectorSize = vectorSizeHints[i];
+ if (vectorSize > 1 &&
+ (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
+ continue;
+ }
+
+ distributedTileSizes[i] = newSize;
+ }
+
return distributedTileSizes;
}
@@ -1134,9 +1155,13 @@
maxTileSizes[0] = 192;
maxTileSizes[1] = 128;
}
+ SmallVector<int64_t> vectorSizeHints(numLoops, vectorSize);
+ if (isBM) {
+ vectorSizeHints[0] = 1;
+ }
distTileSizes = getDefaultDistributedLevelTileSizes(
linalgOp, vecTileSizes, maxTileSizes,
- /*allowIncompleteTile=*/true);
+ /*allowIncompleteTile=*/true, vectorSizeHints);
} else {
distTileSizes = getDefaultDistributedLevelTileSizes(linalgOp, vecTileSizes,
maxTileSizes);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index f86d347..020fce6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -368,7 +368,7 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 7, 64, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 7, 7, 64, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
// CHECK: hal.executable.export public @restrict_num_workgroups
// CHECK-SAME: translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index b2b3a4a..6c3d495 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -977,7 +977,7 @@
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
// CHECK: hal.executable.export public @matmul_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -1863,7 +1863,7 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[256, 72, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
// CHECK: hal.executable.export public @quant_model
// CHECK-SAME: translation_info = #[[TRANSLATION]]