[spirv] Tile along the last three partitioned (parallel) dimensions (#6806)
If an op has more than three parallel dimensions, we want to tile
along the innermost ones. This happens for convolution ops, where
the first (batch) dimension is typically one for inference; there
is no need to tile along that dimension.
This should fix the 3-4x performance regression on Adreno GPUs.
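As a rough illustration of the intended effect (a sketch, not code from this
patch; the shapes, loop indices, and the stand-ins for lowerWorkgroupTs and
lowerThreadTs below are made up for the example):

  #include <cstdint>
  #include <vector>

  int main() {
    // Hypothetical NHWC conv output loop nest (N, OH, OW, OC) = (1, 56, 56, 64).
    // Assume only the last three parallel loops are partitioned across the
    // workgroup grid; the leading unit batch loop is not in partitionedLoops.
    std::vector<int64_t> partitionedLoops = {1, 2, 3};
    unsigned loopDepth = partitionedLoops.back() + 1;  // 4

    // New behavior: start with 0 (no tiling) and only mark partitioned loops.
    std::vector<int64_t> workgroupTileSize(loopDepth, 0);
    std::vector<int64_t> threadTileSize(loopDepth, 0);
    for (int64_t loopIndex : partitionedLoops)
      workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;

    // The innermost dimension still gets the real tile sizes.
    workgroupTileSize.back() = 64;  // stands in for lowerWorkgroupTs
    threadTileSize.back() = 4;      // stands in for lowerThreadTs

    // Result: workgroup tiles [0, 1, 1, 64], invocation tiles [0, 1, 1, 4].
    // The old code initialized every entry to 1, so the unit batch dimension
    // was tiled as well: [1, 1, 1, 64] and [1, 1, 1, 4].
    return 0;
  }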
diff --git a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
index 9ac980a..3d2b5ea 100644
--- a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
@@ -319,9 +319,7 @@
vectorize = true;
}
SmallVector<int64_t, 4> candidateTileSizes;
- if (vectorize) {
- candidateTileSizes.append({4 * subgroupSize, 2 * subgroupSize});
- }
+ if (vectorize) candidateTileSizes.push_back(4 * subgroupSize);
candidateTileSizes.push_back(subgroupSize);
for (int64_t size : candidateTileSizes) {
if (outputShape.back() % size != 0) continue;
@@ -339,15 +337,24 @@
}
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
+
unsigned loopDepth = partitionedLoops.back() + 1;
- SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 1),
- threadTileSize(loopDepth, 1);
+ SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0);
+ SmallVector<int64_t, 4> threadTileSize(loopDepth, 0);
+
+ // Tile along all the partitioned loops with a tile size of 1.
+ for (int64_t loopIndex : partitionedLoops) {
+ workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;
+ }
+ // Overwrite the configuration for the innermost dimension.
workgroupTileSize.back() = lowerWorkgroupTs;
threadTileSize.back() = lowerThreadTs;
+
TileSizesListType tileSizes;
tileSizes.emplace_back(workgroupTileSize); // Workgroup level
tileSizes.emplace_back(); // Subgroup level
tileSizes.emplace_back(threadTileSize); // Invocation level
+
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
/*nativeVectorSize =*/ArrayRef<int64_t>{}, pipeline, workgroupSize);