[spirv] Tile along the last three partitioned (parallel) dimensions (#6806)
If an op has more than three parallel dimensions, we want to tile
along the innermost ones. This happens for convolution ops, where
the first (batch) dimension is typically one for inference; there
is no need to tile along that dimension.
This should fix the 3-4x performance regression on Adreno GPUs.
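As a rough illustration of the intended effect (a sketch, not code from this
patch; the shapes, loop indices, and the stand-ins for lowerWorkgroupTs and
lowerThreadTs below are made up for the example):

  #include <cstdint>
  #include <vector>

  int main() {
    // Hypothetical NHWC conv output loop nest (N, OH, OW, OC) = (1, 56, 56, 64).
    // Assume only the last three parallel loops are partitioned across the
    // workgroup grid; the leading unit batch loop is not in partitionedLoops.
    std::vector<int64_t> partitionedLoops = {1, 2, 3};
    unsigned loopDepth = partitionedLoops.back() + 1;  // 4

    // New behavior: start with 0 (no tiling) and only mark partitioned loops.
    std::vector<int64_t> workgroupTileSize(loopDepth, 0);
    std::vector<int64_t> threadTileSize(loopDepth, 0);
    for (int64_t loopIndex : partitionedLoops)
      workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;

    // The innermost dimension still gets the real tile sizes.
    workgroupTileSize.back() = 64;  // stands in for lowerWorkgroupTs
    threadTileSize.back() = 4;      // stands in for lowerThreadTs

    // Result: workgroup tiles [0, 1, 1, 64], invocation tiles [0, 1, 1, 4].
    // The old code initialized every entry to 1, so the unit batch dimension
    // was tiled as well: [1, 1, 1, 64] and [1, 1, 1, 4].
    return 0;
  }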
diff --git a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
index 9ac980a..3d2b5ea 100644
--- a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
@@ -319,9 +319,7 @@
vectorize = true;
}
SmallVector<int64_t, 4> candidateTileSizes;
- if (vectorize) {
- candidateTileSizes.append({4 * subgroupSize, 2 * subgroupSize});
- }
+ if (vectorize) candidateTileSizes.push_back(4 * subgroupSize);
candidateTileSizes.push_back(subgroupSize);
for (int64_t size : candidateTileSizes) {
if (outputShape.back() % size != 0) continue;
@@ -339,15 +337,24 @@
}
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
+
unsigned loopDepth = partitionedLoops.back() + 1;
- SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 1),
- threadTileSize(loopDepth, 1);
+ SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0);
+ SmallVector<int64_t, 4> threadTileSize(loopDepth, 0);
+
+ // Tile along all the partitioned loops with a tile size of 1.
+ for (int64_t loopIndex : partitionedLoops) {
+ workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;
+ }
+ // Overwrite the configuration for the innermost dimension.
workgroupTileSize.back() = lowerWorkgroupTs;
threadTileSize.back() = lowerThreadTs;
+
TileSizesListType tileSizes;
tileSizes.emplace_back(workgroupTileSize); // Workgroup level
tileSizes.emplace_back(); // Subgroup level
tileSizes.emplace_back(threadTileSize); // Invocation level
+
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
/*nativeVectorSize =*/ArrayRef<int64_t>{}, pipeline, workgroupSize);