Revert "[VectorDistribute] Correctly find new dimensions during reduction config" (#21810)
Reverts iree-org/iree#21797
The reverted patch does some weird things with elementwise consumers: it
tiles them serially when they should be tiled by consumer fusion. It needs
more work before it can be landed.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 7510180..cc9fafb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -492,7 +492,6 @@
return loweringConfig;
}
-// TODO: Use IndexingMapInterface here instead of linalg::LinalgOp.
static LogicalResult
populateConfigInfo(const llvm::SetVector<linalg::LinalgOp> &computeOps,
IREE::GPU::TargetAttr target, int64_t workgroupSize,
@@ -524,28 +523,42 @@
// LinalgOp with only parallel dims. This is needed if the op cannot be fused
// with a reduction or introduces new loop dimensions.
auto shouldAttachLoweringConfig = [&](linalg::LinalgOp linalgOp) -> bool {
- // We want to attach a lowering config to this operation if it introduces
- // a new dimension, when going by topological order in the backward slice.
- // The only two ways to introduce a new dimension are:
- //
- // 1. We have a reduction dimension.
- if (hasReductionIterator(linalgOp)) {
- return true;
+ // If the operation has a gather, we want to fuse it with the
+ // reduction.
+ if (hasExternalCapture(cast<linalg::GenericOp>(linalgOp))) {
+ return false;
}
- // 2. There is no consumer which is a compute op (i.e., it already
- // has some way of getting fused).
- if (llvm::none_of(linalgOp->getUsers(), [&](Operation *user) {
+ // If some of the users are in computeOps and some are outside of
+ // computeOps; attach lowering config, since the op can't be fused.
+ if (llvm::any_of(linalgOp->getUsers(),
+ [&](Operation *user) {
+ auto linalgUser = dyn_cast<linalg::LinalgOp>(user);
+ return linalgUser && computeOps.contains(linalgUser);
+ }) &&
+ llvm::any_of(linalgOp->getUsers(), [&](Operation *user) {
auto linalgUser = dyn_cast<linalg::LinalgOp>(user);
- return linalgUser && computeOps.contains(linalgUser);
+ return !linalgUser;
})) {
return true;
}
+ // If the indexing map introduces new dimensions (more inputs than results),
+ // attach a lowering config.
+ for (OpOperand *operand : linalgOp.getDpsInputOperands()) {
+ int64_t operandIdx = linalgOp.getIndexingMapIndex(operand);
+ AffineMap indexingMap = linalgOp.getIndexingMapsArray()[operandIdx];
+ if (indexingMap.getNumResults() > 0 &&
+ indexingMap.getNumInputs() > indexingMap.getNumResults()) {
+ return true;
+ }
+ }
+
return false;
};
for (linalg::LinalgOp linalgOp : computeOps) {
- if (shouldAttachLoweringConfig(linalgOp)) {
+ if (hasReductionIterator(linalgOp) ||
+ shouldAttachLoweringConfig(linalgOp)) {
auto loweringConfig = getVectorDistributeReductionConfig(
linalgOp, target, sharedWgpTiles, workgroupSize, subgroupSize,
threadLoads);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 1d64876..59ec04b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -755,6 +755,9 @@
/*convertToDpsOptions=*/std::nullopt,
/*reorderStrategy=*/reorderStrategy);
+ // Some of the elementwise fusion can benefit from this pass.
+ funcPassManager.addPass(createRematerializeParallelOpsPass());
+
funcPassManager.addPass(
IREE::LinalgExt::createConvertAttentionToOnlineAttentionPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir
index 6a21cd7..5af6585 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir
@@ -245,7 +245,12 @@
// CHECK: func.func @test_multiple_stores
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-NOT: lowering_config
+// CHECK-SAME: attrs = {lowering_config = #iree_gpu.lowering_config<{
+// CHECK-SAME: lane_basis = {{\[}}[1, 64], [0, 1]],
+// CHECK-SAME: reduction = [0, 4096],
+// CHECK-SAME: subgroup_basis = {{\[}}[1, 16], [0, 1]],
+// CHECK-SAME: thread = [0, 4],
+// CHECK-SAME: workgroup = [1, 0]
// CHECK: linalg.generic
// CHECK-SAME: attrs = {lowering_config = #iree_gpu.lowering_config<{
// CHECK-SAME: lane_basis = {{\[}}[1, 64], [0, 1]],