[GPU] Use affine map for size calculations of allocas in fission pass (#21870)
This makes it easier for the value bounds interface to find the upper bound
when we later pad the allocas to make them static.
See https://github.com/iree-org/iree/issues/21872 for details on why we
need this.
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/FissionTransferOpsInControlFlow.cpp b/compiler/src/iree/compiler/Codegen/Common/FissionTransferOpsInControlFlow.cpp
index 48779f8..6c72bd6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/FissionTransferOpsInControlFlow.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/FissionTransferOpsInControlFlow.cpp
@@ -10,6 +10,7 @@
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/Support/DebugLog.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/PatternMatch.h"
@@ -32,11 +33,17 @@
vector::TransferReadOp readOp,
scf::ForOp forOp) {
auto loc = forOp.getLoc();
- auto allocaSize = rewriter.create<arith::CeilDivUIOp>(
- loc,
- rewriter.create<arith::SubIOp>(loc, forOp.getUpperBound(),
- forOp.getLowerBound()),
- forOp.getStep());
+ // We use an affine map to calculate the size of the alloca rather
+ // than arith ops directly due to an issue with finding the static upper bound
+ // when using arith.ceildivui described in
+ // https://github.com/iree-org/iree/issues/21872
+ AffineExpr lb, ub, step;
+ bindSymbols(rewriter.getContext(), lb, ub, step);
+ AffineMap sizeMap = AffineMap::get(0, 3, (ub - lb).ceilDiv(step));
+ auto allocaSize = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, sizeMap,
+ getAsOpFoldResult(ValueRange{forOp.getLowerBound(), forOp.getUpperBound(),
+ forOp.getStep()}));
auto vectorType = cast<VectorType>(readOp.getVectorType());
SmallVector<int64_t> memrefShape(vectorType.getShape());
@@ -46,8 +53,9 @@
auto memrefType = MemRefType::get(memrefShape, vectorType.getElementType(),
AffineMap{}, privateAddrSpaceAttr);
- return rewriter.create<memref::AllocaOp>(loc, memrefType,
- ValueRange{allocaSize});
+ return rewriter.create<memref::AllocaOp>(
+ loc, memrefType,
+ ValueRange{getValueOrCreateConstantIndexOp(rewriter, loc, allocaSize)});
}
/// Creates an index for accessing the memref in the loop. This index is
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index d0d38ec..bc34858 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -370,7 +370,8 @@
let summary =
"Fission transfer read and write ops in control flow to allow prefetching.";
let dependentDialects = [
- "memref::MemRefDialect"
+ "memref::MemRefDialect",
+ "affine::AffineDialect"
];
let options = [
Option<"FissionMultiTrip", "fission-multi-trip",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/fission_transfer_ops_control_flow.mlir b/compiler/src/iree/compiler/Codegen/Common/test/fission_transfer_ops_control_flow.mlir
index cfca25d..99cae6c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/fission_transfer_ops_control_flow.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/fission_transfer_ops_control_flow.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow{fission-multi-trip}))" %s | FileCheck %s --check-prefixes=CHECK-ALL,MULTI
-// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow))" %s | FileCheck %s --check-prefixes=CHECK-ALL,SINGLE
+// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow{fission-multi-trip}))" --mlir-print-local-scope %s | FileCheck %s --check-prefixes=CHECK-ALL,MULTI
+// RUN: iree-opt --split-input-file -pass-pipeline="builtin.module(func.func(iree-codegen-fission-transfer-ops-in-control-flow))" --mlir-print-local-scope %s | FileCheck %s --check-prefixes=CHECK-ALL,SINGLE
// CHECK-ALL-LABEL: @fission_global_read_to_private_write
// CHECK-ALL-SAME: %[[ARG0:.*]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>
@@ -66,8 +66,7 @@
}
return
}
-// MULTI: %[[SUB:.*]] = arith.subi %c16, %[[ARG0]]
-// MULTI: %[[DIV:.*]] = arith.ceildivui %[[SUB]], %c128
+// MULTI: %[[DIV:.*]] = affine.apply affine_map<()[s0] -> ((-s0 + 16) ceildiv 128)>()[%[[ARG0]]]
// MULTI: %[[ALLOCA:.*]] = memref.alloca(%[[DIV]])
// MULTI: scf.for %[[ITER:.*]] = %[[ARG0]] to %c16 step %c128 {
// MULTI: %[[READ:.*]] = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]}