// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
//====---------------------------------------------------------------------===//
// Pass to pipeline copy to shared memory for matmul op.
//====---------------------------------------------------------------------===//
namespace mlir {
namespace iree_compiler {
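
// Markers used to communicate between the loop analysis in the pass below and
// the scheduling callback: kPipeliningLoopMarker tags the scf.for loop
// selected for pipelining, kPipeliningGlobalLoad tags the global memory loads
// (and the barriers that must move with them) assigned to stage 0.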
static const StringLiteral kPipeliningLoopMarker = "__pipelining_K_loop__";
static const StringLiteral kPipeliningGlobalLoad = "__pipelining_global_load__";

/// Helper to recursively add operation dependencies within `block` to `dep`
/// set.
static void addDepOps(llvm::SmallDenseSet<Operation*>& dep, Operation* op,
                      Block* block) {
  if (!dep.insert(op).second) return;
  for (Value operand : op->getOperands()) {
    Operation* defOp = operand.getDefiningOp();
    if (defOp && defOp->getBlock() == block) addDepOps(dep, defOp, block);
  }
}

/// Assign stages to the loop ops. Simple logic for now: put loads from global
/// memory in stage 0 and the rest in the last stage (`depth`).
static void getPipelineStages(scf::ForOp forOp,
                              std::vector<std::pair<Operation*, unsigned>>& ops,
                              unsigned depth) {
  if (!forOp->hasAttr(kPipeliningLoopMarker)) return;

  // Track dependencies of the global memory loads.
  llvm::SmallDenseSet<Operation*> loadDep;
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (op.hasAttr(kPipeliningGlobalLoad)) {
      addDepOps(loadDep, &op, forOp.getBody());
    }
  }
  // Create a modulo schedule with the loads from global memory and the
  // operations they depend on in stage 0. Stores to shared memory and the
  // computation are in stage `depth`. In order to have a correct schedule
  // even with back edges we order the stages in decreasing order.
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (!loadDep.count(&op) && !isa<scf::YieldOp>(op))
      ops.push_back(std::make_pair(&op, depth));
  }
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (loadDep.count(&op)) ops.push_back(std::make_pair(&op, 0));
  }
}
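
/// Annotate the async wait ops (gpu::DeviceAsyncWaitOp) created by the
/// pipeliner with the number of async copy groups still in flight, depending
/// on whether the op lands in the kernel or in the epilogue of the pipelined
/// loop.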
static void setAsyncAnnotations(Operation* op,
                                scf::PipeliningOption::PipelinerPart part,
                                unsigned iteration, unsigned depth) {
  auto waitOp = dyn_cast<gpu::DeviceAsyncWaitOp>(op);
  if (!waitOp || waitOp.numGroups()) return;
  int numGroupInFlight = 0;
  if (part == scf::PipeliningOption::PipelinerPart::Kernel) {
    numGroupInFlight = depth - 1;
  } else {
    // By construction there should be no wait op in the prologue as all the
    // waits should be in the last stage.
    assert(part == scf::PipeliningOption::PipelinerPart::Epilogue);
    // Based on the schedule we pick, we know how many groups are in flight
    // for each iteration of the epilogue.
    numGroupInFlight = depth - 1 - iteration;
  }
  OpBuilder b(op);
  waitOp->setAttr(waitOp.numGroupsAttrName(),
                  b.getI32IntegerAttr(numGroupInFlight));
}

namespace {
struct LLVMGPUPipeliningPass
    : public LLVMGPUPipeliningBase<LLVMGPUPipeliningPass> {
  LLVMGPUPipeliningPass(unsigned depth) : depth(depth) {}
  void runOnOperation() override {
    auto funcOp = getOperation();
    MLIRContext* context = &getContext();
    // Mark loops that copy to shared memory as candidates for pipelining.
    funcOp.walk([](scf::ForOp forOp) {
      bool copyToWorkgroupMemory = false;
      OpBuilder builder(forOp.getContext());
      SmallVector<Operation*> barriers;
      for (Operation& op : forOp.getBody()->getOperations()) {
        // Only pipeline the innermost loop; its body must be a flat region.
        if (op.getNumRegions() > 0) return;
        if (isa<gpu::BarrierOp>(op)) {
          barriers.push_back(&op);
        }
        if (isa<gpu::DeviceAsyncCopyOp, gpu::DeviceAsyncCreateGroupOp>(op)) {
          copyToWorkgroupMemory = true;
          op.setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
          // Async copy ops need to be moved along with the preceding barriers.
          for (Operation* barrier : barriers) {
            barrier->setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
          }
          barriers.clear();
          continue;
        }
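        // Otherwise look for a copy expressed as a vector.transfer_read from
        // global memory (address space 0) whose only use is a
        // vector.transfer_write into workgroup memory (address space 3).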
        auto ld = dyn_cast<vector::TransferReadOp>(op);
        if (!ld) continue;
        unsigned ldAddSpace =
            ld.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt();
        if (ldAddSpace != 0 || !ld->hasOneUse()) continue;
        auto st =
            dyn_cast<vector::TransferWriteOp>(ld->use_begin()->getOwner());
        if (!st) continue;
        unsigned stAddSpace =
            st.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt();
        if (stAddSpace != 3) continue;
        copyToWorkgroupMemory = true;
        ld->setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
      }
      if (copyToWorkgroupMemory) {
        forOp->setAttr(kPipeliningLoopMarker, builder.getUnitAttr());
      }
    });
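    // Configure the software pipeliner: getScheduleFn assigns a stage to
    // every op in the marked loops and annotateFn patches the async wait ops
    // emitted for the kernel and epilogue.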
    scf::PipeliningOption options;
    unsigned maxDepth = depth;
    auto getSchedule = [maxDepth](
                           scf::ForOp forOp,
                           std::vector<std::pair<Operation*, unsigned>>& ops) {
      return getPipelineStages(forOp, ops, maxDepth);
    };
    auto setAnnotation = [maxDepth](Operation* op,
                                    scf::PipeliningOption::PipelinerPart part,
                                    unsigned iteration) {
      return setAsyncAnnotations(op, part, iteration, maxDepth);
    };
    options.getScheduleFn = getSchedule;
    options.annotateFn = setAnnotation;
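
    // Run the upstream SCF pipelining patterns; getPipelineStages only
    // produces a schedule for loops tagged with kPipeliningLoopMarker.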
    RewritePatternSet pipeliningPatterns(context);
    scf::populateSCFLoopPipeliningPatterns(pipeliningPatterns, options);
    if (failed(applyPatternsAndFoldGreedily(funcOp,
                                            std::move(pipeliningPatterns)))) {
      return signalPassFailure();
    }
  }

 private:
  unsigned depth;
};
}  // namespace
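
/// Creates a pass that software-pipelines loops copying operands to shared
/// memory: global memory loads are assigned to stage 0 and the rest of the
/// loop body to stage `depth` (see getPipelineStages above).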
std::unique_ptr<OperationPass<func::FuncOp>> createLLVMGPUPipeliningPass(
    unsigned depth) {
  return std::make_unique<LLVMGPUPipeliningPass>(depth);
}

}  // namespace iree_compiler
}  // namespace mlir