// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Common/CommonPasses.h"
#include "iree/compiler/Codegen/Common/Transforms.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#define DEBUG_TYPE "iree-gpu-pipelining"
//====---------------------------------------------------------------------===//
// Pass to pipeline copy to shared memory for matmul op.
//====---------------------------------------------------------------------===//
namespace mlir {
namespace iree_compiler {
static const StringLiteral kPipeliningLoopMarker = "__pipelining_K_loop__";
static const StringLiteral kPipeliningFirstStage = "__pipelining_first_stage__";
static const StringLiteral kPipeliningExtraBarrier =
"__pipelining_extra_barrier__";
/// Returns true if the given `memrefType` has the default numeric address space
/// 0 or a HAL descriptor type address space.
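/// For example (a sketch; the exact memref types are hypothetical):
///   memref<16xf32>                                        -> true
///   memref<16xf32, #hal.descriptor_type<storage_buffer>>  -> true
///   memref<16xf32, 3> (workgroup/shared memory)           -> false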
static bool hasDefaultOrHALAddressSpace(MemRefType memrefType) {
Attribute addrSpace = memrefType.getMemorySpace();
if (!addrSpace) return true;
auto intAttr = addrSpace.dyn_cast<IntegerAttr>();
// Accept both default numeric address space and HAL descriptor type address
// space--the former is used by LLVMGPU while the latter is used by SPIR-V.
if (intAttr && intAttr.getInt() == 0) return true;
return addrSpace.isa<IREE::HAL::DescriptorTypeAttr>();
}
/// Returns a new predicated operation to support an unpeeled epilogue. An
/// unpeeled epilogue needs to handle the last iterations within the mainloop,
/// which requires predicating operations, e.g., to avoid out-of-bounds global
/// memory accesses. This helper predicates operations where predication is
/// available, and otherwise checks that unpredicated operations are
/// side-effect free and acceptable to execute speculatively.
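///
/// As a sketch (shapes and indices are hypothetical), an async copy such as
///   %token = nvgpu.device_async_copy %src[...], %dst[...], 4
/// is rewritten under predicate %pred into
///   %c0 = arith.constant 0 : index
///   %n = arith.select %pred, %c4, %c0 : index
///   %token = nvgpu.device_async_copy %src[...], %dst[...], 4, %n
/// so that predicated-off iterations copy zero source elements and zero-fill
/// the destination (zfill) instead of reading out of bounds.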
static Operation* replaceOpWithPredicatedOp(RewriterBase& rewriter,
Operation* op, Value pred) {
  // Predication is only supported for AsyncCopyOp. Thus, operations that are
  // *not* AsyncCopyOp require additional checks before they can be issued
  // speculatively.
if (!isa<nvgpu::DeviceAsyncCopyOp>(op)) {
    // Return/execute the op if it is side-effect free.
    if (mlir::isMemoryEffectFree(op)) return op;
    // Return/execute the op if it is a barrier, async-copy group creation,
    // async wait, or ldmatrix op.
    if (isa<gpu::BarrierOp, nvgpu::DeviceAsyncCreateGroupOp, nvgpu::LdMatrixOp,
            nvgpu::DeviceAsyncWaitOp>(op))
return op;
// Return/execute the op if it is a shared memory load.
if (auto loadOp = dyn_cast<vector::LoadOp>(op)) {
auto loadBaseType = loadOp.getBase().getType().cast<MemRefType>();
if (hasSharedMemoryAddressSpace(loadBaseType)) return op;
}
if (auto loadOp = dyn_cast<memref::LoadOp>(op)) {
auto loadBaseType = loadOp.getMemRefType();
if (hasSharedMemoryAddressSpace(loadBaseType)) return op;
}
    // Reaching here means the operation neither supports predication nor can
    // be executed speculatively; thus, an unpeeled epilogue is not supported.
    llvm_unreachable(
        "Unpeeled epilogue not supported with a side-effecting operation "
        "that has no predication support.");
}
  // Replace the mainloop AsyncCopy with a predicated AsyncCopy that zero-fills
  // the destination when the predicate is false (zfill).
  auto asyncCopyOp = cast<nvgpu::DeviceAsyncCopyOp>(op);
  auto loc = asyncCopyOp->getLoc();
  // Create the srcElements value based on the predicate.
  // The next few lines generate the code below:
  //   srcElements = (pred) ? originalSrcElements : 0;
Value dstElements =
rewriter.create<arith::ConstantOp>(loc, asyncCopyOp.getDstElementsAttr());
Value originalSrcElement =
asyncCopyOp.getSrcElements() ? asyncCopyOp.getSrcElements() : dstElements;
Value c0Index = rewriter.create<arith::ConstantIndexOp>(loc, 0);
auto srcElements =
rewriter.create<arith::SelectOp>(loc, pred, originalSrcElement, c0Index);
auto asyncCopyZfillOp = rewriter.create<nvgpu::DeviceAsyncCopyOp>(
loc, nvgpu::DeviceAsyncTokenType::get(asyncCopyOp.getContext()),
asyncCopyOp.getDst(), asyncCopyOp.getDstIndices(), asyncCopyOp.getSrc(),
asyncCopyOp.getSrcIndices(), asyncCopyOp.getDstElements(), srcElements,
UnitAttr());
rewriter.eraseOp(asyncCopyOp);
  // Return the newly created predicated AsyncCopy (zfill) op.
return asyncCopyZfillOp;
}
/// Helper to recursively add operation dependencies within `block` to `dep`
/// set.
static void addDepOps(llvm::SmallDenseSet<Operation*>& dep, Operation* op,
Block* block) {
if (!dep.insert(op).second) return;
for (Value operand : op->getOperands()) {
Operation* defOp = operand.getDefiningOp();
if (defOp && defOp->getBlock() == block) addDepOps(dep, defOp, block);
}
}
/// Assigns stages to the loop ops. Simple logic by default: put loads from
/// global memory (and the ops they depend on) in stage 0 and the rest in the
/// last stage (`depth`). If the store stage is 0, also put the stores to
/// shared memory in stage 0.
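///
/// For example, with depth = 1 and stores in stage 1, a mainloop body like
/// (a sketch; shapes are hypothetical)
///   %v = vector.transfer_read %global[...]   // stage 0
///   vector.transfer_write %v, %shared[...]   // stage 1
///   ... compute ...                          // stage 1
/// overlaps the global load of iteration i+1 with the compute of iteration i.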
static void getPipelineStages(scf::ForOp forOp,
std::vector<std::pair<Operation*, unsigned>>& ops,
unsigned depth) {
if (!forOp->hasAttr(kPipeliningLoopMarker)) return;
// Track dependencies of stage 0 ops.
llvm::SmallDenseSet<Operation*> loadDep;
for (Operation& op : forOp.getBody()->getOperations()) {
if (op.hasAttr(kPipeliningFirstStage)) {
addDepOps(loadDep, &op, forOp.getBody());
}
}
  // Create a modulo schedule with the loads from global memory and the
  // operations they depend on in stage 0. Stores to shared memory and
  // computation are in stage `depth`. In order to have a correct schedule even
  // with back edges we order stages in decreasing order.
for (Operation& op : forOp.getBody()->getOperations()) {
if (!loadDep.count(&op) && !isa<scf::YieldOp>(op))
ops.push_back(std::make_pair(&op, depth));
}
for (Operation& op : forOp.getBody()->getOperations()) {
if (loadDep.count(&op)) ops.push_back(std::make_pair(&op, 0));
}
}
static void setAsyncAnnotations(Operation* op,
scf::PipeliningOption::PipelinerPart part,
unsigned iteration, unsigned depth,
PipeliningSchedulingStrategy schedule) {
if (auto waitOp = dyn_cast<nvgpu::DeviceAsyncWaitOp>(op)) {
    // Based on the order of copies within the loop we need to adjust the
    // number of copy groups in flight.
bool copyBeforeLoad =
schedule == PipeliningSchedulingStrategy::nvidiaTensorCore;
if (waitOp.getNumGroups()) return;
int numGroupInFlight = 0;
if (part == scf::PipeliningOption::PipelinerPart::Kernel ||
part == scf::PipeliningOption::PipelinerPart::Prologue) {
numGroupInFlight = copyBeforeLoad ? depth - 2 : depth - 1;
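      // e.g., with depth = 4 the kernel and prologue keep 3 copy groups in
      // flight, or 2 when copies are issued before the loads (as in the
      // nvidiaTensorCore schedule).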
} else {
      // By construction there should be no wait op in the prologue as all the
      // waits should be in the last stage.
assert(part == scf::PipeliningOption::PipelinerPart::Epilogue);
// Based on the schedule we pick we know how many groups are in flight for
// each iteration of the epilogue.
numGroupInFlight = depth - 1 - iteration;
}
OpBuilder b(op);
waitOp->setAttr(waitOp.getNumGroupsAttrName(),
b.getI32IntegerAttr(numGroupInFlight));
} else if (auto barrierOp = dyn_cast<gpu::BarrierOp>(op)) {
unsigned pipelineStoreStage =
schedule == PipeliningSchedulingStrategy::loadStoreStage0 ? 0 : 1;
if (pipelineStoreStage != 0 ||
part != mlir::scf::PipeliningOption::PipelinerPart::Prologue ||
iteration >= depth - 1)
return;
OpBuilder b(op);
barrierOp->setAttr(kPipeliningExtraBarrier, b.getUnitAttr());
}
}
/// Checks if the for operation contains a shared memory copy that can be
/// pipelined and annotates operations with stage information if this is the
/// case.
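///
/// The recognized copy is a single-use global -> shared transfer pair such as
/// (a sketch; types are hypothetical):
///   %v = vector.transfer_read %global[...] : memref<?xf32>, vector<4xf32>
///   vector.transfer_write %v, %shared[...] : vector<4xf32>, memref<?xf32, 3>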
static bool setPipeliningMarkers(scf::ForOp forOp,
                                 unsigned pipelineStoreStage) {
bool copyToWorkgroupMemory = false;
OpBuilder builder(forOp.getContext());
SmallVector<Operation*> barriers;
for (Operation& op : forOp.getBody()->getOperations()) {
    // Pipeline the innermost for op, which should be a flat region.
if (op.getNumRegions() > 0) return false;
if (isa<gpu::BarrierOp>(op)) {
barriers.push_back(&op);
if (pipelineStoreStage == 0)
op.setAttr(kPipeliningFirstStage, builder.getUnitAttr());
}
if (isa<nvgpu::DeviceAsyncCopyOp, nvgpu::DeviceAsyncCreateGroupOp>(op)) {
copyToWorkgroupMemory = true;
op.setAttr(kPipeliningFirstStage, builder.getUnitAttr());
      // Async copy ops need to be moved along with the preceding barriers.
for (Operation* barrier : barriers) {
barrier->setAttr(kPipeliningFirstStage, builder.getUnitAttr());
}
barriers.clear();
continue;
}
auto ld = dyn_cast<vector::TransferReadOp>(op);
if (!ld) continue;
auto ldSrcType = ld.getSource().getType().cast<MemRefType>();
if (!hasDefaultOrHALAddressSpace(ldSrcType) || !ld->hasOneUse()) continue;
auto st = dyn_cast<vector::TransferWriteOp>(ld->use_begin()->getOwner());
if (!st) continue;
auto stSrcType = st.getSource().getType().cast<MemRefType>();
if (!hasSharedMemoryAddressSpace(stSrcType)) continue;
copyToWorkgroupMemory = true;
ld->setAttr(kPipeliningFirstStage, builder.getUnitAttr());
if (pipelineStoreStage == 0)
st->setAttr(kPipeliningFirstStage, builder.getUnitAttr());
}
if (copyToWorkgroupMemory) {
forOp->setAttr(kPipeliningLoopMarker, builder.getUnitAttr());
if (pipelineStoreStage == 0 && !barriers.empty()) {
barriers.front()->erase();
}
}
return copyToWorkgroupMemory;
}
/// Warp-level Tensor Core operations.
/// This data structure holds the warp-level Tensor Core (mma.sync) operations
/// and their dependencies for one kgroup.
struct WarpMmaOp {
// Defining op and its dependencies for mma.sync's lhs/matrixA/OperandA.
llvm::SetVector<Operation*> lhsOperations;
// Defining op and its dependencies for mma.sync's rhs/matrixB/OperandB.
llvm::SetVector<Operation*> rhsOperations;
// Warp-level Tensor Core operations on operands in registers.
llvm::SetVector<Operation*> mmaOperations;
};
/// Structure to hold the matmul's mainloop information:
/// Separates the mma operations into kgroups and collects the Shared Memory
/// loads for each kgroup. This information is used to pipeline the mainloop
/// and to generate an optimal schedule interleaving Global Memory loads,
/// Shared Memory loads, and math operations.
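///
/// For example (a sketch, assuming a k-tile computed by two chained mma.sync
/// ops): the first mma.sync and the ldmatrix/load ops defining its lhs and rhs
/// form kgroup 0, and the second mma.sync, chained through the accumulator,
/// forms kgroup 1 together with its operand-defining ops.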
struct MainLoopInfo {
  // Mainloop asynchronous copy operations:
  // `cp.async` GlobalMemory -> SharedMemory
llvm::SetVector<Operation*> copyGlobalToSharedOps;
llvm::SetVector<Operation*> asyncCreateGroupOp;
llvm::SetVector<Operation*> barrierOps;
llvm::SetVector<Operation*> asyncWaitOps;
  // Mainloop asynchronous copy operation dependencies.
llvm::SetVector<Operation*> copyGlobalToSharedOpDeps;
  // Warp-level synchronous operations:
  // `ldmatrix`, `ld.shared`: SharedMemory -> Registers
  // `mma.sync`: Registers -> Tensor Cores.
llvm::SmallVector<WarpMmaOp, 4> warpOperations;
  // Set to track the dependencies already added to a backward slice.
llvm::SetVector<Operation*> seenDepOps;
  // Set to track the mma operations in the forward slice; used to count
  // kgroups and to populate the warp-level warpOperations.
llvm::SetVector<Operation*> seenMmaOps;
  // Boolean storing whether the mainloop can be pipelined (coarse-grained
  // scheduling) and its instructions interleaved (fine-grained scheduling).
bool isSchedulable = false;
  // Recursively populates `dependentOps` with the dependent operations of the
  // given op that are in the same block and not already added to the backward
  // slice of some other op.
void backwardSliceOfDependentOps(llvm::SetVector<Operation*>& dependentOps,
Operation* op, Block* block) {
if (!seenDepOps.insert(op)) return;
// Add the unseen op to the dependentOps and recurse on its operands.
dependentOps.insert(op);
for (Value operand : op->getOperands()) {
Operation* defOp = operand.getDefiningOp();
if (defOp && defOp->getBlock() == block)
backwardSliceOfDependentOps(dependentOps, defOp, block);
}
}
  // Collects the nvgpu.ldmatrix, memref.load, vector.extract_strided_slice, or
  // vector.insert operation that defines the given mma.sync operand, adding it
  // to the defining-operation set of the corresponding kgroup.
void mmaOperandDefOperation(Operation* op,
llvm::SetVector<Operation*>& defOperation,
Block* block) {
if (!op) return;
    // If the operation defining the mma.sync operand is one of the qualifying
    // operations, add it to the defining-operation set of the current kgroup.
if (isa<nvgpu::LdMatrixOp, memref::LoadOp, vector::ExtractStridedSliceOp,
vector::InsertOp>(op)) {
if (op->getBlock() == block) {
defOperation.insert(op);
}
return;
}
}
  // Recursively traverses the chain of mma operations for all kgroups, from
  // kgroup 0 (start) to numKgroups (the chain ends at scf.yield).
  // Assumption: the mma operations form a chain in monotonically increasing
  // kgroup order.
  void visitMmaSyncOp(Operation* op, int kgroup) {
    // If the operation is an `scf.yield` we reached the end of the MmaSyncOp
    // chain; return.
    if (seenMmaOps.count(op) || isa<scf::YieldOp>(op)) return;
seenMmaOps.insert(op);
    // If this kgroup is not yet in the vector, create a new WarpMmaOp for it.
    if (static_cast<int>(warpOperations.size()) < kgroup + 1)
      warpOperations.push_back(WarpMmaOp());
mmaOperandDefOperation(op->getOperand(0).getDefiningOp(),
warpOperations[kgroup].lhsOperations,
op->getBlock());
mmaOperandDefOperation(op->getOperand(1).getDefiningOp(),
warpOperations[kgroup].rhsOperations,
op->getBlock());
warpOperations[kgroup].mmaOperations.insert(op);
    visitMmaSyncOp(op->getUses().begin()->getOwner(), kgroup + 1);
}
MainLoopInfo(scf::ForOp forOp) : isSchedulable(true) { analyze(forOp); }
  // Iterates through the mainloop and collects `cp.async`, `cp.commit_group`,
  // `cp.wait_group`, and `barrier` operations. These operations are used to
  // pipeline the mainloop and choreograph asynchrony for a *coarse-grained*
  // schedule. Additionally, collects the `mma.sync` and `ldmatrix`/`ld.shared`
  // operations and separates them into kgroups. This information is helpful in
  // generating an optimal *fine-grained* instruction interleaving of Global
  // Memory loads, Shared Memory loads, and math operations.
void analyze(scf::ForOp forOp) {
for (Operation& op : forOp.getBody()->getOperations()) {
      if (op.getNumRegions() > 0) {
        // Pipeline and schedule the innermost for op, i.e., the mainloop,
        // which should be a flat region.
isSchedulable = false;
return;
}
if (isa<nvgpu::DeviceAsyncCopyOp>(op)) {
copyGlobalToSharedOps.insert(&op);
}
if (isa<nvgpu::DeviceAsyncCreateGroupOp>(op)) {
asyncCreateGroupOp.insert(&op);
}
if (isa<gpu::BarrierOp>(op)) {
barrierOps.insert(&op);
}
if (isa<nvgpu::DeviceAsyncWaitOp>(op)) {
asyncWaitOps.insert(&op);
}
      if (isa<nvgpu::MmaSyncOp>(op)) {
        // The MmaSyncOp visitor traverses the chain of mma operations and
        // separates them into kgroups.
        visitMmaSyncOp(&op, /*kgroup=*/0);
}
}
    // Debug-print the warpOperations, kgroup by kgroup.
    LLVM_DEBUG({
      for (size_t i = 0; i < warpOperations.size(); ++i) {
llvm::dbgs() << "kgroup: " << i << "\n";
llvm::dbgs() << "mma.sync: \n";
for (auto op : warpOperations[i].mmaOperations) {
op->dump();
}
llvm::dbgs() << "\n";
llvm::dbgs() << "defining operations for lhs: \n";
for (auto op : warpOperations[i].lhsOperations) {
op->dump();
}
llvm::dbgs() << "\n";
llvm::dbgs() << "defining operations for rhs: \n";
for (auto op : warpOperations[i].rhsOperations) {
op->dump();
}
llvm::dbgs() << "\n";
}
});
// If one of the ingredients (`cp.async`, `cp.commit_group`,
// `cp.wait_group`, `bar.sync`, `mma.sync`, `ldmatrix` or `ld.shared`) for
// scheduling is missing, the mainloop cannot be scheduled.
if (copyGlobalToSharedOps.empty() || asyncCreateGroupOp.empty() ||
asyncWaitOps.empty() || barrierOps.empty() || warpOperations.empty()) {
isSchedulable = false;
return;
}
    // Collect the dependent operations of `cp.async` in mainloop order for
    // coarse-grained software pipelining. The dependencies are collected in
    // stage order, i.e., the stage-0 `cp.async` dependencies are collected
    // first.
for (Operation& op : forOp.getBody()->getOperations()) {
if (isa<nvgpu::DeviceAsyncCopyOp>(&op)) {
backwardSliceOfDependentOps(copyGlobalToSharedOpDeps, &op,
forOp.getBody());
}
}
    // Collect the dependent operations for the `mma.sync`, lhs, and rhs
    // defining operations. The operations and their dependencies are separated
    // by kgroup for fine-grained instruction scheduling.
for (int kgroup = 0; kgroup < getNumberOfKgroups(); ++kgroup) {
for (Operation& op : forOp.getBody()->getOperations()) {
if (isa<nvgpu::LdMatrixOp, memref::LoadOp,
vector::ExtractStridedSliceOp, vector::InsertOp>(&op)) {
if (warpOperations[kgroup].lhsOperations.count(&op)) {
backwardSliceOfDependentOps(warpOperations[kgroup].lhsOperations,
&op, forOp.getBody());
}
if (warpOperations[kgroup].rhsOperations.count(&op)) {
backwardSliceOfDependentOps(warpOperations[kgroup].rhsOperations,
&op, forOp.getBody());
}
}
}
for (Operation& op : forOp.getBody()->getOperations()) {
if (isa<nvgpu::MmaSyncOp>(&op)) {
if (warpOperations[kgroup].mmaOperations.count(&op)) {
backwardSliceOfDependentOps(warpOperations[kgroup].mmaOperations,
&op, forOp.getBody());
}
}
}
}
}
// Returns the number of kgroups in the Warp-level MMA operations.
int getNumberOfKgroups() { return warpOperations.size(); }
};
/// Debug-prints the mainloop schedule: the pipeline stage assigned to each
/// mainloop instruction for the given number of stages and kgroups.
static void debugMainloopSchedule(
MainLoopInfo& mainloop, int numStages,
std::vector<std::pair<Operation*, unsigned>>& ops) {
LLVM_DEBUG({
llvm::dbgs() << "//--- Mainloop schedule generated for Nvidia Ampere "
"mma.sync TensorCore Pipeline. ---//\n";
llvm::dbgs() << " Number of stages: " << numStages << "\n";
llvm::dbgs() << " Number of kgroups: " << mainloop.getNumberOfKgroups()
<< "\n";
llvm::dbgs() << " Number of mainloop instructions " << ops.size() << "\n";
llvm::dbgs() << " Mainloop instructions schedule and stage assignment: \n";
    for (auto& [op, stage] : ops) {
      llvm::dbgs() << " Stage (" << stage << "), Operation: ";
      op->dump();
    }
llvm::dbgs() << "\n\n";
});
}
/// This function returns a *coarse-grained* stage assignment for software
/// pipelining of the mainloop and a *fine-grained* instruction interleaving.
/// The schedule provides good performance on the NVIDIA Ampere architecture
/// using an Ampere-style multi-staged pipeline.
///
/// @param forOp the mainloop to pipeline and schedule.
/// @param ops a vector of pairs: [(operations, pipeline_stage)].
/// @param numStages the total number of pipeline stages used for multi-buffer.
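///
/// For example, with numStages = 3 and 2 kgroups, the schedule built below is
/// (pipeline stage in parentheses; a sketch of the emitted order):
///   smem loads for kgroup 1 (2); mma.sync for kgroup 0 (2);
///   all cp.async + cp.async.commit_group (0);
///   cp.async.wait_group + barrier (1);
///   smem loads for kgroup 0 (1); mma.sync for kgroup 1 (2).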
static void getNvidiaAmpereTensorCorePipeline(
scf::ForOp forOp, std::vector<std::pair<Operation*, unsigned>>& ops,
unsigned numStages) {
// Analyze the main loop and obtain information for coarse-grained pipelining
// and fine-grained instruction scheduling.
MainLoopInfo mainloop(forOp);
// If the mainloop is not schedulable, return an empty schedule.
if (!mainloop.isSchedulable) return;
  // The NVIDIA Ampere Tensor Core multi-staged pipeline requires at least 2
  // kgroups and 3 software pipeline stages. If these conditions are not met,
  // return an empty schedule.
int numKgroups = mainloop.getNumberOfKgroups();
if (numKgroups < 2 || numStages < 3) {
return;
}
  // The un-pipelined mainloop should have exactly one occurrence each of
  // cp.async.commit_group and cp.async.wait_group. Additionally, two barrier
  // ops are inserted around the staged copy. The barrier op before the copy is
  // unnecessary and will be removed. If these conditions are not met, return
  // an empty schedule.
  if (mainloop.asyncCreateGroupOp.size() != 1 ||
      mainloop.asyncWaitOps.size() != 1 || mainloop.barrierOps.size() != 2) {
return;
}
// Start pipelining and scheduling the main loop, all kgroups but the last
// one.
for (int kgroup = 0; kgroup < numKgroups - 1; kgroup++) {
    // Fine-grained instruction scheduling: interleave Shared Memory loads and
    // mma.sync operations to hide load latencies.
    // Load the next kgroup into registers.
for (Operation& op : forOp.getBody()->getOperations()) {
if (mainloop.warpOperations[kgroup + 1].lhsOperations.count(&op) ||
mainloop.warpOperations[kgroup + 1].rhsOperations.count(&op)) {
ops.push_back(std::make_pair(&op, numStages - 1));
}
}
    // Issue mma.sync on the previously loaded kgroup.
for (Operation& op : forOp.getBody()->getOperations()) {
if (mainloop.warpOperations[kgroup].mmaOperations.count(&op))
ops.push_back(std::make_pair(&op, numStages - 1));
}
}
  // Coarse-grained instruction pipelining: pipeline the Global Memory
  // transfers (GMEM -> SMEM) several stages in advance.
  // Schedule all cp.async ops and one cp.async.commit_group.
  // TODO: Distribute the cp.async ops throughout the mainloop rather than
  // concentrating them in one place.
for (Operation& op : forOp.getBody()->getOperations()) {
if (mainloop.copyGlobalToSharedOpDeps.count(&op))
ops.push_back(std::make_pair(&op, 0 /*pipelineStage*/));
}
ops.push_back(
std::make_pair(mainloop.asyncCreateGroupOp[0], 0 /*pipelineStage*/));
  // Schedule and pipeline the cp.async.wait_group and the remaining barrier.
ops.push_back(std::make_pair(mainloop.asyncWaitOps[0], numStages - 2));
mainloop.barrierOps[0]->erase();
ops.push_back(std::make_pair(mainloop.barrierOps[1], numStages - 2));
  // Coarse-grained instruction pipelining: pipeline the Shared Memory loads
  // (SMEM -> Registers) for the first kgroup (kgroup = 0) one stage in
  // advance.
for (Operation& op : forOp.getBody()->getOperations()) {
if (mainloop.warpOperations[0].lhsOperations.count(&op) ||
mainloop.warpOperations[0].rhsOperations.count(&op))
ops.push_back(std::make_pair(&op, numStages - 2));
}
  // Issue mma.sync for the last kgroup at the end of the mainloop.
for (Operation& op : forOp.getBody()->getOperations()) {
if (mainloop.warpOperations[numKgroups - 1].mmaOperations.count(&op))
ops.push_back(std::make_pair(&op, numStages - 1));
}
  // Print the mainloop schedule generated for NVIDIA Ampere using native
  // Tensor Core operations (asynchronous copy, load matrix, and mma.sync).
debugMainloopSchedule(mainloop, numStages, ops);
}
// Apply pipeline rewrite pattern assuming the operations were already
// annotated with stage information.
// TODO: move away from using attribute annotations.
static FailureOr<scf::ForOp> applyPipelining(
scf::ForOp forOp, int64_t depth, bool epiloguePeeling,
PipeliningSchedulingStrategy schedule) {
// TODO: Refactor schedules to not rely on markers.
if (schedule == PipeliningSchedulingStrategy::loadGlobalStage0 ||
schedule == PipeliningSchedulingStrategy::loadStoreStage0) {
    unsigned pipelineStoreStage =
        schedule == PipeliningSchedulingStrategy::loadStoreStage0 ? 0 : 1;
if (!setPipeliningMarkers(forOp, pipelineStoreStage)) {
return failure();
}
}
scf::PipeliningOption options;
unsigned maxDepth = depth;
auto getSchedule = [maxDepth, schedule](
scf::ForOp forOp,
std::vector<std::pair<Operation*, unsigned>>& ops) {
if (schedule == PipeliningSchedulingStrategy::nvidiaTensorCore) {
return getNvidiaAmpereTensorCorePipeline(forOp, ops, maxDepth);
}
return getPipelineStages(forOp, ops, maxDepth);
};
auto setAnnotation = [maxDepth, schedule](
Operation* op,
scf::PipeliningOption::PipelinerPart part,
unsigned iteration) {
return setAsyncAnnotations(op, part, iteration, maxDepth, schedule);
};
options.getScheduleFn = getSchedule;
options.annotateFn = setAnnotation;
  // Use an un-peeled epilogue (i.e., epiloguePeeling = false) only when
  // predication is available, i.e., for AsyncCopyOp.
if (!epiloguePeeling) {
options.peelEpilogue = false;
options.predicateFn = [](RewriterBase& rewriter, Operation* op,
Value pred) {
return replaceOpWithPredicatedOp(rewriter, op, pred);
};
}
scf::ForLoopPipeliningPattern pattern(options, forOp->getContext());
IRRewriter rewriter(forOp->getContext());
rewriter.setInsertionPoint(forOp);
return pipelineForLoop(rewriter, forOp, options);
}
namespace {
struct GPUPipeliningPass : public GPUPipeliningBase<GPUPipeliningPass> {
GPUPipeliningPass(bool epiloguePeeling, int64_t depth,
PipeliningSchedulingStrategy schedule)
: depth(depth), schedule(schedule), epiloguePeeling(epiloguePeeling) {}
void initOptions() {
if (GPUPipeliningBase::depth.hasValue())
depth = GPUPipeliningBase::depth.getValue();
if (GPUPipeliningBase::epiloguePeeling.hasValue())
epiloguePeeling = GPUPipeliningBase::epiloguePeeling.getValue();
if (GPUPipeliningBase::scheduleIndex.hasValue())
schedule = (PipeliningSchedulingStrategy)
GPUPipeliningBase::scheduleIndex.getValue();
}
void runOnOperation() override {
initOptions();
auto funcOp = getOperation();
SmallVector<scf::ForOp> forOps;
    // Collect all the for loops; the loops containing a shared memory copy are
    // marked and pipelined below.
funcOp.walk([&forOps](scf::ForOp forOp) { forOps.push_back(forOp); });
for (scf::ForOp forOp : forOps) {
(void)applyPipelining(forOp, depth, epiloguePeeling, schedule);
}
// Remove extra barriers from the prologue assuming appropriate
// multi-buffering.
funcOp.walk([](gpu::BarrierOp barrierOp) {
if (barrierOp->hasAttr(kPipeliningExtraBarrier)) barrierOp->erase();
});
}
private:
int64_t depth;
PipeliningSchedulingStrategy schedule;
bool epiloguePeeling;
};
} // namespace
FailureOr<scf::ForOp> pipelineSharedMemoryCopy(
RewriterBase& rewriter, scf::ForOp forOp,
PipeliningSchedulingStrategy strategy, bool peelEpilogue, int64_t depth) {
return applyPipelining(forOp, depth, peelEpilogue, strategy);
}
/// Pass options:
/// epiloguePeeling - enable/disable epilogue peeling.
///   true:  peel the epilogue (no additional checks required).
///   false: try to use an un-peeled epilogue (requires that predication is
///          available).
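///
/// Example (a hypothetical pipeline setup):
///   pm.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
///       /*epiloguePeeling=*/false, /*depth=*/4,
///       PipeliningSchedulingStrategy::nvidiaTensorCore));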
std::unique_ptr<OperationPass<func::FuncOp>> createGPUPipeliningPass(
bool epiloguePeeling, unsigned depth,
PipeliningSchedulingStrategy schedule) {
return std::make_unique<GPUPipeliningPass>(epiloguePeeling, depth, schedule);
}
} // namespace iree_compiler
} // namespace mlir