// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//===- ConvertToGPUPass.cpp -----------------------------------------------===//
//
// Partition computation within dispatch function to workgroups/workitems.
//
//===----------------------------------------------------------------------===//
#include <array>
#include "iree/compiler/Conversion/LinalgToSPIRV/Attributes.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/FunctionSupport.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
namespace mlir {
namespace iree_compiler {
static llvm::cl::opt<bool> isWorkgroupCountConstrained(
"iree-codegen-constrained-workgroup-count",
llvm::cl::desc("Specify whether the number of workgroups can be assumed to "
"be large enough to cover the entire workload"),
llvm::cl::init(false));
// TODO(#2134): Remove this flag (or set it to false by default) when the
// issue with convolution is resolved (see bug for more details).
// TODO(#2346): Make this a pass specific option.
llvm::cl::opt<bool> useLegacyConvLowering{
"iree-codegen-use-legacy-conv-lowering",
llvm::cl::desc("Use conv lowering that does not assume 1:1 mapping "
"between threads within a block and iterations of "
"parallel loops distributed to the block"),
llvm::cl::init(true)};
//===----------------------------------------------------------------------===//
// Loop utilities
//===----------------------------------------------------------------------===//
/// Builds an empty scf.for operation. The default builder adds an entry basic
/// block which needs to be avoided here.
static scf::ForOp buildEmptyForOp(Location loc, OpBuilder &builder, Value lb,
Value ub, Value step) {
OperationState state(loc, scf::ForOp::getOperationName());
state.addOperands({lb, ub, step});
state.addRegion();
return cast<scf::ForOp>(builder.createOperation(state));
}
/// Builds an empty scf.if operation without the then and else blocks.
static scf::IfOp buildEmptyIfOp(Location loc, OpBuilder &builder, Value cond) {
OperationState state(loc, scf::IfOp::getOperationName());
state.addOperands(cond);
state.addRegion();
state.addRegion();
return cast<scf::IfOp>(builder.createOperation(state));
}
namespace {
struct LoopBounds {
Value lb;
Value ub;
Value step;
};
} // namespace
/// Replaces a scf.parallel op with an optional outer scf.parallel op and
/// nested scf.for operations. To create the scf.parallel op as the outermost
/// loop, pass its lower bounds, upper bounds, and steps in `newPLoopBounds`.
/// The bounds of the inner scf.for operations to be created are passed in
/// `forBounds`. The `permutation` vector maps each loop of the original
/// operation to its position in the generated loop nest.
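///
/// For illustration, a sketch (the IR is illustrative, not verbatim output):
/// with one entry in `newPLoopBounds` and one in `forBounds`, a 2-loop
/// scf.parallel is rewritten roughly as
///
///   scf.parallel (%iv0) = (%lb0) to (%ub0) step (%s0) {
///     scf.for %iv1 = %lb1 to %ub1 step %s1 {
///       <original body, with its induction variables remapped>
///     }
///   }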
static Operation *replacePLoopOp(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp,
ArrayRef<LoopBounds> newPLoopBounds,
ArrayRef<LoopBounds> forBounds,
ArrayRef<unsigned> permutation) {
assert(!forBounds.empty() && "unhandled case of no scf.for created");
unsigned numLoops = pLoopOp.getNumLoops();
Location loc = pLoopOp.getLoc();
assert(forBounds.size() + newPLoopBounds.size() == numLoops &&
"cannot drop loops when splitting scf.parallel operation");
assert(permutation.size() == numLoops);
OpBuilder::InsertionGuard guard(rewriter);
// Need a signature conversion for the body of the scf.parallel operation,
// before it can be used as the body of the innermost loop created here.
TypeConverter::SignatureConversion signatureConverter(numLoops);
Operation *outermostLoop = nullptr;
auto permuteIt = permutation.begin();
// Create the scf.parallel operation as the outermost loop, if specified.
if (!newPLoopBounds.empty()) {
auto lbs = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) -> Value { return bounds.lb; }));
auto ubs = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) { return bounds.ub; }));
auto steps = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) { return bounds.step; }));
auto newPLoop = rewriter.create<scf::ParallelOp>(loc, lbs, ubs, steps);
for (auto iv : newPLoop.getInductionVars()) {
signatureConverter.remapInput(*permuteIt, iv);
permuteIt++;
}
rewriter.setInsertionPointToStart(newPLoop.getBody());
outermostLoop = newPLoop.getOperation();
}
// Generate the nested scf.for operations with the bounds passed.
for (auto it : enumerate(forBounds)) {
Value lb = it.value().lb, ub = it.value().ub, step = it.value().step;
if (it.index() != forBounds.size() - 1) {
auto forOp = rewriter.create<scf::ForOp>(loc, lb, ub, step);
if (!outermostLoop) outermostLoop = forOp.getOperation();
signatureConverter.remapInput(*permuteIt, forOp.getInductionVar());
rewriter.setInsertionPointToStart(forOp.getBody());
} else {
// For the last loop, move the body of the scf.parallel op as the body of
// the loop after signature conversion.
auto forOp = buildEmptyForOp(loc, rewriter, lb, ub, step);
if (!outermostLoop) outermostLoop = forOp.getOperation();
signatureConverter.addInputs(*permuteIt, rewriter.getIndexType());
Region &pLoopOpRegion = pLoopOp.getLoopBody();
rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter);
Region &forOpRegion = forOp.getLoopBody();
rewriter.inlineRegionBefore(pLoopOpRegion, forOpRegion,
forOpRegion.begin());
}
permuteIt++;
}
rewriter.eraseOp(pLoopOp);
return outermostLoop;
}
/// Serializes the dimensions of the scf.parallel specified in
/// `serializedDimensions`, by creating a nested scf.for operation for each
/// such dimension.
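///
/// For illustration, a sketch (the IR is illustrative, not verbatim output):
/// serializing dimension 2 of a 3-loop scf.parallel keeps dimensions 0 and 1
/// parallel and nests an scf.for for dimension 2 inside them, roughly
///
///   scf.parallel (%iv0, %iv1) = (%lb0, %lb1) to (%ub0, %ub1)
///       step (%s0, %s1) {
///     scf.for %iv2 = %lb2 to %ub2 step %s2 { <original body> }
///   }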
// TODO(ravishankarm): Move this into LoopUtils.h in MLIR.
static Operation *serializeDimensions(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp,
ArrayRef<unsigned> serializedDimensions) {
assert(!serializedDimensions.empty() &&
"unhandled corner case of no serializing dims");
OpBuilder::InsertionGuard guard(rewriter);
DenseSet<unsigned> serializedDimSet;
serializedDimSet.insert(serializedDimensions.begin(),
serializedDimensions.end());
assert(serializedDimSet.size() == serializedDimensions.size() &&
"cannot repeat dimensions during serialization of scf.parallel");
SmallVector<LoopBounds, 2> newPLoopBounds, forBounds;
SmallVector<unsigned, 2> permutation;
auto lbs = pLoopOp.lowerBound();
auto ubs = pLoopOp.upperBound();
auto steps = pLoopOp.step();
for (unsigned i : llvm::seq<unsigned>(0, pLoopOp.getNumLoops())) {
if (serializedDimSet.count(i)) {
forBounds.push_back({lbs[i], ubs[i], steps[i]});
} else {
newPLoopBounds.push_back({lbs[i], ubs[i], steps[i]});
permutation.push_back(i);
}
}
permutation.append(serializedDimensions.begin(), serializedDimensions.end());
return replacePLoopOp(rewriter, pLoopOp, newPLoopBounds, forBounds,
permutation);
}
/// Serializes all inner dimensions of `pLoopOp` starting from `serializeFrom`.
static Operation *serializeDimensionsFrom(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp,
unsigned serializeFrom) {
unsigned numLoops = pLoopOp.getNumLoops();
assert(serializeFrom > 0 && "unhandled case of serializing all dimensions");
assert(serializeFrom < numLoops &&
"unhandled corner case of no serialization");
SmallVector<unsigned, 2> serializedDimensions;
for (unsigned dim : llvm::seq(serializeFrom, numLoops))
serializedDimensions.push_back(dim);
return serializeDimensions(rewriter, pLoopOp, serializedDimensions);
}
/// Collapses all loops in a scf.parallel into one scf.parallel operation. This
/// is done by
/// 1) Normalizing the loop bounds to be [0, (ub - lb) / step),
/// 2) Computing the total number of iterations, and
/// 3) Computing the values of the original induction variables from the
///    induction variable of the collapsed loop by de-linearization.
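///
/// For illustration, a sketch (not verbatim output): a 2-loop scf.parallel
/// with trip counts N0 and N1 becomes a single loop of N0 * N1 iterations
/// whose body recovers the original induction variables as
///   iv0 = lb0 + (iv / N1) * step0
///   iv1 = lb1 + (iv % N1) * step1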
scf::ParallelOp collapseParallelLoops(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp) {
if (pLoopOp.getNumReductions()) return nullptr;
unsigned numLoops = pLoopOp.getNumLoops();
if (numLoops == 1) return pLoopOp;
// Compute the number of iterations of each loop, starting from the innermost.
Location loc = pLoopOp.getLoc();
Value totalNumIterations = rewriter.create<ConstantIndexOp>(loc, 1);
// Track the "stride" of each loop, i.e. product of the total number of
// iterations of the inner loops.
SmallVector<Value, 2> iterationStride;
iterationStride.resize(pLoopOp.getNumLoops());
auto lbs = pLoopOp.lowerBound();
auto ubs = pLoopOp.upperBound();
auto steps = pLoopOp.step();
for (int i = numLoops - 1; i >= 0; --i) {
Value lb = lbs[i], ub = ubs[i], step = steps[i];
Value iterCount = rewriter.create<SignedDivIOp>(
loc, rewriter.create<SubIOp>(loc, ub, lb), step);
iterationStride[i] = totalNumIterations;
totalNumIterations =
rewriter.create<MulIOp>(loc, totalNumIterations, iterCount);
}
// Create the collapsed parallel loop op with lower bound 0, step 1, and upper
// bound totalNumIterations.
Value newLb = rewriter.create<ConstantIndexOp>(loc, 0);
Value newStep = rewriter.create<ConstantIndexOp>(loc, 1);
scf::ParallelOp newPLoopOp =
rewriter.create<scf::ParallelOp>(loc, newLb, totalNumIterations, newStep);
// Build the body of the collapsed loop by cloning the original loop body. The
// replacement values of the original induction variables are computed from the
// induction variable of the new loop as
//   origLoopIv[i] = lb[i] + (loopIv / iterationStride[i]) * step[i]
//   loopIv = loopIv % iterationStride[i]
OpBuilder::InsertionGuard guard(rewriter);
Block &pLoopBody = pLoopOp.getLoopBody().front();
rewriter.setInsertionPointToStart(&newPLoopOp.getLoopBody().front());
Value loopIv = *newPLoopOp.getInductionVars().begin();
BlockAndValueMapping map;
for (int i : llvm::seq<int>(0, numLoops)) {
Value iterNum =
rewriter.create<SignedDivIOp>(loc, loopIv, iterationStride[i]);
Value newIv = rewriter.create<AddIOp>(
loc, lbs[i], rewriter.create<MulIOp>(loc, iterNum, steps[i]));
map.map(pLoopBody.getArgument(i), newIv);
loopIv = rewriter.create<SignedRemIOp>(loc, loopIv, iterationStride[i]);
}
for (Operation &op : pLoopBody.without_terminator()) {
rewriter.clone(op, map);
}
rewriter.eraseOp(pLoopOp);
return newPLoopOp;
}
//===----------------------------------------------------------------------===//
// GPU processor ID mapping utilities
//===----------------------------------------------------------------------===//
/// Distributes scf.parallel to processors with the processors logically
/// arranged with the same dimensionality as the number of loops, i.e. a
/// scf.parallel with 2 loops maps to a 2D grid of processors. `processorIDs`
/// and `numProcessors` must have the same size as the number of loops and
/// contain the values to use for the processor ID and the number of processors
/// along each dimension in the distributed code.
/// This method accounts for the case where the number of processors is not
/// enough to execute the entire iteration space with one iteration mapped to
/// each processor, so it implements a cyclic distribution of iterations to
/// processors.
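///
/// For illustration, a sketch (not verbatim output): each loop of the
/// scf.parallel, with bounds (%lb, %ub, %step), becomes an scf.for in which
/// processor `pid` out of `nprocs` executes
///   scf.for %iv = (%lb + %pid * %step) to %ub step (%step * %nprocs)
/// i.e. iterations are assigned to processors round-robin.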
static LogicalResult distributeCyclicallyToProcessors(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
ArrayRef<Value> processorIDs, ArrayRef<Value> numProcessors) {
unsigned numLoops = pLoopOp.getNumLoops();
assert(numLoops == processorIDs.size() &&
"expected as many ids as number of loops");
assert(numLoops == numProcessors.size() &&
"expected as many nprocs as number of loops");
SmallVector<LoopBounds, 2> forBounds;
SmallVector<unsigned, 2> permutation;
forBounds.reserve(numLoops);
permutation.reserve(numLoops);
Location loc = pLoopOp.getLoc();
auto lbs = pLoopOp.lowerBound(), ubs = pLoopOp.upperBound(),
steps = pLoopOp.step();
for (unsigned i : llvm::seq<unsigned>(0, processorIDs.size())) {
Value mappedLb = rewriter.create<AddIOp>(
loc, lbs[i], rewriter.create<MulIOp>(loc, steps[i], processorIDs[i]));
Value mappedStep = rewriter.create<MulIOp>(loc, steps[i], numProcessors[i]);
forBounds.push_back({mappedLb, ubs[i], mappedStep});
permutation.push_back(i);
}
replacePLoopOp(rewriter, pLoopOp, /*newPLoopBounds=*/{}, forBounds,
permutation);
return success();
}
/// Distributes scf.parallel to processors with the processors logically
/// arranged with the same dimensionality as the number of loops, i.e. a
/// scf.parallel with 2 loops maps to a 2D grid of processors. `processorIDs`
/// must have the same size as the number of loops and contains the values to
/// use for the processor ID along each dimension in the distributed code. This
/// method assumes that the number of processors is greater than or equal to
/// the number of iterations, so it only generates an if statement to mask off
/// processors with no work. When the number of processors is known to be
/// exactly equal to the number of iterations, the if statement is not needed
/// either. In such cases, `generateGuard` can be set to `false` to avoid
/// generating it.
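///
/// For illustration, a sketch (not verbatim output): with `generateGuard`
/// set, a 1-loop scf.parallel becomes roughly
///
///   %iv = addi %lb, (muli %pid, %step)
///   %cond = cmpi "slt", %iv, %ub
///   scf.if %cond {
///     <original body, with the induction variable replaced by %iv>
///   }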
static LogicalResult distributeSingleIterationPerProcessor(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
ArrayRef<Value> processorIDs, bool generateGuard = false) {
unsigned numLoops = pLoopOp.getNumLoops();
Location loc = pLoopOp.getLoc();
assert(numLoops == processorIDs.size() &&
"expected as many ids as number of loops");
auto lbs = pLoopOp.lowerBound();
auto step = pLoopOp.step();
SmallVector<Value, 2> ivReplacements;
for (unsigned i : llvm::seq<unsigned>(0, numLoops)) {
Value iterValue = rewriter.create<AddIOp>(
loc, lbs[i], rewriter.create<MulIOp>(loc, processorIDs[i], step[i]));
ivReplacements.push_back(iterValue);
}
Region &pLoopOpRegion = pLoopOp.getLoopBody();
if (generateGuard) {
TypeConverter::SignatureConversion signatureConverter(numLoops);
Value cond = nullptr;
auto ubs = pLoopOp.upperBound();
for (unsigned i : llvm::seq<unsigned>(0, numLoops)) {
Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::slt,
ivReplacements[i], ubs[i]);
cond = (cond ? rewriter.create<AndOp>(loc, cond, cmp) : cmp);
signatureConverter.remapInput(i, ivReplacements[i]);
}
rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter);
scf::IfOp ifOp = buildEmptyIfOp(loc, rewriter, cond);
Region &ifOpRegion = ifOp.getRegion(0);
rewriter.inlineRegionBefore(pLoopOpRegion, ifOpRegion, ifOpRegion.begin());
} else {
// The body of the scf.parallel needs to be moved into its parent
// operation:
// - Split the block just before the scf.parallel operation.
// - Erase the terminator of the scf.parallel body, and merge that body into
//   the block containing the scf.parallel, replacing the induction
//   variables with the values computed above.
// - Merge the block created by the split back in, so the remaining
//   operations follow the inlined body.
Block *destBlock = pLoopOp.getOperation()->getBlock();
Block *remainingInst =
rewriter.splitBlock(destBlock, Block::iterator(pLoopOp));
Block *sourceBlock = &pLoopOpRegion.front();
rewriter.eraseOp(sourceBlock->getTerminator());
rewriter.mergeBlocks(&pLoopOpRegion.front(), destBlock, ivReplacements);
rewriter.mergeBlocks(remainingInst, destBlock, {});
}
rewriter.eraseOp(pLoopOp);
return success();
}
namespace {
struct ProcessorIdAndCount {
Value id;
Value count;
};
/// These are class declarations that are only used for template
/// specialization. They won't be needed if the GPU dialect gets ops for the
/// global invocation ID directly.
class GPUGlobalId;
class GPUGlobalCount;
} // namespace
template <typename GPUIdOp, typename GPUCountOp>
static ProcessorIdAndCount getGPUProcessorIdAndCount(
Location loc, StringRef dim, ConversionPatternRewriter &rewriter) {
Type indexType = rewriter.getIndexType();
return {
rewriter.create<GPUIdOp>(loc, indexType, rewriter.getStringAttr(dim)),
rewriter.create<GPUCountOp>(loc, indexType, rewriter.getStringAttr(dim))};
}
template <>
ProcessorIdAndCount getGPUProcessorIdAndCount<GPUGlobalId, GPUGlobalCount>(
Location loc, StringRef dim, ConversionPatternRewriter &rewriter) {
Type indexType = rewriter.getIndexType();
Value gridDim = rewriter.create<gpu::GridDimOp>(loc, indexType,
rewriter.getStringAttr(dim));
Value blockId = rewriter.create<gpu::BlockIdOp>(loc, indexType,
rewriter.getStringAttr(dim));
Value blockDim = rewriter.create<gpu::BlockDimOp>(
loc, indexType, rewriter.getStringAttr(dim));
Value threadId = rewriter.create<gpu::ThreadIdOp>(
loc, indexType, rewriter.getStringAttr(dim));
return {rewriter.create<AddIOp>(
loc, rewriter.create<MulIOp>(loc, blockId, blockDim), threadId),
rewriter.create<MulIOp>(loc, blockDim, gridDim)};
}
template <typename GPUIdOp, typename GPUCountOp>
static void getGPUProcessorIdsAndCounts(Location loc,
ConversionPatternRewriter &rewriter,
unsigned numDims,
MutableArrayRef<Value> id,
MutableArrayRef<Value> count) {
std::array<StringRef, 3> dims{"x", "y", "z"};
assert(id.size() == numDims);
assert(count.size() == numDims);
for (unsigned i = 0; i < numDims; ++i) {
ProcessorIdAndCount idAndCount =
getGPUProcessorIdAndCount<GPUIdOp, GPUCountOp>(loc, dims[i], rewriter);
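// Fill the arrays in reverse: dimension "x" corresponds to the last
// (innermost) distributed loop.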
id[numDims - 1 - i] = idAndCount.id;
count[numDims - 1 - i] = idAndCount.count;
}
}
/// Distributes scf.parallel to processors where `GPUIdOp` is used to get the
/// processor ID and `GPUCountOp` is used to get the number of processors along
/// a dimension.
template <typename GPUIdOp, typename GPUCountOp>
static LogicalResult distributeCyclicallyToProcessors(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) {
unsigned numLoops = pLoopOp.getNumLoops();
if (numLoops > 3) {
pLoopOp =
cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3));
numLoops = 3;
}
SmallVector<Value, 2> id(numLoops), count(numLoops);
getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter,
numLoops, id, count);
return distributeCyclicallyToProcessors(rewriter, pLoopOp, id, count);
}
/// Distributes scf.parallel to processors where `GPUIdOp` is used to get the
/// processor ID and `GPUCountOp` is used to get the number of processors along
/// a dimension. Assumes that the number of processors is greater than or equal
/// to the number of iterations of the pLoopOp along all dimensions.
template <typename GPUIdOp, typename GPUCountOp>
static LogicalResult distributeSingleIterationPerProcessor(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
bool generateGuard = true) {
unsigned numLoops = pLoopOp.getNumLoops();
if (numLoops > 3) {
pLoopOp =
cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3));
numLoops = 3;
}
SmallVector<Value, 2> id(numLoops), count(numLoops);
getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter,
numLoops, id, count);
return distributeSingleIterationPerProcessor(rewriter, pLoopOp, id,
generateGuard);
}
/// Distributes the scf.parallel to workgroups.
static LogicalResult mapToWorkgroups(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp,
bool useCyclicDistribution = false) {
if (useCyclicDistribution)
return distributeCyclicallyToProcessors<gpu::BlockIdOp, gpu::GridDimOp>(
rewriter, pLoopOp);
return distributeSingleIterationPerProcessor<gpu::BlockIdOp, gpu::GridDimOp>(
rewriter, pLoopOp, false);
}
/// Distributes scf.parallel to workitems using local invocation ID.
static LogicalResult mapToLocalInvocationId(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp) {
return distributeSingleIterationPerProcessor<gpu::ThreadIdOp,
gpu::BlockDimOp>(rewriter,
pLoopOp);
}
/// Distributes scf.parallel to workitems using the global invocation ID. The
/// GPU dialect doesn't have a direct operation for this, so it is computed as
/// id = blockIdx * blockDim + threadIdx and count = blockDim * gridDim.
static LogicalResult mapToGlobalInvocationId(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) {
return distributeSingleIterationPerProcessor<GPUGlobalId, GPUGlobalCount>(
rewriter, pLoopOp);
}
//===----------------------------------------------------------------------===//
// Pass and patterns.
//===----------------------------------------------------------------------===//
/// In some cases the iterations of the loops, when partitioned to workgroups,
/// need to be distributed in a cyclic manner. The main use case here is when
/// the number of workgroups is constrained such that the number of iterations
/// is greater than or equal to the number of processors (along any dimension).
/// In those cases, distribute the iterations in a cyclic manner. This adds
/// additional control flow, but isn't too detrimental to performance since the
/// branches are convergent for the most part.
// TODO(#2134): Mapping iterations to processors directly by assuming number of
// iterations <= number of processors again seems to have an issue with
// convolution/pooling. Needs further investigation.
static bool useCyclicLoopDistribution(scf::ParallelOp pLoopOp) {
if (!useLegacyConvLowering) return false;
auto walkResult = pLoopOp.walk([](Operation *op) -> WalkResult {
if (isa<linalg::ConvOp>(op) || isa<linalg::PoolingMaxOp>(op) ||
isa<linalg::PoolingMinOp>(op) || isa<linalg::PoolingSumOp>(op))
return WalkResult::interrupt();
return WalkResult::advance();
});
return walkResult.wasInterrupted();
}
namespace {
/// Pass to convert tiled and fused linalg ops into gpu.func.
struct ConvertToGPUPass : public PassWrapper<ConvertToGPUPass, FunctionPass> {
ConvertToGPUPass() = default;
ConvertToGPUPass(const ConvertToGPUPass &pass) {}
void runOnFunction() override;
};
/// Pattern to map scf.parallel to workgroups.
struct PartitionPLoopToWorkgroups
: public OpConversionPattern<scf::ParallelOp> {
using OpConversionPattern<scf::ParallelOp>::OpConversionPattern;
LogicalResult matchAndRewrite(
scf::ParallelOp pLoopOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
return mapToWorkgroups(
rewriter, pLoopOp,
isWorkgroupCountConstrained || useCyclicLoopDistribution(pLoopOp));
}
};
/// Map tiled linalg op to workitems by lowering it to scf.parallel and
/// partitioning it to workitems.
template <typename LinalgOpTy>
struct MapLinalgOpToLocalInvocationId : public OpConversionPattern<LinalgOpTy> {
using OpConversionPattern<LinalgOpTy>::OpConversionPattern;
LogicalResult matchAndRewrite(
LinalgOpTy linalgOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
// Check for marker that specifies that the linalg op is to be partitioned
// across threads within a workgroup.
if (!hasWorkGroupMarker(linalgOp)) return failure();
Optional<linalg::LinalgLoops> loops =
linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
if (!loops) return failure();
if (!loops.getValue().empty()) {
scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]);
if (!pLoopOp || failed(mapToLocalInvocationId(rewriter, pLoopOp)))
return failure();
}
rewriter.eraseOp(linalgOp);
return success();
}
};
/// Legacy path for lowering tiled conv/pooling op to loops.
// TODO(#2134): Remove this pattern. The default path of using
// `MapLinalgOpToLocalInvocationId` seems to have a bug. It only shows up
// currently on Resnet50. Remove this pattern after the bug is triaged/fixed.
template <typename LinalgOpTy>
struct MapConvPoolToLocalInvocationId : public OpConversionPattern<LinalgOpTy> {
using OpConversionPattern<LinalgOpTy>::OpConversionPattern;
LogicalResult matchAndRewrite(
LinalgOpTy linalgOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
if (!hasWorkGroupMarker(linalgOp)) return failure();
Optional<linalg::LinalgLoops> loops =
linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
if (!loops) return failure();
scf::ParallelOp pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]);
if (failed(
distributeCyclicallyToProcessors<gpu::ThreadIdOp, gpu::BlockDimOp>(
rewriter, pLoopOp)))
return failure();
rewriter.eraseOp(linalgOp);
return success();
}
};
/// Map linalg operation to execute on GPU in parallel by mapping the parallel
/// loops to "GlobalInvocationId".
template <typename LinalgOpTy>
struct MapLinalgOpToGlobalInvocationId
: public OpConversionPattern<LinalgOpTy> {
using OpConversionPattern<LinalgOpTy>::OpConversionPattern;
LogicalResult matchAndRewrite(
LinalgOpTy linalgOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
// If a marker exists, do nothing.
if (hasMarker(linalgOp)) return failure();
FuncOp funcOp = linalgOp.template getParentOfType<FuncOp>();
if (!funcOp) return failure();
Optional<linalg::LinalgLoops> loops =
linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
if (!loops) return failure();
SmallVector<int64_t, 3> workgroupSize(3, 1);
if (!loops.getValue().empty()) {
scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]);
// If there are parallel loops, partition them to threads using the global
// invocation ID.
if (pLoopOp) {
pLoopOp = collapseParallelLoops(rewriter, pLoopOp);
if (!pLoopOp) return failure();
if (failed(mapToGlobalInvocationId(rewriter, pLoopOp)))
return rewriter.notifyMatchFailure(
linalgOp, "mapping to GlobalInvocationID failed");
workgroupSize = {32, 1, 1};
}
}
rewriter.eraseOp(linalgOp);
if (failed(updateWorkGroupSize(funcOp, workgroupSize))) return failure();
funcOp.setAttr(getWorkgroupCountAttrName(),
rewriter.getI32IntegerAttr(static_cast<int32_t>(
WorkgroupCountMethodology::LinearizeResultShape)));
return success();
}
};
/// Remove the linalg.range operation created when lowering to loops.
struct RemoveLinalgRange : public OpConversionPattern<linalg::RangeOp> {
using OpConversionPattern<linalg::RangeOp>::OpConversionPattern;
LogicalResult matchAndRewrite(
linalg::RangeOp rangeOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
if (!rangeOp.getResult().use_empty()) return failure();
rewriter.eraseOp(rangeOp);
return success();
}
};
} // namespace
void populateParallelLoopToWorkgroupPatterns(
MLIRContext *context, OwningRewritePatternList &patterns) {
patterns.insert<PartitionPLoopToWorkgroups>(context);
}
void ConvertToGPUPass::runOnFunction() {
FuncOp funcOp = getFunction();
Region &body = funcOp.getBody();
if (!llvm::hasSingleElement(body)) {
funcOp.emitError("unhandled dispatch function with multiple blocks");
return signalPassFailure();
}
MLIRContext *context = &getContext();
ConversionTarget target(*context);
// After this pass Linalg and scf.parallel ops should be gone.
target.addIllegalOp<scf::ParallelOp>();
target.addIllegalDialect<linalg::LinalgDialect>();
// Reshape ops are treated as legal since they just change the way the
// underlying buffer is viewed. They are legalized downstream and become no-ops
// when lowering to SPIR-V, since the SPIR-V code uses linearized arrays.
target.addLegalOp<linalg::ReshapeOp>();
// Let the rest fall through.
target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
OwningRewritePatternList patterns;
// clang-format off
patterns.insert<
#define ADD_ALL_LINALG_PATTERNS(OP_NAME) \
MapLinalgOpToGlobalInvocationId<OP_NAME>, \
MapLinalgOpToLocalInvocationId<OP_NAME>
ADD_ALL_LINALG_PATTERNS(linalg::CopyOp),
ADD_ALL_LINALG_PATTERNS(linalg::FillOp),
ADD_ALL_LINALG_PATTERNS(linalg::GenericOp),
ADD_ALL_LINALG_PATTERNS(linalg::IndexedGenericOp),
#undef ADD_ALL_LINALG_PATTERNS
#define ADD_ALL_CONV_POOL_PATTERNS(OP_NAME) \
MapConvPoolToLocalInvocationId<OP_NAME>, \
MapLinalgOpToGlobalInvocationId<OP_NAME>
ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingMaxOp),
ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingMinOp),
ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingSumOp),
#undef ADD_ALL_CONV_POOL_PATTERNS
MapLinalgOpToLocalInvocationId<linalg::MatmulOp>,
PartitionPLoopToWorkgroups, RemoveLinalgRange>(context);
// clang-format on
patterns.insert<MapLinalgOpToGlobalInvocationId<linalg::ConvOp>>(context);
if (useLegacyConvLowering)
patterns.insert<MapConvPoolToLocalInvocationId<linalg::ConvOp>>(context);
else
patterns.insert<MapLinalgOpToLocalInvocationId<linalg::ConvOp>>(context);
if (failed(applyFullConversion(funcOp, target, patterns)))
return signalPassFailure();
}
std::unique_ptr<OperationPass<FuncOp>> createConvertToGPUPass() {
return std::make_unique<ConvertToGPUPass>();
}
static PassRegistration<ConvertToGPUPass> pass(
"iree-codegen-convert-to-gpu", "Map tiled linalg and loop ops to GPU",
[] { return std::make_unique<ConvertToGPUPass>(); });
} // namespace iree_compiler
} // namespace mlir