// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- Utils.cpp - Utility functions used in Linalg to SPIR-V lowering ----===//
//
// Implementation of utility functions used while lowering from Linalg to SPIR-V.
//
//===----------------------------------------------------------------------===//
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/SPIRV/MemorySpace.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Identifier.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Region.h"
#include "mlir/Support/LogicalResult.h"
namespace mlir {
namespace iree_compiler {
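/// Returns the spv.target_env attribute from the configuration of the
/// hal.executable.variant enclosing `op`, or nullptr if it is absent at any
/// level.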
spirv::TargetEnvAttr getSPIRVTargetEnvAttr(Operation *op) {
auto variant = op->getParentOfType<IREE::HAL::ExecutableVariantOp>();
if (!variant) return nullptr;
IREE::HAL::ExecutableTargetAttr targetAttr = variant.target();
if (!targetAttr) return nullptr;
auto config = targetAttr.getConfiguration();
if (!config) return nullptr;
return config.getAs<spirv::TargetEnvAttr>(spirv::getTargetEnvAttrName());
}
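/// Sets the spv.entry_point_abi attribute on `funcOp` to carry the given 3-D
/// workgroup size. Fails on multi-block functions and on workgroup sizes that
/// do not have exactly three entries.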
LogicalResult updateWorkGroupSize(FuncOp funcOp,
ArrayRef<int64_t> workGroupSize) {
// Update the spv.entry_point_abi attribute on the surrounding FuncOp to
// reflect the new workgroup size.
Region &body = funcOp.getBody();
if (!llvm::hasSingleElement(body))
return funcOp.emitError("unhandled dispatch function with multiple blocks");
if (workGroupSize.size() != 3)
return funcOp.emitError("expected workgroup size to have three entries");
SmallVector<int32_t, 3> workGroupSizeVec = llvm::to_vector<3>(llvm::map_range(
workGroupSize, [](int64_t v) { return static_cast<int32_t>(v); }));
funcOp->setAttr(
spirv::getEntryPointABIAttrName(),
spirv::getEntryPointABIAttr(workGroupSizeVec, funcOp.getContext()));
return success();
}
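/// Inserts a linalg.copy from `src` to `dst` and tags it with the
/// copy-to-workgroup-memory marker so that later passes can recognize and
/// distribute it. Usable, e.g., as the copy-in/out callback of
/// linalg::LinalgPromotionOptions.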
LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst) {
auto copyOp = b.create<linalg::CopyOp>(src.getLoc(), src, dst);
setMarker(copyOp, getCopyToWorkgroupMemoryMarker());
return success();
}
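/// Allocation callback for subview promotion (e.g. via
/// linalg::LinalgPromotionOptions): creates a static memref.alloc in the
/// workgroup memory space with the shape given by the constant bounding
/// subview sizes. Returns llvm::None if any size is not a constant.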
Optional<Value> allocateWorkgroupMemory(OpBuilder &b, memref::SubViewOp subview,
ArrayRef<Value> boundingSubViewSize,
DataLayout &layout) {
// Allocate the memory in the entry block of the parent FuncOp. This better
// aligns with the semantics of workgroup memory, which is available from the
// entry of the function.
OpBuilder::InsertionGuard guard(b);
FuncOp funcOp = subview->getParentOfType<FuncOp>();
if (!funcOp) {
subview.emitError("expected op to be within std.func");
return llvm::None;
}
b.setInsertionPointToStart(&(*funcOp.getBody().begin()));
// The bounding subview sizes are expected to be constant; they specify the
// shape of the allocation.
SmallVector<int64_t, 2> shape = llvm::to_vector<2>(
llvm::map_range(boundingSubViewSize, [](Value v) -> int64_t {
APInt value;
if (matchPattern(v, m_ConstantInt(&value))) return value.getSExtValue();
return -1;
}));
if (llvm::any_of(shape, [](int64_t v) { return v == -1; })) return {};
MemRefType allocType = MemRefType::get(
shape, subview.getType().getElementType(), {}, getWorkgroupMemorySpace());
Value buffer = b.create<memref::AllocOp>(subview.getLoc(), allocType);
return buffer;
}
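/// Deallocation callback paired with allocateWorkgroupMemory. Intentionally
/// emits no dealloc; it only verifies that the buffer lives in workgroup
/// memory (see the comment below).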
LogicalResult deallocateWorkgroupMemory(OpBuilder &b, Value buffer) {
// There is no utility in an explicit deallocation (as of now). Instead the
// workgroup memory is effectively stack memory that is automatically dead at
// the end of the function. The SPIR-V lowering treats such deallocs as
// no-ops, so don't insert one in the first place; just check that the
// deallocation is for workgroup memory.
MemRefType bufferType = buffer.getType().dyn_cast<MemRefType>();
if (!bufferType) return failure();
return success(bufferType.getMemorySpaceAsInt() == getWorkgroupMemorySpace());
}
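/// Returns the processor id and count along GPU dimension `dim` ("x", "y" or
/// "z") using the given pair of id/count ops, e.g.
/// gpu.thread_id/gpu.block_dim.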
template <typename GPUIdOp, typename GPUCountOp>
static linalg::ProcInfo getGPUProcessorIdAndCountImpl(OpBuilder &builder,
Location loc,
unsigned dim) {
assert(dim < kNumGPUDims && "processor index out of range!");
std::array<const char *, kNumGPUDims> dimAttr{"x", "y", "z"};
StringAttr attr = builder.getStringAttr(dimAttr[dim]);
Type indexType = builder.getIndexType();
return {builder.create<GPUIdOp>(loc, indexType, attr),
builder.create<GPUCountOp>(loc, indexType, attr)};
}
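/// Specialization for a "global" invocation id: there is no single op that
/// produces it, so it is computed as blockId * blockDim + threadId, with the
/// corresponding count being blockDim * gridDim.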
template <>
linalg::ProcInfo getGPUProcessorIdAndCountImpl<GPUGlobalId, GPUGlobalCount>(
OpBuilder &builder, Location loc, unsigned dim) {
assert(dim < kNumGPUDims && "processor index out of range!");
std::array<const char *, kNumGPUDims> dimAttr{"x", "y", "z"};
StringAttr attr = builder.getStringAttr(dimAttr[dim]);
Type indexType = builder.getIndexType();
Value gridDim = builder.create<gpu::GridDimOp>(loc, indexType, attr);
Value blockId = builder.create<gpu::BlockIdOp>(loc, indexType, attr);
Value blockDim = builder.create<gpu::BlockDimOp>(loc, indexType, attr);
Value threadId = builder.create<gpu::ThreadIdOp>(loc, indexType, attr);
// TODO(ravishankarm): Using affine_maps here would be beneficial, and is
// possible because blockDim is constant. But it would create an ordering
// issue, since it assumes that the workgroup size has already been set. If
// using affine_map helps, make sure the workgroup size is set beforehand.
return {
builder.create<arith::AddIOp>(
loc, builder.create<arith::MulIOp>(loc, blockId, blockDim), threadId),
builder.create<arith::MulIOp>(loc, blockDim, gridDim)};
}
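/// Returns the processor id/count pairs for the first `numDims` GPU
/// dimensions. Note the reversed order: the result is ordered outermost loop
/// first, so procInfo.back() corresponds to GPU dimension "x".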
template <typename GPUIdOp, typename GPUCountOp>
static SmallVector<linalg::ProcInfo, 2> getGPUProcessorIdsAndCountsImpl(
OpBuilder &builder, Location loc, unsigned numDims) {
SmallVector<linalg::ProcInfo, 2> procInfo(numDims);
for (unsigned i = 0; i < numDims; ++i) {
procInfo[numDims - 1 - i] =
getGPUProcessorIdAndCountImpl<GPUIdOp, GPUCountOp>(builder, loc, i);
}
return procInfo;
}
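/// Public entry point forwarding to the implementation above; explicitly
/// instantiated below for the supported id/count op pairs.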
template <typename GPUIdOp, typename GPUCountOp>
SmallVector<linalg::ProcInfo, 2> getGPUProcessorIdsAndCounts(OpBuilder &builder,
Location loc,
unsigned numDims) {
return getGPUProcessorIdsAndCountsImpl<GPUIdOp, GPUCountOp>(builder, loc,
numDims);
}
/// Explicit instantiations of getGPUProcessorIdsAndCounts.
template SmallVector<linalg::ProcInfo, 2>
getGPUProcessorIdsAndCounts<gpu::BlockIdOp, gpu::GridDimOp>(OpBuilder &builder,
Location loc,
unsigned numDims);
template SmallVector<linalg::ProcInfo, 2>
getGPUProcessorIdsAndCounts<gpu::ThreadIdOp, gpu::BlockDimOp>(
OpBuilder &builder, Location loc, unsigned numDims);
template SmallVector<linalg::ProcInfo, 2>
getGPUProcessorIdsAndCounts<GPUGlobalId, GPUGlobalCount>(OpBuilder &builder,
Location loc,
unsigned numDims);
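/// Collapses all loops of a reduction-free scf.parallel into a single loop
/// running from 0 to the product of the per-loop trip counts with step 1.
/// Returns nullptr for loops with reductions and the loop unchanged if it is
/// already 1-D. As an illustrative sketch (bodies and types elided),
///   scf.parallel (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) step (%s0, %s1)
/// becomes
///   scf.parallel (%iv) = (%c0) to (%total) step (%c1)
/// where %total is the product of the per-loop trip counts and %i/%j are
/// reconstructed from %iv with div/rem arithmetic.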
scf::ParallelOp collapseParallelLoops(PatternRewriter &rewriter,
scf::ParallelOp pLoopOp) {
if (pLoopOp.getNumReductions()) return nullptr;
unsigned numLoops = pLoopOp.getNumLoops();
if (numLoops == 1) return pLoopOp;
// Compute the number of iterations of each loop starting from the innermost.
Location loc = pLoopOp.getLoc();
Value totalNumIterations = rewriter.create<arith::ConstantIndexOp>(loc, 1);
// Track the "stride" of each loop, i.e. the product of the trip counts of
// all inner loops.
SmallVector<Value, 2> iterationStride;
iterationStride.resize(pLoopOp.getNumLoops());
auto lbs = pLoopOp.lowerBound();
auto ubs = pLoopOp.upperBound();
auto steps = pLoopOp.step();
for (int i = numLoops - 1; i >= 0; --i) {
Value lb = lbs[i], ub = ubs[i], step = steps[i];
Value iterCount = rewriter.create<arith::DivSIOp>(
loc, rewriter.create<arith::SubIOp>(loc, ub, lb), step);
iterationStride[i] = totalNumIterations;
totalNumIterations =
rewriter.create<arith::MulIOp>(loc, totalNumIterations, iterCount);
}
// Create the collapsed parallel loop op with lower bound 0, step 1, and the
// upper bound being totalNumIterations.
Value newLb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
Value newStep = rewriter.create<arith::ConstantIndexOp>(loc, 1);
scf::ParallelOp newPLoopOp =
rewriter.create<scf::ParallelOp>(loc, newLb, totalNumIterations, newStep);
// Build the body of the collapsed loop by cloning the original loop body.
// The induction variables of the original loop body are recovered from the
// induction variable of the new loop using
//   origLoopIv[i] = loopIv / iterationStride[i]
//   loopIv = loopIv % iterationStride[i]
// and then rescaled as lb[i] + origLoopIv[i] * step[i].
OpBuilder::InsertionGuard guard(rewriter);
Block &pLoopBody = pLoopOp.getLoopBody().front();
rewriter.setInsertionPointToStart(&newPLoopOp.getLoopBody().front());
Value loopIv = *newPLoopOp.getInductionVars().begin();
BlockAndValueMapping map;
for (int i : llvm::seq<int>(0, numLoops)) {
Value iterNum =
rewriter.create<arith::DivSIOp>(loc, loopIv, iterationStride[i]);
AffineExpr d0, d1;
bindDims(rewriter.getContext(), d0, d1);
AffineExpr s0 = getAffineSymbolExpr(0, rewriter.getContext());
Value newIv = makeComposedAffineApply(rewriter, loc, d0 + d1 * s0,
{lbs[i], iterNum, steps[i]});
map.map(pLoopBody.getArgument(i), newIv);
loopIv = rewriter.create<arith::RemSIOp>(loc, loopIv, iterationStride[i]);
}
for (Operation &op : pLoopBody.without_terminator()) {
rewriter.clone(op, map);
}
rewriter.eraseOp(pLoopOp);
return newPLoopOp;
}
/// Builds an empty scf.for operation. The default builder adds an entry basic
/// block which needs to be avoided here.
static scf::ForOp buildEmptyForOp(Location loc, OpBuilder &builder, Value lb,
Value ub, Value step) {
OperationState state(loc, scf::ForOp::getOperationName());
state.addOperands({lb, ub, step});
state.addRegion();
return cast<scf::ForOp>(builder.createOperation(state));
}
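/// Replaces `pLoopOp` with a loop nest consisting of an optional outer
/// scf.parallel with bounds `newPLoopBounds` followed by nested scf.for ops
/// with bounds `forBounds`. `permutation` gives, for each induction variable
/// of the new nest (in nesting order), the index of the corresponding
/// induction variable of the original scf.parallel. The original loop body is
/// moved into the innermost scf.for.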
Operation *replacePLoopOp(ConversionPatternRewriter &rewriter,
scf::ParallelOp pLoopOp,
ArrayRef<LoopBounds> newPLoopBounds,
ArrayRef<LoopBounds> forBounds,
ArrayRef<unsigned> permutation) {
assert(!forBounds.empty() && "unhandled case of no scf.for created");
unsigned numLoops = pLoopOp.getNumLoops();
Location loc = pLoopOp.getLoc();
assert(forBounds.size() + newPLoopBounds.size() == numLoops &&
"cannot drop loops when splitting scf.parallel operation");
assert(permutation.size() == numLoops);
OpBuilder::InsertionGuard guard(rewriter);
// Need a signature conversion for the body of the scf.parallel operation
// before it can be used as the body of the innermost loop created here.
TypeConverter::SignatureConversion signatureConverter(numLoops);
Operation *outermostLoop = nullptr;
auto permuteIt = permutation.begin();
// Create the scf.parallel operation as the outermost loop, if specified.
if (!newPLoopBounds.empty()) {
auto lbs = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) -> Value { return bounds.lb; }));
auto ubs = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) { return bounds.ub; }));
auto steps = llvm::to_vector<2>(llvm::map_range(
newPLoopBounds, [](LoopBounds bounds) { return bounds.step; }));
auto newPLoop = rewriter.create<scf::ParallelOp>(loc, lbs, ubs, steps);
for (auto iv : newPLoop.getInductionVars()) {
signatureConverter.remapInput(*permuteIt, iv);
permuteIt++;
}
rewriter.setInsertionPointToStart(newPLoop.getBody());
outermostLoop = newPLoop.getOperation();
}
// Generate the nested scf.for operations with the bounds passed.
for (auto it : llvm::enumerate(forBounds)) {
Value lb = it.value().lb, ub = it.value().ub, step = it.value().step;
if (it.index() != forBounds.size() - 1) {
auto forOp = rewriter.create<scf::ForOp>(loc, lb, ub, step);
if (!outermostLoop) outermostLoop = forOp.getOperation();
signatureConverter.remapInput(*permuteIt, forOp.getInductionVar());
rewriter.setInsertionPointToStart(forOp.getBody());
} else {
// For the last loop, move the body of the scf.parallel op as the body of
// the loop after signature conversion.
auto forOp = buildEmptyForOp(loc, rewriter, lb, ub, step);
if (!outermostLoop) outermostLoop = forOp.getOperation();
signatureConverter.addInputs(*permuteIt, rewriter.getIndexType());
Region &pLoopOpRegion = pLoopOp.getLoopBody();
rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter);
Region &forOpRegion = forOp.getLoopBody();
rewriter.inlineRegionBefore(pLoopOpRegion, forOpRegion,
forOpRegion.begin());
}
permuteIt++;
}
rewriter.eraseOp(pLoopOp);
return outermostLoop;
}
/// Builds an empty scf.if operation without the then and else blocks.
static scf::IfOp buildEmptyIfOp(Location loc, OpBuilder &builder, Value cond) {
OperationState state(loc, scf::IfOp::getOperationName());
state.addOperands(cond);
state.addRegion();
state.addRegion();
return cast<scf::IfOp>(builder.createOperation(state));
}
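/// Distributes `pLoopOp` so that each processor executes exactly one
/// iteration: induction variable i is replaced by lb[i] + procId[i] * step[i].
/// When `generateGuard` is true the body is additionally wrapped in an scf.if
/// that checks the replacement value stays below the upper bound.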
LogicalResult distributeSingleIterationPerProcessor(
ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
ArrayRef<linalg::ProcInfo> procInfo, bool generateGuard) {
unsigned numLoops = pLoopOp.getNumLoops();
Location loc = pLoopOp.getLoc();
assert(numLoops == procInfo.size() &&
"expected as many ids as number of loops");
auto lbs = pLoopOp.lowerBound();
auto step = pLoopOp.step();
SmallVector<Value, 2> ivReplacements;
for (unsigned i : llvm::seq<unsigned>(0, numLoops)) {
Value iterValue = rewriter.create<arith::AddIOp>(
loc, lbs[i],
rewriter.create<arith::MulIOp>(loc, procInfo[i].procId, step[i]));
ivReplacements.push_back(iterValue);
}
Region &pLoopOpRegion = pLoopOp.getLoopBody();
if (generateGuard) {
TypeConverter::SignatureConversion signatureConverter(numLoops);
Value cond = nullptr;
auto ubs = pLoopOp.upperBound();
for (unsigned i : llvm::seq<unsigned>(0, numLoops)) {
Value cmp = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
ivReplacements[i], ubs[i]);
cond = (cond ? rewriter.create<arith::AndIOp>(loc, cond, cmp) : cmp);
signatureConverter.remapInput(i, ivReplacements[i]);
}
rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter);
scf::IfOp ifOp = buildEmptyIfOp(loc, rewriter, cond);
Region &ifOpRegion = ifOp.getRegion(0);
rewriter.inlineRegionBefore(pLoopOpRegion, ifOpRegion, ifOpRegion.begin());
} else {
// The body of the scf.parallel needs to be moved into its parent
// operation.
// - Split the block just before the scf.parallel operation.
// - Move the only block of scf.parallel before the newly created block
// (after signature conversion).
// - Add branch from the original block to the moved block of the
// scf.parallel's region, and from the latter to the block created by the
// split operation.
// - Canonicalization will fold these branches away.
Block *destBlock = pLoopOp.getOperation()->getBlock();
Block *remainingInst =
rewriter.splitBlock(destBlock, Block::iterator(pLoopOp));
Block *sourceBlock = &pLoopOpRegion.front();
rewriter.eraseOp(sourceBlock->getTerminator());
rewriter.mergeBlocks(&pLoopOpRegion.front(), destBlock, ivReplacements);
rewriter.mergeBlocks(remainingInst, destBlock, {});
}
rewriter.eraseOp(pLoopOp);
return success();
}
} // namespace iree_compiler
} // namespace mlir