// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
#include <numeric>
#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
using namespace mlir;
using namespace mlir::iree_compiler;
static constexpr unsigned cudaWarpSize = 32;
namespace {
struct TileWorkgroupSizePair {
// How many scalar elements each workgroup should handle along each dimension.
std::array<int64_t, 3> tileSize;
std::array<int64_t, 3> workgroupSize;
};
} // namespace
/// Return the best combinations of tile size and workgroup size. They will
/// then be used to pick the best size aligned with the shape dimensions.
static void getMatmulConfig(SmallVectorImpl<TileWorkgroupSizePair> &tileSizes) {
// Pick tile sizes so that M*K and K*N are divisible by wgSize * vecSize
// (vecSize == 4). This way workgroup memory copies don't need to be masked.
// Once we support masked loads we can get performance out of more
// configurations.
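// For example, the first entry below uses a 32x8x1 workgroup (256 threads):
// its LHS tile (M*K = 32*32 = 1024) and RHS tile (K*N = 32*128 = 4096) are
// both divisible by 256 * 4 = 1024, so the cooperative copies into workgroup
// memory stay unmasked.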
tileSizes.push_back(TileWorkgroupSizePair({{32, 128, 32}, {32, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{128, 64, 8}, {16, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 256, 32}, {64, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{8, 32, 32}, {8, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{8, 128, 4}, {32, 1, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 64, 4}, {16, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{1, 128, 8}, {32, 1, 1}}));
}
/// Return the best combination of tile size and wg size when using tensorcore
/// operations.
static void getTensorCoreConfig(
SmallVectorImpl<TileWorkgroupSizePair> &tileSizes) {
// Tile sizes are skewed towards small matmul for now. Long term the plan is
// to not rely on hardcoded configurations.
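// Note: {64, 2, 1} is 128 threads, i.e. four warps cooperating on a 32x32
// output tile with a K step of 16, which is a multiple of the 16x16x16
// fragment shape commonly used by WMMA.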
tileSizes.push_back(TileWorkgroupSizePair({{32, 32, 16}, {64, 2, 1}}));
}
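/// Returns the GPU target architecture (e.g. "sm_80") from the
/// hal.executable.variant configuration, or an empty string if it is not set.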
static std::string getTargetArch(FuncOp entryPoint) {
if (auto variantOp =
entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
if (auto config = targetAttr.getConfiguration()) {
if (auto attr = config.getAs<StringAttr>("target_arch")) {
return attr.getValue().str();
}
}
}
return "";
}
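/// Returns true if `op` and the operations fused with it in the dispatch can
/// be lowered through the tensor core pipeline on the current target.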
static bool supportsTensorCore(FuncOp entryPoint, linalg::LinalgOp op) {
// Limit tensor core pipeline to matmul as not all combinations of transpose
// are supported upstream.
// TODO(thomasraoux): Enable batchMatmul and generic contraction.
if (getTargetArch(entryPoint) != "sm_80" ||
!(isa<linalg::MatmulOp>(op) || isa<linalg::BatchMatmulOp>(op))) {
return false;
}
// Check that we support converting any fused operation. When using the
// tensorcore pipeline we need to be sure we can generate MMA ops, otherwise
// the code will be highly inefficient.
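// For example, a fused bias-add (arith.addf) or clamp (arith.minf/maxf) is
// fine, while any other scalar operation inside a linalg.generic in the
// dispatch disables the tensor core path.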
bool fusedOpSupported = true;
entryPoint.walk([&fusedOpSupported](linalg::GenericOp linalgOp) {
for (Operation &fusedOp : linalgOp.getOps()) {
if (!isa<arith::AddFOp, arith::MulFOp, arith::MaxFOp, arith::MinFOp,
linalg::YieldOp, arith::DivFOp>(fusedOp)) {
fusedOpSupported = false;
break;
}
}
});
return fusedOpSupported;
}
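/// Sets the configuration (tile sizes, workgroup size and pass pipeline) for
/// a contraction-like op, trying tensor core and SIMT heuristics before
/// falling back to a default configuration.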
static LogicalResult setContractConfig(FuncOp entryPoint, linalg::LinalgOp op) {
auto setMatmulConfig =
[&entryPoint, &op](int64_t tileX, int64_t tileY, int64_t tileK,
llvm::ArrayRef<int64_t> workgroupSize,
IREE::Codegen::DispatchLoweringPassPipeline pipeline) {
TileSizesListType tileSizes;
unsigned numParallelLoops = op.getNumParallelLoops();
SmallVector<int64_t> workgroupTileSizes(numParallelLoops - 2, 1);
workgroupTileSizes.append({tileX, tileY});
workgroupTileSizes.append(op.getNumReductionLoops(), tileK);
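// For example, for a batch matmul (3 parallel + 1 reduction loop) the tile
// sizes at this point are {1, tileX, tileY, tileK}; non-partitioned
// dimensions are zeroed out below.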
SmallVector<unsigned> partitionedLoops =
cast<IREE::Flow::PartitionableLoopsInterface>(op.getOperation())
.getPartitionableLoops(kNumMaxParallelDims);
llvm::SmallDenseSet<unsigned, 4> partitionedLoopsSet;
partitionedLoopsSet.insert(partitionedLoops.begin(),
partitionedLoops.end());
for (auto loopID : llvm::seq<unsigned>(0, numParallelLoops)) {
if (!partitionedLoopsSet.count(loopID)) {
workgroupTileSizes[loopID] = 0;
}
}
tileSizes.emplace_back(
std::move(workgroupTileSizes)); // Workgroup level.
return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
pipeline, workgroupSize);
};
// Infer the MxNxK size of the matmul based on operands and indexing maps.
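// For a plain linalg.matmul the maps are (m, k), (k, n) -> (m, n): M is the
// LHS dimension whose loop matches the second-to-last output dimension, N is
// the RHS dimension matching the last output dimension, and K is the LHS
// dimension mapped to the single reduction loop.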
auto lhsShape =
op.getInputOperand(0)->get().getType().cast<ShapedType>().getShape();
auto rhsShape =
op.getInputOperand(1)->get().getType().cast<ShapedType>().getShape();
int64_t sizeM = ShapedType::kDynamicSize;
int64_t sizeN = ShapedType::kDynamicSize;
int64_t sizeK = ShapedType::kDynamicSize;
auto outputMap = op.getTiedIndexingMap(op.getOutputOperand(0));
for (unsigned i = 0; i < lhsShape.size(); i++) {
if (op.getTiedIndexingMap(op.getInputOperand(0)).getDimPosition(i) ==
outputMap.getDimPosition(outputMap.getNumResults() - 2)) {
sizeM = lhsShape[i];
break;
}
}
for (unsigned i = 0; i < rhsShape.size(); i++) {
if (op.getTiedIndexingMap(op.getInputOperand(1)).getDimPosition(i) ==
outputMap.getDimPosition(outputMap.getNumResults() - 1)) {
sizeN = rhsShape[i];
break;
}
}
SmallVector<unsigned> exprs;
op.getReductionDims(exprs);
if (exprs.size() == 1) {
for (unsigned i = 0; i < lhsShape.size(); i++) {
if (op.getTiedIndexingMap(op.getInputOperand(0)).getDimPosition(i) ==
exprs[0]) {
sizeK = lhsShape[i];
break;
}
}
}
bool isStaticSize = sizeM != ShapedType::kDynamicSize &&
sizeN != ShapedType::kDynamicSize &&
sizeK != ShapedType::kDynamicSize;
if (isStaticSize) {
// Try tensorcore config first.
if (supportsTensorCore(entryPoint, op)) {
SmallVector<TileWorkgroupSizePair> TCtileSizeConfig;
getTensorCoreConfig(TCtileSizeConfig);
// Pick the best configuration where the original shape is aligned on the
// tile size.
for (TileWorkgroupSizePair &config : TCtileSizeConfig) {
if (sizeK % config.tileSize[2] == 0 &&
sizeN % config.tileSize[1] == 0 &&
sizeM % config.tileSize[0] == 0) {
return setMatmulConfig(config.tileSize[0], config.tileSize[1],
config.tileSize[2], config.workgroupSize,
IREE::Codegen::DispatchLoweringPassPipeline::
LLVMGPUMatmulTensorCore);
}
}
}
// Special case for very small matrices.
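// With sizeM * sizeN <= 32 the whole output fits in one warp; e.g. a 4x8
// matmul gets a {4, 8, 1} workgroup (32 threads), roughly one thread per
// output element.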
if (sizeM * sizeN <= cudaWarpSize) {
return setMatmulConfig(
sizeN, sizeM, 4, {sizeM, sizeN, 1},
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt);
}
// SIMT matmul case.
SmallVector<TileWorkgroupSizePair> tileSizeConfig;
// Query the best configuration.
getMatmulConfig(tileSizeConfig);
// Pick the best configuration where the original shape is aligned on the
// tile size.
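// For example, a 1024x512 (MxN) problem matches the first {32, 128, 32}
// entry (1024 % 32 == 0 and 512 % 128 == 0), so that entry is selected.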
for (TileWorkgroupSizePair &config : tileSizeConfig) {
if (sizeN % config.tileSize[1] == 0 && sizeM % config.tileSize[0] == 0) {
return setMatmulConfig(
config.tileSize[0], config.tileSize[1], config.tileSize[2],
config.workgroupSize,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt);
}
}
}
// If we haven't found any config, fall back to the default config.
int64_t tileX = 2;
int64_t tileY = 256;
int64_t tileK = 4;
SmallVector<int64_t, 3> workgroupSize = {2 * cudaWarpSize, 1, 1};
return setMatmulConfig(
tileX, tileY, tileK, workgroupSize,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt);
}
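/// Sets the configuration for an IREE::LinalgExt::FftOp: distribute the
/// partitioned loops and tile the innermost dimension by 2^stage when the
/// stage is a known constant.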
static LogicalResult setFftConfig(FuncOp entryPoint,
IREE::LinalgExt::FftOp op) {
auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
auto partitionedLoops =
interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
unsigned loopDepth = partitionedLoops.back() + 1;
SmallVector<int64_t> workgroupTileSize(loopDepth, 0);
SmallVector<int64_t, 3> workgroupSize = {cudaWarpSize, 1, 1};
// Tile along partitioned loops with size 1.
for (int64_t loopIndex : partitionedLoops) {
workgroupTileSize[loopIndex] = 1;
}
auto rank = op.getOperandRank();
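// The innermost dimension is tiled by 2^stage (e.g. a constant stage of 3
// gives a tile size of 8), which presumably matches the butterfly span of
// that FFT stage.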
if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) {
APInt value;
if (matchPattern(op.getStage(), m_ConstantInt(&value))) {
workgroupTileSize[rank - 1] = 1ll << value.getSExtValue();
} else {
op.emitError("non-constant stage might not work for fft op");
return failure();
}
}
TileSizesListType tileSizes = {workgroupTileSize};
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute,
workgroupSize);
}
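/// Sets the configuration for an IREE::LinalgExt::SortOp: distribute the
/// partitioned loops across workgroups, with one element per thread along the
/// innermost partitioned loop.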
static LogicalResult setSortConfig(FuncOp entryPoint, Operation *op) {
TileSizesListType tileSizes;
auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
auto partitionedLoops =
interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
if (partitionedLoops.empty()) {
tileSizes.push_back({});
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute,
{1, 1, 1});
}
size_t numLoops = partitionedLoops.back() + 1;
// To get peak occupancy we need a workgroup size of at least two warps
std::array<int64_t, 3> workgroupSize = {2 * cudaWarpSize, 1, 1};
SmallVector<int64_t, 4> workgroupTileSizes(numLoops, 1);
// Set all non-partitioned loops to a tile size of zero.
llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(),
partitionedLoops.end());
for (auto depth : llvm::seq<int64_t>(0, numLoops)) {
if (!partitionedLoopsSet.count(depth)) {
workgroupTileSizes[depth] = 0;
}
}
// Tile to have one element per thread.
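// The innermost partitioned loop gets a tile size of workgroupSize[0] (64
// here), so each workgroup covers 64 iterations, one per thread.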
for (int64_t depth = numLoops; depth > 0; depth--) {
if (partitionedLoopsSet.count(depth - 1)) {
workgroupTileSizes[depth - 1] = workgroupSize[0];
break;
}
}
tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute,
workgroupSize);
}
// Basic default properties for linalg ops that haven't been tuned.
static LogicalResult setRootDefaultConfig(FuncOp entryPoint, Operation *op) {
IREE::Codegen::DispatchLoweringPassPipeline passPipeline =
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute;
TileSizesListType tileSizes;
auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
auto partitionedLoops =
interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
if (partitionedLoops.empty()) {
tileSizes.push_back({});
return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
passPipeline, {1, 1, 1});
}
size_t numLoops = partitionedLoops.back() + 1;
// To get peak occupancy we need a workgroup size of at least two warps
std::array<int64_t, 3> workgroupSize = {2 * cudaWarpSize, 1, 1};
unsigned vectorSize = 4;
SmallVector<int64_t, 4> workgroupTileSizes(numLoops, 1);
// Set all non-partitioned loops to a tile size of zero.
llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(),
partitionedLoops.end());
for (auto depth : llvm::seq<int64_t>(0, numLoops)) {
if (!partitionedLoopsSet.count(depth)) {
workgroupTileSizes[depth] = 0;
}
}
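// Use a vector size of 4 only when every output has a projected-permutation
// indexing map, a static shape, and enough elements; e.g. a 128x384 output
// gives 49152 / (32 * 4) = 384 >= 64 so vectorSize stays 4, while a 32x32
// output gives 1024 / 128 = 8 < 64 and falls back to 1.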
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
for (auto outputOperand : enumerate(genericOp.getOutputOperands())) {
if (!genericOp.getTiedIndexingMap(outputOperand.value())
.isProjectedPermutation()) {
vectorSize = 1;
break;
}
ArrayRef<int64_t> shape = cast<linalg::LinalgOp>(op)
.getOutputOperand(outputOperand.index())
->get()
.getType()
.cast<ShapedType>()
.getShape();
if (llvm::any_of(shape, ShapedType::isDynamic)) {
vectorSize = 1;
break;
}
int64_t problemSize = std::accumulate(
shape.begin(), shape.end(), 1,
[](const int64_t &a, const int64_t &b) { return a * b; });
if ((problemSize / (cudaWarpSize * vectorSize)) < 64) {
vectorSize = 1;
break;
}
}
}
// Pick a vectorSize of 1 for ops that we know won't get vectorized.
// TODO(thomasraoux): This could be improved by checking if the linalg op
// would fail vectorization.
if (!isa<linalg::LinalgOp>(op)) vectorSize = 1;
// Set the innermost partitioned parallel loop to a tile size of
// workgroupSize[0] * vectorSize.
for (int64_t depth = numLoops; depth > 0; depth--) {
if (partitionedLoopsSet.count(depth - 1)) {
workgroupTileSizes[depth - 1] = workgroupSize[0] * vectorSize;
break;
}
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
// Tile reduction dimensions to 4 to allow doing load4 if the reduction
// dimension is the innermost dimension.
workgroupTileSizes.append(linalgOp.getNumReductionLoops(), 4);
}
tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUVectorize,
workgroupSize);
}
/// Propagate the configuration annotated in the incoming IR.
static LogicalResult setUserConfig(
FuncOp entryPointFn, Operation *computeOp,
IREE::Codegen::CompilationInfoAttr compilationInfo) {
if (auto translationInfo = getTranslationInfo(entryPointFn)) {
return computeOp->emitOpError(
"multiple ops within dispatch trying to set the translation "
"info");
}
SmallVector<int64_t> workgroupSize = compilationInfo.getWorkgroupSizeVals();
setTranslationInfo(entryPointFn, compilationInfo.getTranslationInfo(),
workgroupSize);
setLoweringConfig(computeOp, compilationInfo.getLoweringConfig());
eraseCompilationInfo(computeOp);
return success();
}
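/// Sets the configuration for the given root operation: user-provided
/// compilation info takes precedence, then op-specific heuristics
/// (contraction, FFT, sort), and finally the default config.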
static LogicalResult setRootConfig(FuncOp entryPointFn, Operation *computeOp) {
if (IREE::Codegen::CompilationInfoAttr compilationInfo =
getCompilationInfo(computeOp)) {
// If the op already has a lowering config coming from the IR, use it and
// bypass the heuristic.
return setUserConfig(entryPointFn, computeOp, compilationInfo);
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
if (linalg::isaContractionOpInterface(linalgOp) &&
linalgOp.getNumParallelLoops() >= 2) {
return setContractConfig(entryPointFn, linalgOp);
}
}
if (auto fftOp = dyn_cast<IREE::LinalgExt::FftOp>(computeOp)) {
return setFftConfig(entryPointFn, fftOp);
}
if (auto sortOp = dyn_cast<IREE::LinalgExt::SortOp>(computeOp)) {
return setSortConfig(entryPointFn, sortOp);
}
return setRootDefaultConfig(entryPointFn, computeOp);
}
namespace mlir {
namespace iree_compiler {
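/// Walks every entry point function in the module, picks a root operation,
/// derives its lowering configuration and translation info, and propagates
/// the configuration to the remaining compute ops.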
LogicalResult initGPULaunchConfig(ModuleOp moduleOp) {
llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps =
getAllEntryPoints(moduleOp);
for (auto funcOp : moduleOp.getOps<FuncOp>()) {
auto entryPointOp = entryPointOps.lookup(funcOp.getName());
if (!entryPointOp) continue;
if (getTranslationInfo(entryPointOp)) continue;
SmallVector<Operation *> computeOps;
SmallVector<LoopTilingAndDistributionInfo> tiledLoops;
if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) {
return funcOp.emitOpError("failed to get compute ops");
}
Operation *rootOperation = nullptr;
// Find the root operation. linalg.generic and linalg.fill are not root
// operations if there are other compute operations present.
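// For example, in a dispatch containing linalg.fill + linalg.matmul + an
// elementwise linalg.generic, the matmul becomes the root and the other ops
// inherit its configuration below.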
for (Operation *op : llvm::reverse(computeOps)) {
if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
rootOperation = op;
break;
}
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
// linalg.generic with `reduction` iterator types are roots as well.
if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) {
rootOperation = op;
break;
}
}
}
if (!rootOperation) {
for (Operation *op : llvm::reverse(computeOps)) {
if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
rootOperation = op;
break;
}
}
}
if (!rootOperation) {
// setTranslationInfo(
// funcOp,
// IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute,
// /*workloadPerWorkgroup=*/{}, {1, 1, 1});
// continue;
return funcOp.emitOpError("unable to find root operation");
}
if (failed(setRootConfig(funcOp, rootOperation))) continue;
// Propagate the configuration to the other ops.
// TODO(ravishankarm, thomasraoux): This is a very specific use (and fragile).
// In general, this should not be needed. Things are already tiled and
// distributed. The rest of the compilation must be structured to either use
// `TileAndFuse` or to derive an independent configuration for each op.
IREE::Codegen::LoweringConfigAttr config = getLoweringConfig(rootOperation);
for (auto op : computeOps) {
if (op == rootOperation) continue;
setLoweringConfig(op, config);
}
}
return success();
}
} // namespace iree_compiler
} // namespace mlir