// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- MaliConfig.cpp - Mali CodeGen Configurations -----------------------===//
//
// This file contains CodeGen configurations for Mali GPUs.
//
//===----------------------------------------------------------------------===//

#include <array>

#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"

namespace mlir {
namespace iree_compiler {
namespace detail {
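
/// A tile size and workgroup size pair for distributing an op to workgroups.
/// For example, {{16, 64, 4}, {8, 2, 1}} gives each workgroup a 16x64 output
/// tile that steps through the reduction dimension in chunks of 4, computed
/// by 8x2x1 = 16 invocations.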
struct TileWorkgroupSizePair {
// How many scalar elements each workgroup should handle along each dimension.
std::array<int64_t, 3> tileSize;
// The number of threads per workgroup along each dimension.
std::array<int64_t, 3> workgroupSize;
};
//===----------------------------------------------------------------------===//
// Matmul
//===----------------------------------------------------------------------===//
/// Writes preferred matmul workgroup tile sizes and workgroup sizes into
/// `pairs` for the given matmul `scale` (the product of the untiled output
/// dimensions) and `elementType`.
static void getMatmulTileAndWorkgroupSizes(
int64_t scale, Type elementType,
SmallVectorImpl<TileWorkgroupSizePair> &pairs) {
if (elementType.isF16()) {
const int64_t smallMatrixSizeThreshold = 512 * 512;
// For a smaller destination size we cannot fill the GPU with larger tile
// sizes. Instead we pick smaller tiles along M and N to increase the number
// of workgroups, and a larger K tile size since register pressure is lower
// and the extra instructions help hide latency.
// TODO: The threshold needs to be fine tuned by doing exploration based on
// matrix shapes.
if (scale <= smallMatrixSizeThreshold) {
pairs.push_back(TileWorkgroupSizePair({{16, 32, 16}, {8, 2, 1}}));
} else {
pairs.push_back(TileWorkgroupSizePair({{16, 64, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{8, 128, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 32, 4}, {8, 2, 1}}));
}
return;
}
// TODO: Heuristic picked based on MobileNet performance. We need
// auto-tuning to be able to make a smarter choice.
const int64_t smallMatrixSizeThreshold = 20000;
if (scale <= smallMatrixSizeThreshold) {
pairs.push_back(TileWorkgroupSizePair({{4, 32, 16}, {8, 2, 1}}));
}
pairs.push_back(TileWorkgroupSizePair({{12, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{14, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{10, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{7, 64, 4}, {16, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{8, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{6, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{24, 16, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 16, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{24, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{40, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{32, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 16}, {8, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 8}, {8, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 4}, {8, 1, 1}}));
}

/// Sets the CodeGen configuration for the given batch matmul op.
static LogicalResult setOpConfig(linalg::BatchMatmulOp op) {
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
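// Bail out on dynamic shapes; returning success without setting a
// configuration lets the default heuristics handle the op.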
if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
llvm::any_of(rhsShape, ShapedType::isDynamic)) {
return success();
}
// Get a vector of candidate tile sizes, ordered from best to worst.
Type elementType =
op.inputs()[0].getType().cast<ShapedType>().getElementType();
int64_t matmulScale = lhsShape[0] * lhsShape[1] * rhsShape[2];
SmallVector<TileWorkgroupSizePair, 4> pairs;
getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
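// Pick the first pair whose tile sizes evenly divide the static M, N, and K
// dimensions; the batch dimension is tiled by 1 and always divides.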
for (TileWorkgroupSizePair pair : pairs) {
if (lhsShape[1] % pair.tileSize[0] != 0 ||
rhsShape[2] % pair.tileSize[1] != 0 ||
lhsShape[2] % pair.tileSize[2] != 0) {
continue;
}
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
SmallVector<int64_t, 4> numElementsPerWorkgroup = {
    1, pair.tileSize[0], pair.tileSize[1], pair.tileSize[2]};
TileSizesListType tileSizes;
// Workgroup level.
tileSizes.push_back(numElementsPerWorkgroup);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
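// Workgroup dimension x is distributed along N and y along M, hence the
// swapped workgroupSize indices below.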
tileSizes.push_back({numElementsPerWorkgroup[0],
numElementsPerWorkgroup[1] / pair.workgroupSize[1],
numElementsPerWorkgroup[2] / pair.workgroupSize[0],
numElementsPerWorkgroup[3]});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
pair.workgroupSize);
}
return success();
}
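
/// Sets the CodeGen configuration for the given matmul op.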
static LogicalResult setOpConfig(linalg::MatmulOp op) {
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
llvm::any_of(rhsShape, ShapedType::isDynamic)) {
return success();
}
Type elementType =
op.inputs()[0].getType().cast<ShapedType>().getElementType();
int64_t matmulScale = lhsShape[0] * rhsShape[1];
SmallVector<TileWorkgroupSizePair, 4> pairs;
getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
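// Pick the first pair whose tile sizes evenly divide M, N, and K. For
// example (illustrative), a static 512x512x512 f32 matmul has scale
// 512 * 512 > 20000, selects {{8, 32, 4}, {8, 2, 1}} as the first evenly
// dividing pair, and thus gives each invocation a 4x4x4 tile.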
for (TileWorkgroupSizePair pair : pairs) {
if (lhsShape[0] % pair.tileSize[0] != 0 ||
rhsShape[1] % pair.tileSize[1] != 0 ||
lhsShape[1] % pair.tileSize[2] != 0) {
continue;
}
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
SmallVector<int64_t, 4> numElementsPerWorkgroup(pair.tileSize.begin(),
pair.tileSize.end());
TileSizesListType tileSizes;
// Workgroup level.
tileSizes.push_back(numElementsPerWorkgroup);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
tileSizes.push_back({numElementsPerWorkgroup[0] / pair.workgroupSize[1],
numElementsPerWorkgroup[1] / pair.workgroupSize[0],
numElementsPerWorkgroup[2]});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
pair.workgroupSize);
}
return success();
}

//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
LogicalResult setMaliCodeGenConfig(const spirv::TargetEnv &targetEnv,
Operation *rootOp) {
int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
[](auto op) { return setOpConfig(op); })
.Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/16);
})
.Case<linalg::DepthwiseConv2DNhwOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/16);
})
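// Ops without a Mali-specific configuration fall through and use the
// default configuration.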
.Default([](Operation *) { return success(); });
}

} // namespace detail
} // namespace iree_compiler
} // namespace mlir