// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Sandbox/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
#include "mlir/Dialect/Arithmetic/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/StandardOps/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
namespace mlir {
namespace iree_compiler {
/// Command line options used purely for development purposes. Not to be relied
/// on in any way.
static llvm::cl::opt<bool> clCheckIRBeforeLLVMConversion(
"iree-codegen-check-ir-before-llvm-conversion",
llvm::cl::desc("Runs the pass to check the IR generated from LLVMCPU "
"before conversion to LLVM IR"),
llvm::cl::init(false));
//===---------------------------------------------------------------------===//
// Default allocation functions for CPU backend
//===---------------------------------------------------------------------===//
// Default allocation function to use with IREE's bufferization.
static Value cpuAllocationFunction(OpBuilder &builder, Location loc,
ArrayRef<int64_t> staticShape,
Type elementType,
ArrayRef<Value> dynamicSizes) {
MemRefType allocType = MemRefType::get(staticShape, elementType);
return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
}
// Allocation callbacks to use with upstream comprehensive bufferization
static Optional<Value> cpuComprehensiveBufferizeAllocationFn(
OpBuilder &builder, Location loc, MemRefType memRefType,
ArrayRef<Value> dynamicSizes) {
return builder.create<memref::AllocaOp>(loc, memRefType, dynamicSizes)
.getResult();
}
static void cpuComprehensiveBufferizeDeallocationFn(OpBuilder &builder,
Location loc,
Value allocation) {
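  // Allocations produced by the matching allocation callback above are
  // memref.alloca (stack) allocations, so there is nothing to free here.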
return;
}
static void cpuComprehensiveBufferizeCopyFn(OpBuilder &builder, Location loc,
Value from, Value to) {
builder.create<linalg::CopyOp>(loc, from, to);
}
//===---------------------------------------------------------------------===//
// Codegen configuration verifications.
//===---------------------------------------------------------------------===//
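/// Verifies that the `lowering.config` and `translation.info` attached to
/// `op` are consistent with the CPUTensorToVectors pipeline: empty workgroup
/// size, a workload_per_wg that matches the partitioned loops, three levels
/// of tile sizes whose first level agrees with workload_per_wg, and a
/// native_vector_size that (when set) equals the last tiling level. A
/// minimal sketch of a call site, assuming the attributes were already read
/// off the dispatch entry point (illustrative only):
///   if (failed(verifyTensorToVectorsPassPipelineConfig(
///           op, loweringConfig, translationInfo, workgroupSize)))
///     return failure();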
LogicalResult verifyTensorToVectorsPassPipelineConfig(
Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize) {
if (!workgroupSize.empty()) {
return op->emitOpError(
"expected workgroup size to be empty for CPU pipelines");
}
// Verify that the translation info is using the right pipeline.
auto pipeline =
IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors;
StringRef pipelineName = stringifyEnum(pipeline);
if (translationInfo.getDispatchLoweringPassPipeline() != pipeline) {
return op->emitOpError("expected pipeline in translation.info to be ")
<< pipelineName;
}
// Verify that the workload per workgroup is set and is non-zero.
SmallVector<int64_t> workloadPerWorkgroup =
translationInfo.getWorkloadPerWorkgroupVals();
if (workloadPerWorkgroup.size() > kNumMaxParallelDims) {
return op->emitOpError("workload_per_wg size should be less than ")
<< kNumMaxParallelDims;
}
if (isa<linalg::LinalgOp, IREE::LinalgExt::TiledOpInterface>(op)) {
SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
if (workloadPerWorkgroup.size() != partitionedLoops.size()) {
return op->emitOpError("expected ")
<< partitionedLoops.size()
<< " entries for workload_per_wg, but got "
<< workloadPerWorkgroup.size();
}
}
if (llvm::any_of(workloadPerWorkgroup,
[](int64_t val) { return val == 0; })) {
return op->emitOpError("invalid to use 0 in workload_per_wg");
}
if (loweringConfig.getTileSizes().size() != 3) {
return op->emitOpError("expected three levels of tile sizes for ")
<< pipelineName << ", got " << loweringConfig.getTileSizes().size();
}
SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(0);
if (!firstLevelTileSizes.empty()) {
// Verify that if the first-level tile sizes are set, they are the same as
// workload_per_wg for the partitioned loops.
SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
size_t minElements =
(partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1);
if (firstLevelTileSizes.size() < minElements) {
return op->emitOpError("expected at least ")
<< minElements
<< " size for first level tiling to get the distribution fully "
"specified.";
}
llvm::SmallDenseSet<unsigned> partitionedLoopsSet;
partitionedLoopsSet.insert(partitionedLoops.begin(),
partitionedLoops.end());
SmallVector<int64_t> partitionedTileSizes;
for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) {
if (!partitionedLoopsSet.count(tileSize.index())) {
continue;
}
partitionedTileSizes.push_back(tileSize.value());
}
for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) {
if (val.value() != partitionedTileSizes[val.index()]) {
return op->emitOpError("mismatch in distributed tile size value ")
<< partitionedTileSizes[val.index()] << " at position "
<< val.index() << " and workload_per_wg value " << val.value();
}
}
}
  // Verify that the native vector size is either empty or, if set, the same
  // as the last level of tiling.
SmallVector<int64_t> nativeVectorSize =
loweringConfig.getNativeVectorSizeVals();
if (!nativeVectorSize.empty()) {
if (nativeVectorSize !=
loweringConfig.getTileSizeVals(
static_cast<unsigned>(TilingLevel::VectorTiles))) {
return op->emitOpError(
"native_vector_size must be same as the last level of tiling");
}
}
return success();
}
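/// Populates `passManager` with the tensor-to-vectors pipeline: tile and
/// vectorize Linalg ops on tensors, bufferize using stack allocations, and
/// then canonicalize loops and optimize vector transfers.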
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
bool lowerToVectors) {
passManager.addPass(createCanonicalizerPass());
// Tile and vectorize linalg ops on tensors.
passManager.addNestedPass<FuncOp>(
createLLVMCPUTileAndVectorizePass(lowerToVectors));
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
// Use stack allocation on CPU side.
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
}
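/// Populates `passManager` with the sandbox single-tiling-expert pipeline:
/// tile at the L1 level and vectorize in one pass, bufferize with stack
/// allocations, and finish by lowering vector ops.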
void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
// Add the sandbox single tiling expert to tile and vectorize.
{
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}
  // TODO(ravishankarm): This is commented out because it is still WIP; to be
  // enabled soon.
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
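/// Populates `passManager` with the sandbox double-tiling-expert pipeline:
/// a first tiling pass at the L1 level with loop interchange, a second
/// tiling pass at the vector level that pads and vectorizes, bufferization
/// with stack allocations, and vector lowering that splits vector transfers
/// into linalg.copy ops.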
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
{
passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
LinalgSingleTilingExpertPassOptions options;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
options.tileInterchange = {0, 2, 1};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
}
// Add the sandbox single tiling expert to tile and vectorize.
{
    // The options are derived from the sandbox codegen driver. The
    // hoistPaddings option does not work in IREE's case; it is fine to omit
    // it, since the generated IR already matches the sandbox.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.vectorizePadding = true;
options.pad = true;
options.packPaddings = {1, 1, 0};
// options.hoistPaddings = {5, 6, 0};
options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles);
options.tileInterchange = {0, 1, 2};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}
  // TODO(ravishankarm): This is commented out because it is still WIP; to be
  // enabled soon.
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "linalg-copy";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
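/// Populates `passManager` with the tile-fuse-and-vectorize pipeline: tile,
/// fuse, and optionally vectorize (controlled by `lowerToVectors`) Linalg
/// ops on tensors, then bufferize with stack allocations and clean up loops
/// and vector transfers.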
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
bool lowerToVectors) {
passManager.addPass(createCanonicalizerPass());
// Tile and vectorize linalg ops on tensors.
passManager.addNestedPass<FuncOp>(
createLLVMCPUTileFuseAndVectorizePass(lowerToVectors));
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
// Use stack allocation on CPU side.
  // TODO(ravishankarm): This is commented out because it is still WIP; to be
  // enabled soon.
//
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
}
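/// Populates `passManager` with the default CPU pipeline: canonicalization
/// followed by bufferization with stack allocations.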
void addCPUDefaultPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
// Use stack allocation on CPU side.
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
}
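/// Lowers the bufferized module the rest of the way to the LLVM dialect:
/// LinalgExt and Linalg ops to loops, SCF to CFG, tensor constants to
/// buffers, expansion of arith/std ops, and finally conversion to LLVM with
/// symbol visibility synchronized to the assigned LLVM linkage.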
static void addLowerToLLVMPasses(OpPassManager &passManager) {
// LinalgExt -> SCF
passManager.addNestedPass<FuncOp>(
IREE::LinalgExt::createLinalgExtToLoopsPass());
// Linalg -> SCF
passManager.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
// SCF -> STD
passManager.addNestedPass<FuncOp>(createLowerToCFGPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
if (clCheckIRBeforeLLVMConversion) {
passManager.addPass(createLLVMCPUCheckIRBeforeLLVMConversionPass());
}
  // Handle tensor-type constants.
passManager.addPass(createTensorConstantBufferizePass());
passManager.addPass(createFoldTensorExtractOpPass());
// (HAL, IREE, Linalg, STD) -> LLVM
passManager.addNestedPass<FuncOp>(arith::createArithmeticExpandOpsPass());
passManager.addNestedPass<FuncOp>(createStdExpandOpsPass());
passManager.addPass(createConvertToLLVMPass());
// We rely on MLIR symbol visibility being correct after this point and need
// to mirror the LLVM linkage that was assigned during conversion.
passManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
}
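/// Entry point used by the LLVMCPU target backend: runs the executable
/// target lowering pass (which selects one of the pipelines above) and then
/// lowers the nested module to the LLVM dialect. A minimal sketch of usage,
/// assuming the backend has already built a pass manager anchored on the
/// executable variant op (illustrative only):
///   OpPassManager &pm = ...;
///   buildLLVMCPUCodegenPassPipeline(pm);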
void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager) {
passManager.addPass(createLLVMCPULowerExecutableTargetPass());
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
addLowerToLLVMPasses(nestedModulePM);
}
} // namespace iree_compiler
} // namespace mlir