// blob: f4f1a9b1217d44254ba3fe78d066042adb24581b [file] [log] [blame]
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Dialect/Shape/Transforms/Passes.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
#include "mlir/Dialect/Arithmetic/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/StandardOps/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/Passes.h"
namespace mlir {
namespace iree_compiler {
/// Allocation callback used when bufferizing for the GPU backend.
///
/// Creates a `memref.alloc` whose result type is built from `staticShape` and
/// `elementType`, with `dynamicSizes` supplying the operands for any dynamic
/// dimensions. The op is inserted through `builder` at `loc` and its result is
/// returned.
static Value gpuAllocationFunction(OpBuilder &builder, Location loc,
                                   ArrayRef<int64_t> staticShape,
                                   Type elementType,
                                   ArrayRef<Value> dynamicSizes) {
  // The buffer is placed in memory space 3.
  // NOTE(review): presumably the GPU workgroup/shared address space -- confirm
  // against the NVVM/ROCDL lowering.
  return builder.create<memref::AllocOp>(
      loc, MemRefType::get(staticShape, elementType, {}, 3), dynamicSizes);
}
/// Appends the GPU vectorization pipeline to `pm`: clean up, distribute linalg
/// ops onto threads, clean up again, then convert Linalg to vector ops and
/// optimize the resulting vector transfers.
void addGPUVectorizationPassPipeline(OpPassManager &pm) {
  // Canonicalizer followed by CSE, added directly on `pm`.
  auto addCleanup = [&pm]() {
    pm.addPass(createCanonicalizerPass());
    pm.addPass(createCSEPass());
  };

  // Initial clean up.
  addCleanup();

  // Distribute linalg onto threads within the workgroup.
  pm.addNestedPass<FuncOp>(createLLVMGPUTileAndDistribute());
  addCleanup();
  pm.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());

  // Linalg -> vector, followed by a per-function clean up.
  pm.addNestedPass<FuncOp>(createLLVMGPUVectorizationPass());
  pm.addNestedPass<FuncOp>(createCanonicalizerPass());
  pm.addNestedPass<FuncOp>(createCSEPass());
  pm.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());
}
/// Appends the SIMT matmul pipeline to `pm`: clean up, distribute linalg ops
/// onto threads (including the shared memory copies), clean up again, convert
/// Linalg to vector ops, and finally pipeline the memory operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &pm) {
  // Canonicalizer followed by CSE, added directly on `pm`.
  auto addCleanup = [&pm]() {
    pm.addPass(createCanonicalizerPass());
    pm.addPass(createCSEPass());
  };

  // Initial clean up.
  addCleanup();

  // Distribute linalg onto threads within the workgroup.
  pm.addNestedPass<FuncOp>(createLLVMGPUTileAndDistribute());
  pm.addNestedPass<FuncOp>(createLLVMGPUDistributeSharedMemoryCopy());
  addCleanup();
  pm.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());

  // Linalg -> vector, followed by a per-function clean up.
  pm.addNestedPass<FuncOp>(createLLVMGPUVectorizationPass());
  pm.addNestedPass<FuncOp>(createCanonicalizerPass());
  pm.addNestedPass<FuncOp>(createCSEPass());
  pm.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());

  // Pipeline memory operations.
  pm.addNestedPass<FuncOp>(createLLVMGPUPipeliningPass());
}
/// Appends the tensor-core matmul pipeline to `pm`: clean up, distribute
/// linalg ops onto warps (including the shared memory copies), convert Linalg
/// to vector ops, convert vector ops to MMA ops, and finally pipeline the
/// memory operations.
void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm) {
  // Canonicalizer followed by CSE, added directly on `pm`.
  auto addCleanup = [&pm]() {
    pm.addPass(createCanonicalizerPass());
    pm.addPass(createCSEPass());
  };

  // Initial clean up.
  addCleanup();

  // Distribute linalg onto warps within the workgroup.
  pm.addNestedPass<FuncOp>(
      createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
  pm.addNestedPass<FuncOp>(createLLVMGPUDistributeSharedMemoryCopy());
  addCleanup();
  pm.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());

  // Linalg -> vector, followed by a per-function clean up.
  pm.addNestedPass<FuncOp>(createLLVMGPUTensorCoreVectorizationPass());
  pm.addNestedPass<FuncOp>(createCanonicalizerPass());
  pm.addNestedPass<FuncOp>(createCSEPass());
  pm.addNestedPass<FuncOp>(createOptimizeVectorTransferPass());

  // Vector -> MMA ops.
  pm.addNestedPass<FuncOp>(memref::createFoldSubViewOpsPass());
  pm.addNestedPass<FuncOp>(createConvertVectorToGPUPass());
  addCleanup();

  // Pipeline memory operations.
  pm.addNestedPass<FuncOp>(createLLVMGPUPipeliningPass());
}
/// Appends the simple distribution pipeline to `pm`: clean up, distribute
/// linalg ops onto threads, clean up again, and remove single-iteration loops.
/// No vectorization passes are added here.
void addGPUSimpleDistributePassPipeline(OpPassManager &pm) {
  // Canonicalizer followed by CSE, added directly on `pm`.
  auto addCleanup = [&pm]() {
    pm.addPass(createCanonicalizerPass());
    pm.addPass(createCSEPass());
  };

  // Initial clean up.
  addCleanup();

  // Distribute linalg onto threads within the workgroup.
  pm.addNestedPass<FuncOp>(createLLVMGPUTileAndDistribute());
  addCleanup();
  pm.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
}
/// Appends to `pm` the passes that lower the remaining Linalg/LinalgExt, SCF,
/// vector, affine and standard ops, ending with the conversion to ROCDL when
/// `useROCM` is true and to NVVM otherwise.
static void addLowerToLLVMGPUPasses(OpPassManager &pm, bool useROCM) {
  // Canonicalizer followed by CSE, nested on each FuncOp.
  auto addFuncCleanup = [&pm]() {
    pm.addNestedPass<FuncOp>(createCanonicalizerPass());
    pm.addNestedPass<FuncOp>(createCSEPass());
  };

  pm.addPass(createLowerAffinePass());
  pm.addPass(createCanonicalizerPass());
  pm.addPass(createCSEPass());

  // LinalgExt -> SCF
  pm.addNestedPass<FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());

  // Linalg -> SCF
  pm.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
  addFuncCleanup();

  // Handle tensor-type constants.
  pm.addPass(createTensorConstantBufferizePass());
  pm.addPass(createFoldTensorExtractOpPass());

  pm.addNestedPass<FuncOp>(createLLVMGPUVectorLoweringPass());

  // SCF -> STD
  pm.addNestedPass<FuncOp>(createLowerToCFGPass());
  addFuncCleanup();

  pm.addNestedPass<FuncOp>(arith::createArithmeticExpandOpsPass());
  pm.addNestedPass<FuncOp>(createStdExpandOpsPass());
  // Affine lowering is run a second time at this point in the pipeline.
  pm.addPass(createLowerAffinePass());

  // Strip the kernel's debug info: the CUDA driver doesn't digest PTX debug
  // info well.
  pm.addPass(createStripDebugInfoPass());

  // Final conversion to the target-specific dialect.
  if (useROCM) {
    // Convert to ROCDL.
    pm.addPass(createConvertToROCDLPass());
    return;
  }
  // Convert to NVVM.
  pm.addPass(createConvertToNVVMPass());
}
/// Builds the full LLVMGPU transformation pipeline on `pm`.
///
/// In order: Linalg bufferization on a nested ModuleOp (allocating through
/// `gpuAllocationFunction`), the LLVMGPU lower-executable-target pass on `pm`
/// itself, and then the lowering passes on a second nested ModuleOp.
/// `useROCM` selects ROCDL instead of NVVM as the final conversion target.
void buildLLVMGPUTransformPassPipeline(OpPassManager &pm, bool useROCM) {
  // Bufferize inside a nested module pass manager.
  addLinalgBufferizePasses(pm.nest<ModuleOp>(), gpuAllocationFunction);

  pm.addPass(createLLVMGPULowerExecutableTargetPass());

  //===--------------------------------------------------------------------===//
  // Convert Linalg ops to LLVM+NVVM/ROCDL ops.
  //
  // Post-conditions:
  //   - All Linalg/Loops/GPU/Affine/Standard ops are converted away.
  //   - The module contains the final llvm.module ready to be serialized.
  //===--------------------------------------------------------------------===//
  addLowerToLLVMGPUPasses(pm.nest<ModuleOp>(), useROCM);
}
} // namespace iree_compiler
} // namespace mlir