// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Func/Transforms/Passes.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/Passes.h"
#define DEBUG_TYPE "iree-llvm-gpu-lowering-pass-pipeline"
namespace mlir {
namespace iree_compiler {
static llvm::cl::opt<unsigned> logSwizzleTile(
"iree-codegen-log-swizzle-tile", llvm::cl::desc("log swizzle tile value"),
llvm::cl::init(0));
/// Flag used for the transition from wmma to mma.sync. Once we have better
/// performance with mma.sync we can drop wmma support and remove this flag.
llvm::cl::opt<bool> llvmgpuUseMMASync(
"iree-codegen-llvmgpu-use-mma-sync",
llvm::cl::desc("use mma sync instead of wmma ops"), llvm::cl::init(false));
static FailureOr<Value> gpuAllocationFn(OpBuilder &builder, Location loc,
MemRefType memRefType,
ValueRange dynamicSizes,
unsigned alignment) {
auto workgroupSpace = gpu::AddressSpaceAttr::get(
builder.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
MemRefType allocType =
MemRefType::get(memRefType.getShape(), memRefType.getElementType(),
AffineMap(), workgroupSpace);
return builder.create<memref::AllocOp>(loc, allocType, dynamicSizes)
.getResult();
}
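
// Deallocation hook for bufferization: workgroup memory is reclaimed
// implicitly when the kernel finishes, so no deallocation op is emitted.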
static LogicalResult gpuDeallocationFn(OpBuilder &builder, Location loc,
Value allocation) {
return success();
}
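
// Copy hook for bufferization: emits a memref.copy and, when either side
// lives in workgroup memory, wraps it in gpu.barrier ops and tags it with the
// copy-to-workgroup-memory marker so the copy can be distributed later.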
static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from,
Value to) {
auto fromType = from.getType().cast<MemRefType>();
auto toType = to.getType().cast<MemRefType>();
bool needsBarrier = false;
if (auto attr =
fromType.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>()) {
if (attr.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace())
needsBarrier = true;
}
if (auto attr =
toType.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>()) {
if (attr.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace())
needsBarrier = true;
}
if (needsBarrier) builder.create<gpu::BarrierOp>(loc);
Operation *copy = builder.create<memref::CopyOp>(loc, from, to);
if (needsBarrier) {
setMarker(copy, getCopyToWorkgroupMemoryMarker());
builder.create<gpu::BarrierOp>(loc);
}
return success();
}
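
// Runs IREE comprehensive bufferization with the GPU-specific allocation,
// deallocation, and copy hooks defined above.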
static void addBufferizePasses(OpPassManager &passManager) {
BufferizationOptions::AllocationFn allocationFn = gpuAllocationFn;
BufferizationOptions::DeallocationFn deallocationFn = gpuDeallocationFn;
BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn;
addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
memcpyFn);
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
// TODO: Remove the following pass and plumb support for #hal.descriptor_type
// memory space through the stack.
passManager.addNestedPass<func::FuncOp>(
createEraseHALDescriptorTypeFromMemRefPass());
}
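
// Tiles the dispatch and distributes it to workgroups, decomposes attention
// and softmax ops, and converts the result to destination-passing style.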
static void tileAndDistributeToWorkgroup(
OpPassManager &pm, bool useWARForCooperativeMatrixCodegen = false) {
pm.addPass(createTileAndDistributeToWorkgroupsPass());
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createTileAndDecomposeAttentionPass());
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createDecomposeSoftmaxPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass(
useWARForCooperativeMatrixCodegen));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
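
// Convenience helper: tile and distribute to workgroups, then bufferize.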
static void tileAndBufferize(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm, /*useWARForCooperativeMatrixCodegen=*/true);
auto &nestedModulePM = pm.nest<ModuleOp>();
addBufferizePasses(nestedModulePM);
}
//===---------------------------------------------------------------------===//
// Codegen pipelines.
//===---------------------------------------------------------------------===//
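
// Each pipeline below prepares tiled-and-distributed code for the common
// addLowerToLLVMGPUPasses lowering at the end of this file; the pipeline that
// runs for a given dispatch is presumably selected by
// LLVMGPULowerExecutableTargetPass based on the chosen lowering configuration.

// Generic vectorization pipeline: specializes workgroups, tiles to threads,
// vectorizes linalg ops, bufferizes, and then runs post-bufferization cleanups
// (LICM, memref alias folding, vector transfer optimization).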
void addGPUVectorizationPassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createWorkgroupSpecializationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Distribute linalg onto threads within the workgroup.
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Linalg -> vector
nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
// tensor to memref
addBufferizePasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute());
// Post-bufferization optimizations.
nestedModulePM.addNestedPass<func::FuncOp>(
createLoopInvariantCodeMotionPass());
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createOptimizeVectorTransferPass());
}
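
// Matmul pipeline for targets without tensor cores: promotes operands to
// shared memory, tiles to threads, vectorizes, distributes shared-memory
// copies, reduces bank conflicts, swizzles workgroups, and software-pipelines
// memory operations.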
void addGPUMatmulSimtPassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createWorkgroupSpecializationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTensorAlloc());
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false));
// Linalg -> vector
nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
// tensor to memref
addBufferizePasses(nestedModulePM);
// Distribute foreach-thread loops onto GPU threads.
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute());
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());
nestedModulePM.addNestedPass<func::FuncOp>(
createWorkGroupSwizzle(logSwizzleTile));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Even though we vectorize before bufferization, we are not able to hoist
// accumulator loads/stores out of the K loop until distribution. Therefore we
// still rely on buffer-level transformations for transfer op hoisting and
// store-to-load forwarding. This relies on shaky alias analysis, and we need
// to move this to the tensor level once we have better abstractions.
nestedModulePM.addNestedPass<func::FuncOp>(
createOptimizeVectorTransferPass());
// Pipeline memory operations.
nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass());
}
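
// Matmul pipeline using tensor core ops (wmma by default, mma.sync with
// --iree-codegen-llvmgpu-use-mma-sync): distributes to warps, multi-buffers
// shared memory when pipelineDepth > 1, vectorizes to MMA ops via
// LLVMGPUVectorToGPU, and software-pipelines memory operations with the given
// depth.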
void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
unsigned pipelineDepth) {
tileAndBufferize(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
// Distribute linalg onto warps within the workgroup.
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
if (pipelineDepth > 1)
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUMultiBuffering(pipelineDepth));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createWorkGroupSwizzle(logSwizzleTile));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Linalg -> vector
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorCoreVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createOptimizeVectorTransferPass());
// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
if (!llvmgpuUseMMASync) {
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());
}
// Vector -> MMA ops
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorToGPU());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Pipeline memory operations.
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUPipeliningPass(/*epiloguePeeling=*/false, pipelineDepth));
}
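
// Transpose pipeline: stages data through shared memory (LLVMGPUTensorAlloc
// with the TransposeOpPattern), vectorizes, and pads shared memory allocations
// by 32 bits to reduce bank conflicts.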
void addGPUTransposePassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createWorkgroupSpecializationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorAlloc(GPUPromoteSharedMemPattern::TransposeOpPattern));
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false));
// Linalg -> vector
nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createOptimizeVectorTransferPass());
// tensor to memref
addBufferizePasses(nestedModulePM);
// Distribute foreach-thread loops onto GPU threads.
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute());
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// May or may not need to reduce shared memory bank conflicts.
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts(/*paddingSizeBits=*/32));
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
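
// Reduction pipeline: rematerializes parallel ops, tiles reductions,
// vectorizes with a large max vector size, bufferizes, and lowers vector
// reductions to warp-level GPU operations.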
void addGPUWarpReductionPassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createRematerializeParallelOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addNestedPass<func::FuncOp>(createGPUTileReductionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
// Linalg -> vector
nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass(
/*generateContract=*/false, /*maxVectorSize=*/16384));
nestedModulePM.addNestedPass<func::FuncOp>(
createLoopInvariantCodeMotionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
addBufferizePasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(
createOptimizeVectorTransferPass());
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createLoopInvariantCodeMotionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
// vector -> simt gpu + vector
nestedModulePM.addNestedPass<func::FuncOp>(
createConvertVectorReductionToGPUPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
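
// Fallback pipeline: tiles, bufferizes, and distributes to threads without
// vectorization.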
void addGPUSimpleDistributePassPipeline(OpPassManager &pm) {
tileAndBufferize(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
// Distribute linalg onto threads within the workgroup.
nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileAndDistribute());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
}
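
// Common lowering from the output of the pipelines above down to the target
// LLVM dialect: lowers LinalgExt/Linalg to loops, affine to SCF, SCF to CF,
// expands math/arith/memref ops, strips debug info, and converts to NVVM or
// ROCDL depending on `useROCM`.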
static void addLowerToLLVMGPUPasses(OpPassManager &pm, bool useROCM) {
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
// LinalgExt -> SCF
pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
// Linalg -> SCF
pm.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
pm.addNestedPass<func::FuncOp>(createCSEPass());
// Pad allocations with dynamic dimensions before lowering SCF and affine, but
// after linalg lowering.
pm.addNestedPass<func::FuncOp>(createPadDynamicAlloc());
pm.addPass(createLowerAffinePass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
// Handle tensor-type constants.
pm.addPass(arith::createConstantBufferizePass());
pm.addPass(createFoldTensorExtractOpPass());
pm.addNestedPass<func::FuncOp>(createLLVMGPUVectorLoweringPass());
// SCF -> CF
pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
pm.addNestedPass<func::FuncOp>(createCSEPass());
// Math dialect elementary functions -> polynomial form.
pm.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
pm.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass());
pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
pm.addPass(memref::createExpandStridedMetadataPass());
pm.addPass(createLowerAffinePass());
pm.addPass(createGPULowerMemorySpaceAttributesPass());
// Strip out the debug info for the kernel, as the CUDA driver doesn't digest
// PTX debug info well.
pm.addPass(createStripDebugInfoPass());
if (useROCM) {
// Convert to ROCDL.
pm.addPass(createConvertToROCDLPass());
} else {
// Convert to NVVM.
pm.addPass(createConvertToNVVMPass());
}
}
extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectFileName;
extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugPayloadTag;
extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugTransformTag;
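
// Runs the transform dialect interpreter with the schedule supplied through
// the flags above, then drops the schedule from the module.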
void addGPUTransformDialectPasses(OpPassManager &passManager) {
// Give control to the transform dialect.
passManager.addPass(
mlir::iree_compiler::createTransformDialectInterpreterPass(
clGPUCodegenTransformDialectFileName,
clGPUCodegenTransformDialectDebugPayloadTag,
clGPUCodegenTransformDialectDebugTransformTag));
// Dropping the schedule is needed:
// 1. if we want to embed the transform in the module, the schedule should be
//    dropped once it has been applied;
// 2. if transform.do_not_dce_operands ops are introduced.
passManager.addPass(createDropSchedulePass());
}
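
// Entry point for the LLVMGPU backend: propagates types, bufferizes copy-only
// dispatches, runs the executable-target lowering (which is expected to pick
// one of the pipelines above), and finally lowers the nested module to
// LLVM+NVVM/ROCDL.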
void buildLLVMGPUTransformPassPipeline(OpPassManager &pm, bool useROCM) {
pm.nest<ModuleOp>().nest<func::FuncOp>().addPass(createTypePropagationPass());
pm.nest<ModuleOp>().addPass(createBufferizeCopyOnlyDispatchesPass());
// TODO: Remove the following pass and plumb support for #hal.descriptor_type
// memory space through the stack.
pm.nest<ModuleOp>().addNestedPass<func::FuncOp>(
createEraseHALDescriptorTypeFromMemRefPass());
pm.addPass(createLLVMGPULowerExecutableTargetPass());
OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
//===--------------------------------------------------------------------===//
// Convert Linalg ops to LLVM+NVVM/ROCDL ops.
//
// Post-conditions:
// - All Linalg/Loops/GPU/Affine/Standard ops are converted away.
// - The module contains the final llvm.module ready to be serialized.
//===--------------------------------------------------------------------===//
addLowerToLLVMGPUPasses(nestedModulePM, useROCM);
LLVM_DEBUG({
llvm::dbgs() << "Using LLVMGPU pass pipeline:\n";
pm.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
} // namespace iree_compiler
} // namespace mlir