| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/Passes.h" |
| |
| #include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" |
| #include "iree-dialects/Dialect/LinalgTransform/Passes.h" |
| #include "iree/compiler/Codegen/PassDetail.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" |
| #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" |
| #include "mlir/Conversion/VectorToGPU/VectorToGPU.h" |
| #include "mlir/Dialect/Arith/Transforms/Passes.h" |
| #include "mlir/Dialect/Func/Transforms/Passes.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/GPU/Transforms/Passes.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/MemRef/Transforms/Passes.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Pass/PassOptions.h" |
| #include "mlir/Pass/PassRegistry.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #define DEBUG_TYPE "iree-llvm-gpu-lowering-pass-pipeline" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| static llvm::cl::opt<unsigned> logSwizzleTile( |
| "iree-codegen-log-swizzle-tile", llvm::cl::desc("log swizzle tile value"), |
| llvm::cl::init(0)); |
| |
/// Flag used for the transition from wmma to mma.sync. Once we have better
/// performance with mma.sync, we can drop wmma support and remove this flag.
| llvm::cl::opt<bool> llvmgpuUseMMASync( |
| "iree-codegen-llvmgpu-use-mma-sync", |
| llvm::cl::desc("use mma sync instead of wmma ops"), llvm::cl::init(false)); |
| |
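/// Allocation callback used during bufferization: creates a memref.alloc in
/// the GPU workgroup (shared) memory address space. The requested alignment is
/// currently unused.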
| static FailureOr<Value> gpuAllocationFn(OpBuilder &builder, Location loc, |
| MemRefType memRefType, |
| ValueRange dynamicSizes, |
| unsigned alignment) { |
| auto workgroupSpace = gpu::AddressSpaceAttr::get( |
| builder.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace()); |
| MemRefType allocType = |
| MemRefType::get(memRefType.getShape(), memRefType.getElementType(), |
| AffineMap(), workgroupSpace); |
| return builder.create<memref::AllocOp>(loc, allocType, dynamicSizes) |
| .getResult(); |
| } |
| |
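/// Deallocation callback used during bufferization: workgroup memory needs no
/// explicit deallocation, so this is a no-op.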
| static LogicalResult gpuDeallocationFn(OpBuilder &builder, Location loc, |
| Value allocation) { |
| return success(); |
| } |
| |
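/// Copy callback used during bufferization: emits a memref.copy and, when
/// either side lives in workgroup memory, brackets it with gpu.barrier ops and
/// tags it with the copy-to-workgroup-memory marker so later passes can
/// distribute the copy across threads.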
| static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from, |
| Value to) { |
| auto fromType = from.getType().cast<MemRefType>(); |
| auto toType = to.getType().cast<MemRefType>(); |
| |
| bool needsBarrier = false; |
| if (auto attr = |
| fromType.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>()) { |
| if (attr.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace()) |
| needsBarrier = true; |
| } |
| if (auto attr = |
| toType.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>()) { |
| if (attr.getValue() == gpu::GPUDialect::getWorkgroupAddressSpace()) |
| needsBarrier = true; |
| } |
| if (needsBarrier) builder.create<gpu::BarrierOp>(loc); |
| Operation *copy = builder.create<memref::CopyOp>(loc, from, to); |
| if (needsBarrier) { |
| setMarker(copy, getCopyToWorkgroupMemoryMarker()); |
| builder.create<gpu::BarrierOp>(loc); |
| } |
| return success(); |
| } |
| |
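/// Runs IREE comprehensive bufferization with the GPU allocation,
/// deallocation, and copy callbacks above, followed by cleanup passes.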
| static void addBufferizePasses(OpPassManager &passManager) { |
| BufferizationOptions::AllocationFn allocationFn = gpuAllocationFn; |
| BufferizationOptions::DeallocationFn deallocationFn = gpuDeallocationFn; |
| BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn; |
| addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn, |
| memcpyFn); |
| passManager.addPass(createCanonicalizerPass()); |
| passManager.addPass(createCSEPass()); |
  // TODO: Remove the following pass and plumb support for the
  // #hal.descriptor_type memory space through the stack.
| passManager.addNestedPass<func::FuncOp>( |
| createEraseHALDescriptorTypeFromMemRefPass()); |
| } |
| |
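/// Tiles and distributes the dispatch to workgroups, decomposes attention and
/// softmax ops, and rewrites the body into destination-passing style.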
| static void tileAndDistributeToWorkgroup( |
| OpPassManager &pm, bool useWARForCooperativeMatrixCodegen = false) { |
| pm.addPass(createTileAndDistributeToWorkgroupsPass()); |
| |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| IREE::LinalgExt::createTileAndDecomposeAttentionPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| IREE::LinalgExt::createDecomposeSoftmaxPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createConvertToDestinationPassingStylePass( |
| useWARForCooperativeMatrixCodegen)); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| } |
| |
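/// Convenience helper that tiles/distributes to workgroups and then
/// bufferizes.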
| static void tileAndBufferize(OpPassManager &pm) { |
| tileAndDistributeToWorkgroup(pm, /*useWARForCooperativeMatrixCodegen=*/true); |
| |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| addBufferizePasses(nestedModulePM); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Codegen pipelines. |
| //===---------------------------------------------------------------------===// |
| |
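// Simple vectorization pipeline: tile to threads, vectorize linalg ops,
// bufferize, and distribute onto the GPU thread hierarchy.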
| void addGPUVectorizationPassPipeline(OpPassManager &pm) { |
| tileAndDistributeToWorkgroup(pm); |
| |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createWorkgroupSpecializationPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| // Distribute linalg onto threads within the workgroup. |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false)); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| |
| // Linalg -> vector |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
| // tensor to memref |
| addBufferizePasses(nestedModulePM); |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute()); |
| |
  // Post-bufferization optimizations.
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLoopInvariantCodeMotionPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| memref::createFoldMemRefAliasOpsPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createOptimizeVectorTransferPass()); |
| } |
| |
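// Matmul pipeline using SIMT (non tensor core) execution: promotes operands to
// shared memory, tiles and vectorizes, distributes the shared memory copies
// across threads, swizzles workgroups, and pipelines memory operations.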
| void addGPUMatmulSimtPassPipeline(OpPassManager &pm) { |
| tileAndDistributeToWorkgroup(pm); |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createWorkgroupSpecializationPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTensorAlloc()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false)); |
| |
| // Linalg -> vector |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
| // tensor to memref |
| addBufferizePasses(nestedModulePM); |
| |
  // Distribute foreach-thread regions onto GPU threads.
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUDistributeSharedMemoryCopy()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUReduceSharedMemoryBankConflicts()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createWorkGroupSwizzle(logSwizzleTile)); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| memref::createFoldMemRefAliasOpsPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
  // Even though we vectorize before bufferization, we are not able to hoist
  // accumulator load/store out of the K loop until distribution. Therefore we
  // still rely on buffer-level transformations for transfer op hoisting and
  // store-to-load forwarding. This relies on shaky alias analysis, and we need
  // to move this to the tensor level once we have better abstractions.
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createOptimizeVectorTransferPass()); |
| |
| // Pipeline memory operations. |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass()); |
| } |
| |
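// Matmul pipeline targeting tensor core (wmma or mma.sync) instructions:
// distributes to warps, multi-buffers shared memory when the pipeline depth
// exceeds one, lowers vectors to MMA ops, and pipelines memory operations.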
| void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm, |
| unsigned pipelineDepth) { |
| tileAndBufferize(pm); |
| |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| // Distribute linalg onto warps within the workgroup. |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true)); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| if (pipelineDepth > 1) |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUMultiBuffering(pipelineDepth)); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createWorkGroupSwizzle(logSwizzleTile)); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| // Linalg -> vector |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLLVMGPUTensorCoreVectorizationPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createOptimizeVectorTransferPass()); |
| |
| // Distribute shared memory copies. |
| nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUDistributeSharedMemoryCopy()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| if (!llvmgpuUseMMASync) { |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUReduceSharedMemoryBankConflicts()); |
| } |
| |
| // Vector -> MMA ops |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| memref::createFoldMemRefAliasOpsPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorToGPU()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| // Pipeline memory operations. |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUPipeliningPass(/*epiloguePeeling=*/false, pipelineDepth)); |
| } |
| |
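// Transpose pipeline: stages the data through shared memory and pads the
// allocation to reduce bank conflicts.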
| void addGPUTransposePassPipeline(OpPassManager &pm) { |
| tileAndDistributeToWorkgroup(pm); |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createWorkgroupSpecializationPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLLVMGPUTensorAlloc(GPUPromoteSharedMemPattern::TransposeOpPattern)); |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileTensor(false)); |
| |
| // Linalg -> vector |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createOptimizeVectorTransferPass()); |
| |
| // tensor to memref |
| addBufferizePasses(nestedModulePM); |
| |
  // Distribute foreach-thread regions onto GPU threads.
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUDistribute()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUDistributeSharedMemoryCopy()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
  // May or may not need to reduce shared memory bank conflicts.
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createGPUReduceSharedMemoryBankConflicts(/*paddingSizeBits=*/32)); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| } |
| |
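// Reduction pipeline: tiles the reduction, vectorizes it, and lowers vector
// reductions to warp-level GPU operations.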
| void addGPUWarpReductionPassPipeline(OpPassManager &pm) { |
| tileAndDistributeToWorkgroup(pm); |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRematerializeParallelOpsPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUTileReductionPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
| // Linalg -> vector |
| nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorizationPass( |
| /*generateContract=*/false, /*maxVectorSize=*/16384)); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLoopInvariantCodeMotionPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
| addBufferizePasses(nestedModulePM); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createOptimizeVectorTransferPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| memref::createFoldMemRefAliasOpsPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createLoopInvariantCodeMotionPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass()); |
| nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| |
  // Vector -> SIMT GPU + vector.
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createConvertVectorReductionToGPUPass()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| } |
| |
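// Fallback pipeline: bufferizes and distributes linalg ops onto threads
// without vectorization.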
| void addGPUSimpleDistributePassPipeline(OpPassManager &pm) { |
| tileAndBufferize(pm); |
| |
| auto &nestedModulePM = pm.nest<ModuleOp>(); |
| // Distribute linalg onto threads within the workgroup. |
| nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileAndDistribute()); |
| nestedModulePM.addPass(createCanonicalizerPass()); |
| nestedModulePM.addPass(createCSEPass()); |
| |
| nestedModulePM.addNestedPass<func::FuncOp>( |
| createRemoveSingleIterationLoopPass()); |
| } |
| |
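/// Lowers the bufferized module the rest of the way down to the NVVM or ROCDL
/// dialect, ready for translation to LLVM IR.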
| static void addLowerToLLVMGPUPasses(OpPassManager &pm, bool useROCM) { |
| pm.addPass(createCanonicalizerPass()); |
| pm.addPass(createCSEPass()); |
| |
| // LinalgExt -> SCF |
| pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass()); |
| |
| // Linalg -> SCF |
| pm.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass()); |
| pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass()); |
| pm.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| pm.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
  // Pad allocations with dynamic dimensions before lowering SCF and affine,
  // but after linalg lowering.
| pm.addNestedPass<func::FuncOp>(createPadDynamicAlloc()); |
| |
| pm.addPass(createLowerAffinePass()); |
| pm.addPass(createCanonicalizerPass()); |
| pm.addPass(createCSEPass()); |
| |
  // Handle tensor-type constants.
| pm.addPass(arith::createConstantBufferizePass()); |
| pm.addPass(createFoldTensorExtractOpPass()); |
| |
| pm.addNestedPass<func::FuncOp>(createLLVMGPUVectorLoweringPass()); |
| |
  // SCF -> CF
| pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass()); |
| pm.addNestedPass<func::FuncOp>(createCanonicalizerPass()); |
| pm.addNestedPass<func::FuncOp>(createCSEPass()); |
| |
  // Math dialect elementary functions -> polynomial form.
| pm.addNestedPass<func::FuncOp>(createPolynomialApproximationPass()); |
| |
| pm.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass()); |
| pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass()); |
| pm.addPass(memref::createExpandStridedMetadataPass()); |
| pm.addPass(createLowerAffinePass()); |
| pm.addPass(createGPULowerMemorySpaceAttributesPass()); |
  // Strip out the debug info for the kernel, as the CUDA driver doesn't digest
  // PTX debug info well.
| pm.addPass(createStripDebugInfoPass()); |
| if (useROCM) { |
    // Convert to ROCDL.
| pm.addPass(createConvertToROCDLPass()); |
| } else { |
    // Convert to NVVM.
| pm.addPass(createConvertToNVVMPass()); |
| } |
| } |
| |
| extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectFileName; |
| extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugPayloadTag; |
| extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugTransformTag; |
| |
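// Runs the transform dialect interpreter on the schedule selected by the flags
// above, then drops the schedule from the module.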
| void addGPUTransformDialectPasses(OpPassManager &passManager) { |
| // Give control to the transform dialect. |
| passManager.addPass( |
| mlir::iree_compiler::createTransformDialectInterpreterPass( |
| clGPUCodegenTransformDialectFileName, |
| clGPUCodegenTransformDialectDebugPayloadTag, |
| clGPUCodegenTransformDialectDebugTransformTag)); |
| |
| // Dropping the schedule is needed: |
  // 1. if we want to embed the transform in the module, we should drop the
  //    schedule once applied;
| // 2. if transform.do_not_dce_operands ops are introduced. |
| passManager.addPass(createDropSchedulePass()); |
| } |
| |
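// Top-level entry point that builds the full LLVMGPU lowering pipeline, from
// type propagation down to the final NVVM/ROCDL module.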
| void buildLLVMGPUTransformPassPipeline(OpPassManager &pm, bool useROCM) { |
| pm.nest<ModuleOp>().nest<func::FuncOp>().addPass(createTypePropagationPass()); |
| pm.nest<ModuleOp>().addPass(createBufferizeCopyOnlyDispatchesPass()); |
  // TODO: Remove the following pass and plumb support for the
  // #hal.descriptor_type memory space through the stack.
| pm.nest<ModuleOp>().addNestedPass<func::FuncOp>( |
| createEraseHALDescriptorTypeFromMemRefPass()); |
| pm.addPass(createLLVMGPULowerExecutableTargetPass()); |
| OpPassManager &nestedModulePM = pm.nest<ModuleOp>(); |
| //===--------------------------------------------------------------------===// |
| // Convert Linalg ops to LLVM+NVVM/ROCDL ops. |
| // |
| // Post-conditions: |
| // - All Linalg/Loops/GPU/Affine/Standard ops are converted away. |
| // - The module contains the final llvm.module ready to be serialized. |
| //===--------------------------------------------------------------------===// |
| addLowerToLLVMGPUPasses(nestedModulePM, useROCM); |
| |
| LLVM_DEBUG({ |
| llvm::dbgs() << "Using LLVMGPU pass pipeline:\n"; |
| pm.printAsTextualPipeline(llvm::dbgs()); |
| llvm::dbgs() << "\n"; |
| }); |
| } |
| |
| } // namespace iree_compiler |
| } // namespace mlir |