| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| //===- Passes.cpp - Pipelines from Linalg ops to SPIR-V -------------------===// |
| // |
| // This file contains various pipelines to lower IREE HAL executables containing |
| // Linalg ops to SPIR-V. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h" |
| |
| #include "iree-dialects/Dialect/LinalgTransform/Passes.h" |
| #include "iree/compiler/Codegen/Common/GPU/Passes.h" |
| #include "iree/compiler/Codegen/Common/Passes.h" |
| #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" |
| #include "iree/compiler/Codegen/SPIRV/Passes.h" |
| #include "iree/compiler/Codegen/Utils/GPUUtils.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "iree/compiler/Dialect/Util/Transforms/Passes.h" |
| #include "iree/compiler/Utils/PassUtils.h" |
| #include "llvm/ADT/STLForwardCompat.h" |
| #include "llvm/Support/Debug.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" |
| #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h" |
| #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.h" |
| #include "mlir/Conversion/TosaToArith/TosaToArith.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/MemRef/Transforms/Passes.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| #include "mlir/Dialect/SPIRV/Transforms/Passes.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Pass/PassOptions.h" |
| #include "mlir/Pass/PassRegistry.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #define DEBUG_TYPE "iree-spirv-lowering-pass-pipeline" |
| |
| namespace mlir::iree_compiler { |
| |
// Command-line flag selecting the bit width used for index types during the
// final conversion to SPIR-V (forwarded to createConvertToSPIRVPass below).
// Defaults to 32-bit indices.
static llvm::cl::opt<int> clSPIRVIndexingBits(
    "iree-spirv-index-bits",
    llvm::cl::desc("Set the bit width of indices in SPIR-V."),
    llvm::cl::init(32));
| |
| //===----------------------------------------------------------------------===// |
| // Bufferization Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static FailureOr<Value> gpuAllocateWorkgroupMemoryFn(OpBuilder &builder, |
| Location loc, |
| MemRefType memRefType, |
| ValueRange dynamicSizes, |
| unsigned alignment) { |
| auto workgroupSpace = gpu::AddressSpaceAttr::get( |
| builder.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace()); |
| MemRefType allocType = |
| MemRefType::get(memRefType.getShape(), memRefType.getElementType(), |
| AffineMap(), workgroupSpace); |
| auto allocOp = builder.create<memref::AllocOp>( |
| loc, allocType, dynamicSizes, builder.getI64IntegerAttr(alignment)); |
| return allocOp.getResult(); |
| } |
| |
| static FailureOr<Value> gpuAllocateFunctionMemoryFn(OpBuilder &builder, |
| Location loc, |
| MemRefType memRefType, |
| ValueRange dynamicSizes, |
| unsigned alignment) { |
| std::optional<unsigned> space = |
| spirv::mapVulkanStorageClassToMemorySpace(spirv::StorageClass::Function); |
| MemRefType allocType = MemRefType::get( |
| memRefType.getShape(), memRefType.getElementType(), {}, *space); |
| auto allocaOp = builder.create<memref::AllocaOp>( |
| loc, allocType, dynamicSizes, builder.getI64IntegerAttr(alignment)); |
| return allocaOp.getResult(); |
| } |
| |
| static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from, |
| Value to) { |
| auto fromType = llvm::cast<MemRefType>(from.getType()); |
| auto toType = llvm::cast<MemRefType>(to.getType()); |
| |
| bool needsBarrier = hasSharedMemoryAddressSpace(fromType) || |
| hasSharedMemoryAddressSpace(toType); |
| if (needsBarrier) |
| builder.create<gpu::BarrierOp>(loc); |
| Operation *copy = builder.create<memref::CopyOp>(loc, from, to); |
| if (needsBarrier) { |
| setMarker(copy, getCopyToWorkgroupMemoryMarker()); |
| builder.create<gpu::BarrierOp>(loc); |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Common Pass Recipes |
| //===----------------------------------------------------------------------===// |
| |
/// Tiles the dispatch to workgroups and distributes it, then converts the
/// result to destination-passing style so bufferization can write results in
/// place.
///
/// When `useFuseTensorPadWithConsumerPass` is set, tensor.pad ops are fused
/// into their consumers after distribution. `useWARForCooperativeMatrixCodegen`
/// is forwarded to the destination-passing-style conversion (a workaround used
/// by the cooperative-matrix pipelines).
static void addTileAndDistributeToWorkgroupsPasses(
    OpPassManager &funcPassManager,
    bool useFuseTensorPadWithConsumerPass = false,
    bool useWARForCooperativeMatrixCodegen = false) {
  funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
      kNumMaxParallelDims,
      linalg::DistributionMethod::CyclicNumProcsEqNumIters));
  funcPassManager.addPass(createCSEPass());
  if (useFuseTensorPadWithConsumerPass) {
    funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
  }
  funcPassManager.addPass(createConvertToDestinationPassingStylePass(
      useWARForCooperativeMatrixCodegen));
  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
}
| |
/// Adds passes to lower vector ops to meet SPIR-V requirements. Lowering
/// happens in two stages (initial and final), with a tensor insert/extract
/// slice cleanup in between.
void addSPIRVVectorLoweringPasses(OpPassManager &funcPassManager) {
  funcPassManager.addPass(createSPIRVInitialVectorLoweringPass());
  funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
  funcPassManager.addPass(createSPIRVFinalVectorLoweringPass());
}
| |
| static void addBufferizePasses(OpPassManager &funcPassManager, |
| BufferizationOptions::AllocationFn fn) { |
| BufferizationOptions::AllocationFn allocationFn = fn; |
| BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn; |
| addIREEComprehensiveBufferizePasses(funcPassManager, allocationFn, memcpyFn); |
| } |
| |
/// Bufferizes using the given allocation callback, then distributes scf.for
/// loops to GPU threads and cleans up the resulting buffer IR.
static void
addSPIRVBufferizePasses(OpPassManager &funcPassManager,
                        BufferizationOptions::AllocationFn allocationFn) {
  // Resolve dim ops first so that we don't have compute Linalg ops lingering on
  // because of dim op usage. This avoids bufferizing those compute ops just for
  // their shape dimensions.
  funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
  addBufferizePasses(funcPassManager, allocationFn);
  // Distribute immediately after bufferization to avoid losing attribute
  // annotations in subsequent transformations. This is a bit fragile right now
  // but we expect upstream for loops to eventually recognize distribution as a
  // first-class attribute then we don't need this.
  funcPassManager.addPass(createGPUDistributeScfForPass());
  funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
  funcPassManager.addPass(createCleanupBufferAllocViewPass());
}
| |
/// Adds passes to materialize structured ops as loops. This replaces structured
/// ops with loop nests containing payloads, so it should be invoked after
/// tiling and vectorization and before buffer transformations.
static void addLoopMaterializationPasses(OpPassManager &funcPassManager) {
  // Lower LinalgExt ops first, then memref.copy (as linalg) and remaining
  // Linalg ops to loops; finally drop trivial single-iteration loops and
  // clean up.
  funcPassManager.addPass(IREE::LinalgExt::createLinalgExtToLoopsPass());
  funcPassManager.addPass(createMemrefCopyToLinalgPass());
  funcPassManager.addPass(createConvertLinalgToLoopsPass());
  funcPassManager.addPass(createRemoveSingleIterationLoopPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
}
| |
| /// Adds passes to lowering MemRefs. This folds MemRef subviews, flattens n-D |
| /// MemRef into 1-D ones, vectorizes load/store when possible, and performs |
| /// cross loop nest optimizations. This should be invoked after structured op |
| /// lowering and before final SPIR-V conversion. |
| static void addMemRefLoweringPasses(OpPassManager &modulePassManager) { |
| FunctionLikeNest funcPassManager(modulePassManager); |
| |
| funcPassManager.addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass(createConvertComplexToStandardPass) |
| |
| // Math dialect elementry functions -> polynomial form. |
| .addPass(createPolynomialApproximationPass) |
| |
| .addPass(createPadDynamicAllocPass); |
| |
| // TODO: query this from the target. |
| auto getIndexBitwidth = [](mlir::FunctionOpInterface) { return 32; }; |
| funcPassManager |
| .addPass( |
| [&]() { return createGPUCheckResourceUsagePass(getIndexBitwidth); }) |
| |
| // Fold load/store from/to subview ops into the original memref when |
| // possible. In SPIR-V we don't use memref descriptor so it's not possible |
| // to handle subview ops. |
| .addPass(memref::createFoldMemRefAliasOpsPass) |
| .addPass(createEmulateNarrowTypePass) |
| .addPass(memref::createExpandOpsPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| |
| // Turn scalar load/store from memrefs into vectorized ones if possible. |
| // This gives better memory access patterns, which is very important for |
| // perf. |
| .addPass(createSPIRVVectorizeLoadStorePass) |
| // Perform optimizations that need to across the scf.for region boundary. |
| .addPass(createForOpCanonicalizationPass) |
| // Perform various vector-level cross-op optimizations like load-store |
| // forwarding, shape casting and casting op cancelling. |
| .addPass([&]() { return createOptimizeVectorTransferPass(); }) |
| .addPass(createSPIRVBreakDownLargeVectorPass) |
| |
| // Perform optimizations that need to across the scf.for region boundary. |
| .addPass(createForOpCanonicalizationPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass([&]() { return createOptimizeVectorTransferPass(); }); |
| |
| // Turn multi-dimension memref into one-dimension. This is needed for |
| // SPIR-V because we don't use upstream memref descriptors. |
| modulePassManager.addPass(createFlattenMemRefSubspanPass()); |
| |
| FunctionLikeNest(modulePassManager) |
| .addPass(createSPIRVEraseStorageBufferStaticShapePass); |
| } |
| |
| /// Adds passes to perform the final SPIR-V conversion. |
| static void addSPIRVLoweringPasses(OpPassManager &modulePassManager) { |
| FunctionLikeNest(modulePassManager) |
| .addPass(createPropagateDispatchSizeBoundsPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass(createLowerAffinePass) |
| .addPass([]() { |
| return IREE::Util::createOptimizeIntArithmeticPass( |
| IREE::Util::OptimizeIntArithmeticPassOptions{/*narrowToI32=*/true}); |
| }) |
| |
| // Lower ApplyScale before the i64 Emulation Pass so that new 64-bit ops |
| // are also emulated if not supported by the target. |
| .addPass([&]() { |
| return tosa::createTosaToArith(/*includeApplyRescale=*/true, |
| /*use32BitApplyRescale=*/true); |
| }) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass(createSPIRVMapMemRefStorageClassPass) |
| .addPass(createSPIRVEmulateI64Pass) |
| .addPass(createConvertBf16ArithToF32Pass) |
| .addPass(createConvertBf16ToUInt16BuffersPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass); |
| |
| modulePassManager.addPass(createSPIRVConvertGPUTargetPass()); |
| modulePassManager.addPass(createConvertToSPIRVPass(clSPIRVIndexingBits)); |
| |
| auto getTargetEnv = [](spirv::ModuleOp moduleOp) { |
| return moduleOp->getParentOfType<mlir::ModuleOp>() |
| ->getAttrOfType<spirv::TargetEnvAttr>(spirv::getTargetEnvAttrName()); |
| }; |
| |
| OpPassManager &spirvModulePassManager = |
| modulePassManager.nest<spirv::ModuleOp>(); |
| spirvModulePassManager.addPass( |
| spirv::createUnifyAliasedResourcePass(getTargetEnv)); |
| spirvModulePassManager.addPass(spirv::createSPIRVLowerABIAttributesPass()); |
| spirvModulePassManager.addPass(createCanonicalizerPass()); |
| spirvModulePassManager.addPass(createCSEPass()); |
| spirvModulePassManager.addPass(spirv::createSPIRVRewriteInsertsPass()); |
| spirvModulePassManager.addPass(spirv::createSPIRVCanonicalizeGLPass()); |
| spirvModulePassManager.addPass(spirv::createSPIRVUpdateVCEPass()); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Pass Pipelines |
| //===----------------------------------------------------------------------===// |
| |
/// Simplest lowering pipeline: no tiling or vectorization. Converts to
/// destination-passing style, bufferizes with workgroup-memory allocations,
/// and materializes all structured ops as loops.
void addSPIRVBaseLoweringPassPipeline(OpPassManager &funcPassManager) {
  funcPassManager.addPass(createConvertToDestinationPassingStylePass(
      /*useWARForCooperativeMatrixCodegen=*/false));
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  addLoopMaterializationPasses(funcPassManager);
}
| |
/// Distribution-only pipeline: tiles to workgroups, bufferizes with
/// workgroup-memory allocations, tiles/distributes to GPU invocations
/// (without vectorization), and materializes remaining ops as loops.
void addSPIRVBaseDistributePassPipeline(OpPassManager &funcPassManager) {
  addTileAndDistributeToWorkgroupsPasses(funcPassManager);

  addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);

  // Tile and distribute to GPU invocations.
  funcPassManager.addPass(createSPIRVTileAndDistributePass());
  funcPassManager.addPass(createMemrefCopyToLinalgPass());
  funcPassManager.addPass(createGPUDistributeSharedMemoryCopyPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  addLoopMaterializationPasses(funcPassManager);
}
| |
/// Vectorization pipeline: tiles to workgroups (fusing tensor.pad with
/// consumers), tiles to GPU invocations, vectorizes, lowers vectors for
/// SPIR-V, then bufferizes with function-private allocations and distributes.
void addSPIRVBaseVectorizePassPipeline(OpPassManager &funcPassManager) {
  addTileAndDistributeToWorkgroupsPasses(
      funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);

  funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
  funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());

  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Tile to GPU invocations and vectorize.
  funcPassManager.addPass(createGPUCreateFastSlowPathPass());
  funcPassManager.addPass(createGPUTilePass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
  {
    // High-level n-D mechanical vectorization; unrolling/lowering happens in
    // addSPIRVVectorLoweringPasses below.
    GenericVectorizationPassOptions options;
    options.vectorizeGatherAccesses = true;
    funcPassManager.addPass(createGenericVectorizationPass(options));
  }
  addSPIRVVectorLoweringPasses(funcPassManager);
  funcPassManager.addPass(createForOpCanonicalizationPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Bufferize and distribute.
  addSPIRVBufferizePasses(funcPassManager, gpuAllocateFunctionMemoryFn);

  // Generate loop nests for all remaining ops and remove trivial loops.
  addLoopMaterializationPasses(funcPassManager);

  // Perform various vector-level cross-op optimizations like load-store
  // forwarding, shape casting and casting op cancelling.
  funcPassManager.addPass(createOptimizeVectorTransferPass());
}
| |
/// Winograd-convolution variant of the vectorization pipeline: like
/// addSPIRVBaseVectorizePassPipeline, but decomposes Winograd transform ops
/// after tiling and annotates the Winograd loops instead of running the
/// fast/slow-path split.
void addSPIRVWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
  addTileAndDistributeToWorkgroupsPasses(
      funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);

  funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
  funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());

  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  funcPassManager.addPass(createGPUTilePass());
  funcPassManager.addPass(
      IREE::LinalgExt::createDecomposeWinogradTransformPass());
  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Tile to GPU invocations and vectorize.
  funcPassManager.addPass(createSPIRVAnnotateWinogradLoopsPass());
  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
  {
    // High-level n-D mechanical vectorization with cleanup enabled;
    // unrolling/lowering happens in addSPIRVVectorLoweringPasses below.
    GenericVectorizationPassOptions options;
    options.vectorizeGatherAccesses = true;
    options.enableCleanup = true;
    funcPassManager.addPass(createGenericVectorizationPass(options));
  }
  addSPIRVVectorLoweringPasses(funcPassManager);
  funcPassManager.addPass(createForOpCanonicalizationPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Bufferize and distribute.
  addSPIRVBufferizePasses(funcPassManager, gpuAllocateFunctionMemoryFn);

  // Generate loop nests for all remaining ops and remove trivial loops.
  addLoopMaterializationPasses(funcPassManager);

  // Perform various vector-level cross-op optimizations like load-store
  // forwarding, shape casting and casting op cancelling.
  funcPassManager.addPass(createOptimizeVectorTransferPass());
}
| |
| void addSPIRVCooperativeMatrixVectorizePassPipeline( |
| OpPassManager &funcPassManager, unsigned pipelineDepth, |
| unsigned storeStage) { |
| addTileAndDistributeToWorkgroupsPasses( |
| funcPassManager, /*useFuseTensorPadWithConsumerPass=*/false, |
| /*useWARForCooperativeMatrixCodegen=*/true); |
| |
| addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn); |
| |
| // Tile to GPU workgroups and promote. |
| funcPassManager.addPass( |
| createSPIRVTileAndPromotePass(SPIRVTileAndPromotePassOptions{ |
| /*promoteCMatrix=*/true, /*skipThreadLevel=*/true})); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| // Run canonicalization patterns to propagate constant shape sizes after |
| // removing trip-one loops. |
| funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| // Tile and distribute to GPU subgroups. |
| funcPassManager.addPass(createSPIRVTileToCooperativeOpsPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| // Multi-buffer depending on pipeline depth and distribute to shared memory. |
| if (pipelineDepth > 0) { |
| funcPassManager.addPass(createGPUMultiBufferingPass( |
| GPUMultiBufferingPassOptions{pipelineDepth + 1})); |
| } |
| funcPassManager.addPass(createMemrefCopyToLinalgPass()); |
| funcPassManager.addPass(createGPUDistributeSharedMemoryCopyPass()); |
| |
| // Reduce bank conflicts by padding. |
| { |
| GPUReduceBankConflictsPassOptions options = {}; |
| options.paddingBits = detail::bankConflictReductionPaddingBits; |
| funcPassManager.addPass(createGPUReduceBankConflictsPass(options)); |
| } |
| |
| // Performs high-level n-D mechanical vectorization. This does not perform |
| // unrolling or lowering, which is done later. |
| { |
| GenericVectorizationPassOptions options; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| } |
| |
| // With subview ops, vector hoisting won't kick in. So fold memref subview ops |
| // before performing vector unrolling and hoisting. |
| funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass()); |
| |
| // Vectorize to cooperative ops. |
| funcPassManager.addPass(createSPIRVVectorizeToCooperativeOpsPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| // Run canonicalization patterns to propagate constant shape sizes after |
| // removing trip-one loops. |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| // Perform various vector-level cross-op optimizations like load-store |
| // forwarding, shape casting and casting op cancelling. |
| funcPassManager.addPass(createOptimizeVectorTransferPass()); |
| |
| funcPassManager.addPass(createForOpCanonicalizationPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createSPIRVVectorToGPUSubgroupMMAPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| addSPIRVVectorLoweringPasses(funcPassManager); |
| |
| if (pipelineDepth > 0) { |
| PipeliningSchedulingStrategy schedule = |
| storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0 |
| : PipeliningSchedulingStrategy::loadGlobalStage0; |
| GPUPipeliningPassOptions pipelieningOptions = {}; |
| pipelieningOptions.epiloguePeeling = true; |
| pipelieningOptions.depth = pipelineDepth; |
| pipelieningOptions.scheduleIndex = llvm::to_underlying(schedule); |
| funcPassManager.addPass(createGPUPipeliningPass(pipelieningOptions)); |
| } |
| } |
| |
| void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &funcPassManager, |
| unsigned pipelineDepth, |
| unsigned storeStage) { |
| // Guards against 0 for consistency with older user provided tuning configs. |
| pipelineDepth = pipelineDepth ? pipelineDepth : 1; |
| LLVM_DEBUG(llvm::dbgs() << "Non-zero Pipeline Depth: " << pipelineDepth |
| << "\n";); |
| addTileAndDistributeToWorkgroupsPasses( |
| funcPassManager, /*useFuseTensorPadWithConsumerPass=*/false, |
| /*useWARForCooperativeMatrixCodegen=*/true); |
| |
| // Promote to workgroups and tile to threads. |
| funcPassManager.addPass(createGPUTensorTileToSerialLoopsPass()); |
| funcPassManager.addPass(createGPUTensorAlloc()); |
| funcPassManager.addPass(createGPUTensorTilePass()); |
| |
| // Performs high-level n-D mechanical vectorization. This does not perform |
| // unrolling or lowering, which is done later. |
| { |
| GenericVectorizationPassOptions options; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| options.enableCleanup = false; |
| options.maxVectorSize = 4096; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| } |
| |
| // Bufferize. |
| addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn); |
| |
| // Distribute scf.forall to GPU threads. |
| funcPassManager.addPass(createGPUDistributePass()); |
| |
| if (pipelineDepth > 1 || storeStage == 0) { |
| GPUMultiBufferingPassOptions multibufferingOptions = { |
| storeStage == 0 ? pipelineDepth + 1 : pipelineDepth}; |
| funcPassManager.addPass(createGPUMultiBufferingPass(multibufferingOptions)); |
| } |
| |
| funcPassManager.addPass(createMemrefCopyToLinalgPass()); |
| funcPassManager.addPass(createGPUDistributeSharedMemoryCopyPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| { |
| GPUReduceBankConflictsPassOptions options = {}; |
| options.paddingBits = detail::bankConflictReductionPaddingBits; |
| funcPassManager.addPass(createGPUReduceBankConflictsPass(options)); |
| } |
| |
| // With subview ops, vector hoisting won't kick in. So fold memref subview ops |
| // before performing vector unrolling and hoisting. |
| funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass()); |
| |
| funcPassManager.addPass(createSPIRVInitialVectorLoweringPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createSPIRVFinalVectorLoweringPass()); |
| |
| funcPassManager.addPass(createForOpCanonicalizationPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createOptimizeVectorTransferPass()); |
| |
| // Hoist loop invariant code to avoid pipelining it. |
| funcPassManager.addPass(createIREELoopInvariantCodeMotionPass()); |
| PipeliningSchedulingStrategy schedule = |
| storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0 |
| : PipeliningSchedulingStrategy::loadGlobalStage0; |
| GPUPipeliningPassOptions pipelieningOptions = {}; |
| pipelieningOptions.epiloguePeeling = true; |
| pipelieningOptions.depth = pipelineDepth; |
| pipelieningOptions.scheduleIndex = llvm::to_underlying(schedule); |
| funcPassManager.addPass(createGPUPipeliningPass(pipelieningOptions)); |
| |
| addLoopMaterializationPasses(funcPassManager); |
| } |
| |
/// Reduction pipeline using GPU subgroup operations: fuses producers into the
/// reduction, tiles the reduction, vectorizes (with masking), bufferizes with
/// workgroup-memory allocations, and distributes vector reductions to the GPU.
void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager) {
  addTileAndDistributeToWorkgroupsPasses(
      funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);

  // Fuse input parallel ops into the reduction op so that we don't need to
  // create temporary allocations during bufferization.
  funcPassManager.addPass(createRematerializeParallelOpsPass());
  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());

  funcPassManager.addPass(createGPUTileReductionPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Performs high-level n-D mechanical vectorization. This does not perform
  // unrolling or lowering, which is done later.
  {
    GenericVectorizationPassOptions options;
    options.enableVectorMasking = true;
    options.useConfiguredVectorSizes = false;
    options.vectorizePadding = true;
    options.vectorizeGatherAccesses = true;
    options.enableCleanup = false;
    options.generateContract = false;
    funcPassManager.addPass(createGenericVectorizationPass(options));
    funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
    funcPassManager.addPass(createCanonicalizerPass());
    funcPassManager.addPass(createCSEPass());
  }

  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());

  // Bufferize and distribute.
  // We bufferize before distributing to threads, so we are still at the
  // workgroup (block) level. Therefore, we need to allocate workgroup memory.
  addSPIRVBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);

  // Perform various vector-level cross-op optimizations like load-store
  // forwarding, shape casting and casting op cancelling.
  funcPassManager.addPass(createOptimizeVectorTransferPass());

  // Simplify the IR for vector distribution.
  funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
  funcPassManager.addPass(createForOpCanonicalizationPass());
  funcPassManager.addPass(createCanonicalizerPass());

  // Handle vector reduction operations specifically.
  funcPassManager.addPass(createConvertVectorReductionToGPUPass(
      /*expandSubgroupReduction=*/false));
  // Perform normal vector unrolling and lowering transformations. This breaks
  // vectors down to native machine size.
  addSPIRVVectorLoweringPasses(funcPassManager);
  funcPassManager.addPass(createCanonicalizerPass());
  funcPassManager.addPass(createCSEPass());
}
| |
| //===----------------------------------------------------------------------===// |
| // Entry Point |
| //===----------------------------------------------------------------------===// |
| |
/// Populates `modulePassManager` with the configuration phase of SPIR-V
/// codegen: per-function preprocessing followed by module-level user-config
/// materialization and lowering-strategy selection.
static void buildSPIRVCodegenConfigurationPassPipelineImpl(
    OpPassManager &modulePassManager) {
  {
    FunctionLikeNest funcPassManager(modulePassManager);
    funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
    addCommonTargetExecutablePreprocessingPasses(funcPassManager);
    addEncodingToNopPasses(funcPassManager);
  }
  modulePassManager.addPass(createMaterializeUserConfigsPass());
  modulePassManager.addPass(createSPIRVSelectLoweringStrategyPass());
}
| |
| void buildSPIRVCodegenConfigurationPassPipeline( |
| OpPassManager &variantPassManager) { |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| buildSPIRVCodegenConfigurationPassPipelineImpl(modulePassManager); |
| } |
| |
/// Builds the full SPIR-V codegen pipeline on the executable variant:
/// executable lowering and memref lowering at the module level, then
/// translation-info reconciliation and compiler-hint cleanup at the variant
/// level, and finally the SPIR-V dialect conversion.
void buildSPIRVCodegenPassPipeline(OpPassManager &variantPassManager) {
  {
    OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
    modulePassManager.addPass(
        createSPIRVLowerExecutableUsingTransformDialectPass());
    FunctionLikeNest(modulePassManager)
        .addPass(createSPIRVLowerExecutableTargetPass)
        .addPass(createVerifyWorkgroupDistributionPass);
    addMemRefLoweringPasses(modulePassManager);
  }
  variantPassManager.addPass(createReconcileTranslationInfoPass());
  variantPassManager.addPass(IREE::Util::createDropCompilerHintsPass());

  {
    OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
    addSPIRVLoweringPasses(modulePassManager);
  }

  // Dump the assembled pipeline when -debug-only=iree-spirv-lowering-pass-pipeline.
  LLVM_DEBUG({
    llvm::dbgs() << "Using SPIR-V pass pipeline:\n";
    variantPassManager.printAsTextualPipeline(llvm::dbgs());
    llvm::dbgs() << "\n";
  });
}
| |
// NOTE: this runs on the top-level program module containing all hal.executable
// ops.
void buildSPIRVLinkingPassPipeline(OpPassManager &modulePassManager) {
  auto &executablePassManager =
      modulePassManager.nest<IREE::HAL::ExecutableOp>();
  // Trim the allowed target environment (version/capability/extension/etc.) to
  // the minimal requirement needed by compiled spirv.module ops. This helps to
  // increase the chance of linking different variant ops together.
  executablePassManager.addNestedPass<IREE::HAL::ExecutableVariantOp>(
      createSPIRVTrimExecutableTargetEnvPass());
  // Materialize the minimal required target environment into proper device
  // queries to execute in the runtime.
  executablePassManager.addNestedPass<IREE::HAL::ExecutableVariantOp>(
      createSPIRVMaterializeExecutableConditionsPass());
  // Link together executables. This may produce some IR duplication.
  modulePassManager.addPass(createSPIRVLinkExecutablesPass());

  // Cleanup IR duplication introduced by linking.
  modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>(
      mlir::createCanonicalizerPass());
}
| |
| //===---------------------------------------------------------------------===// |
| // Register SPIR-V Passes |
| //===---------------------------------------------------------------------===// |
| |
namespace {
// Tablegen-generated pass registration; defines `registerPasses()` used below.
#define GEN_PASS_REGISTRATION
#include "iree/compiler/Codegen/SPIRV/Passes.h.inc"
} // namespace
| |
/// Registers all SPIR-V codegen passes and the two named pass pipelines with
/// the global MLIR pass registry.
void registerCodegenSPIRVPasses() {
  // Generated.
  registerPasses();

  static PassPipelineRegistration<> SPIRVConfigPipeline(
      "iree-codegen-spirv-configuration-pipeline",
      "Runs the pipeline for configuring the lowering from linalg to SPIR-V on "
      "all functions in a module",
      [](OpPassManager &modulePassManager) {
        buildSPIRVCodegenConfigurationPassPipelineImpl(modulePassManager);
      });

  static PassPipelineRegistration<> LinalgSPIRVPipeline(
      "iree-codegen-linalg-to-spirv-pipeline",
      "Runs the progressive lowering pipeline from linalg to SPIR-V",
      [](OpPassManager &variantPassManager) {
        buildSPIRVCodegenPassPipeline(variantPassManager);
      });
}
| |
| } // namespace mlir::iree_compiler |