| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree-dialects/Dialect/LinalgTransform/Passes.h" |
| #include "iree/compiler/Codegen/Common/CPU/Passes.h" |
| #include "iree/compiler/Codegen/Common/PassUtils.h" |
| #include "iree/compiler/Codegen/Common/Passes.h" |
| #include "iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" |
| #include "iree/compiler/Codegen/LLVMCPU/Passes.h" |
| #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h" |
| #include "iree/compiler/Dialect/Util/Transforms/Passes.h" |
| #include "iree/compiler/Utils/PassUtils.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" |
| #include "mlir/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.h" |
| #include "mlir/Conversion/ArmSMEToSCF/ArmSMEToSCF.h" |
| #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" |
| #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" |
| #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" |
| #include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" |
| #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" |
| #include "mlir/Dialect/Affine/Passes.h" |
| #include "mlir/Dialect/Arith/Transforms/Passes.h" |
| #include "mlir/Dialect/ArmSME/Transforms/Passes.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/MemRef/Transforms/Passes.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/IR/BuiltinTypeInterfaces.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #define DEBUG_TYPE "iree-llvmcpu-pass-pipelines" |
| |
| namespace mlir::iree_compiler { |
| |
| /// Command line options used purely for development purposes. Not to be relied |
| /// on in any way. |
| static llvm::cl::opt<bool> clFailOnOutOfBoundsStackAllocation( |
| "iree-llvmcpu-fail-on-out-of-bounds-stack-allocation", |
| llvm::cl::desc("fail if the upper bound of a dynamic stack allocation " |
| "cannot be resolved"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clFailOnLargeVector( |
| "iree-llvmcpu-fail-on-large-vector", |
| llvm::cl::desc("fail if there are operations with large vectors"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clCheckLinalgVectorization( |
| "iree-llvmcpu-check-linalg-vectorization", |
| llvm::cl::desc( |
| "Runs the pass to check if all the Linalg ops are vectorized"), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clUseFastMinMaxOps( |
| "iree-llvmcpu-use-fast-min-max-ops", |
| llvm::cl::desc("Use the faster `arith.minnumf/maxnumf` ops instead of " |
| "`arith.minimumf/maximumf` ops"), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clEnableReassociateFpReductions( |
| "iree-llvmcpu-reassociate-fp-reductions", |
| llvm::cl::desc("Enables reassociation for FP reductions"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clSkipIntermediateRoundings( |
| "iree-llvmcpu-skip-intermediate-roundings", |
| llvm::cl::desc( |
| "Allow skipping intermediate roundings. For example, in f16 matmul " |
| "kernels on targets with only f32 arithmetic, we have to perform each " |
| "multiply-accumulate in f32, and if this flag is false, then we have " |
| "to round those f32 accumulators to the nearest f16 every time, which " |
| "is slow."), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clInstrumentMemoryAccesses( |
| "iree-llvmcpu-instrument-memory-accesses", |
| llvm::cl::desc("Instruments memory accesses in dispatches when dispatch " |
| "instrumentation is enabled."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clUseSoftmaxInterFusion( |
| "iree-llvmcpu-use-decompose-softmax-fuse", |
| llvm::cl::desc("Enables inter-pass fusion for the DecomposeSoftmax pass."), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clEnableVectorContractCustomKernels( |
| "iree-llvmcpu-enable-vector-contract-custom-kernels", |
| llvm::cl::desc("Enables vector contract custom kernels for " |
| "LLVMCPUMmt4dVectorLowering pass."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clTileDispatchUsingForall( |
| "iree-llvmcpu-tile-dispatch-using-forall", |
| llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"), |
| llvm::cl::init(true)); |
| |
| // By default, IREE does not enable the Armv9-A streaming SVE mode in the |
| // presence of scalable vectors (even when using `+sme`), as currently there's |
| // no cost model for when it would be beneficial. This flag effectively makes |
| // IREE/LLVM switch from SVE to SSVE in dispatch regions with supported |
| // scalable vector operations. |
| static llvm::cl::opt<bool> clForceArmStreaming( |
| "iree-llvmcpu-force-arm-streaming", |
| llvm::cl::desc( |
| "Enables Armv9-A streaming SVE mode for any dispatch region that " |
| "contains supported scalable vector operations (i.e., use SSVE rather " |
| "than SVE). Requires the +sme feature flag."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clPatchFuncOps( |
| "iree-llvmcpu-debug-patch-func-ops", |
| llvm::cl::desc( |
| "Patch func ops for debugging purposes. It should be used together " |
| "with `--iree-codegen-debug-patched-func-ops-file-name`."), |
| llvm::cl::init(false), llvm::cl::Hidden); |
| |
| // TODO: Enable `TileDispatchUsingForall` for every pipeline. |
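| /// Tiles the dispatch and distributes it to workgroups, using scf.forall by |
| /// default or the legacy scf.for-based flow otherwise, followed by |
| /// canonicalization and pad-handling cleanups. No-op when distribution is |
| /// disabled in the pipeline options. |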
| static void |
| addTileAndDistributePasses(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| if (pipelineOpt.disableDistribution) { |
| return; |
| } |
| if (clTileDispatchUsingForall) { |
| funcPassManager.addPass( |
| createTileAndDistributeToWorkgroupsUsingForallOpPass()); |
| funcPassManager.addPass(createBufferizeDispatchTensorLoadStorePass()); |
| funcPassManager.addPass(createCombineLayoutTransformationPass()); |
| } else { |
| funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createConvertToDestinationPassingStylePass()); |
| funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass()); |
| } |
| funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Codegen pipelines. |
| //===---------------------------------------------------------------------===// |
| |
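| /// Progressively lowers vector ops towards LLVM-compatible forms: drops unit |
| /// dims, lowers "virtual" vector ops and vector transfers, lowers transposes |
| /// (optionally to AVX2-friendly sequences), and finally lowers |
| /// vector.shape_cast ops. |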
| void buildLLVMCPUVectorLoweringPipeline( |
| OpPassManager &funcPassManager, |
| const LLVMCPUVectorLoweringPassOptions &options) { |
| funcPassManager.addPass(createDropVectorUnitDimsPass()); |
| funcPassManager.addPass(createLLVMCPUVirtualVectorLoweringPass( |
| LLVMCPUVirtualVectorLoweringPassOptions{options.splitVectorTransfersTo, |
| options.enableArmI8mm})); |
| |
| // Make sure we remove redundant vector ops (e.g., vector transposes) before |
| // they are lowered and can no longer be optimized away. |
| funcPassManager.addPass(createCanonicalizerPass()); |
| |
| VectorTransferLoweringPassOptions transferLoweringOptions{}; |
| if (!options.enableArmSME) { |
| // The ArmSME dialect has its own (more specific) lowerings for scalable |
| // vectors that occur later in the pipeline, so only enable the general |
| // lowerings if SME is not available. |
| transferLoweringOptions.enableScalableLowerings = true; |
| } |
| funcPassManager.addPass( |
| createVectorTransferLoweringPass(transferLoweringOptions)); |
| funcPassManager.addPass(createLLVMCPUVectorTransposeLoweringPass( |
| LLVMCPUVectorTransposeLoweringPassOptions{ |
| options.lowerVectorTransposeToAVX2})); |
| |
| // Potentially removes redundant shape_cast and unit-dim broadcast ops before |
| // the shape_cast lowering. |
| funcPassManager.addPass(createCanonicalizerPass()); |
| |
| // 'vector.shape_cast' ops are very expensive, and some of the lowerings |
| // above (e.g., the transpose lowering) even generate them. There is a chance |
| // to cancel them out if they are not lowered too early, so we lower them at |
| // the very end of the pipeline. |
| funcPassManager.addPass(createLLVMCPUVectorShapeCastLoweringPass()); |
| } |
| |
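| /// Pipeline for dispatches that already operate on buffers (e.g., copy ops): |
| /// tiles the common parallel dimensions, peels, vectorizes, and lowers the |
| /// resulting vector ops. |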
| void addCPUBufferOpsTileAndVectorizePipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| // Skip tiling reduction loops because this pipeline is expected to apply to |
| // copy ops only. |
| funcPassManager.addPass(createLLVMCPUTilePass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false)); |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
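| /// Generic multi-level tiling pipeline: walks every tiling level present in |
| /// the lowering config and applies the matching tile-and-fuse (and |
| /// split-reduction) passes, then vectorizes, bufferizes, and lowers the |
| /// vector ops. |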
| void addMultiTilingExpertPassPipeline( |
| OpPassManager &funcPassManager, |
| IREE::Codegen::LoweringConfigAttrInterface loweringConfig, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| for (int i = 0, e = IREE::CPU::TilingLevel::MaxNumTileLevels; i < e; ++i) { |
| auto level = static_cast<IREE::CPU::TilingLevel>(i); |
| if (!loweringConfig.hasTilingLevel(level)) { |
| continue; |
| } |
| |
| switch (level) { |
| case IREE::CPU::TilingLevel::CacheParallelTiles: |
| case IREE::CPU::TilingLevel::VectorCommonParallelTiles: |
| funcPassManager.addPass( |
| createLLVMCPUTileAndFuseProducerConsumerPass(level)); |
| break; |
| case IREE::CPU::TilingLevel::CacheReductionTiles: |
| funcPassManager.addPass( |
| createLLVMCPUTileRootAndFuseInputOperandsPass(level)); |
| break; |
| case IREE::CPU::TilingLevel::VectorReductionTiles: |
| // Run SplitReductionPass before the final reduction tile-and-fuse pass, |
| // because SplitReductionPass takes care of banked tiling. |
| funcPassManager.addPass( |
| createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions)); |
| funcPassManager.addPass( |
| createLLVMCPUTileRootAndFuseInputOperandsPass(level)); |
| // Tile all the reduction ops for target vector sizes, which ensures |
| // that all the dimensions are tiled in all the reduction ops. The root |
| // op is already tiled, so it is skipped in the pass. |
| funcPassManager.addPass( |
| createLLVMCPUTilePass(level, /*skipRootOp=*/true)); |
| break; |
| case IREE::CPU::TilingLevel::VectorInnerParallelTiles: |
| case IREE::CPU::TilingLevel::DistributionTiles: |
| case IREE::CPU::TilingLevel::MaxNumTileLevels: |
| case IREE::CPU::TilingLevel::InvalidLevel: |
| continue; |
| } |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| } |
| |
| // The `VectorInnerParallelTiles` level models the tiling and fusion of |
| // dimensions that are not captured by the root op, i.e., the root op may not |
| // have a config for this level. Thus, we use the last operation that has |
| // this tiling level as the anchor. |
| funcPassManager.addPass(createLLVMCPUTileLastOpAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorInnerParallelTiles)); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| funcPassManager.addPass(createForallToForPass()); |
| if (pipelineOpt.enablePeeling) { |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| } |
| |
| if (pipelineOpt.enableAArch64SME) { |
| funcPassManager.addPass(createLLVMCPU2DScalableTo1DScalablePass()); |
| } |
| |
| { |
| funcPassManager.addPass(createTensorToVectorVectorizePadPass()); |
| if (pipelineOpt.decomposePackUnPackOps) { |
| funcPassManager.addPass(createDecomposePackUnPackOpsPass()); |
| funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| } |
| |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
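| /// Convolution pipeline: tiles parallel and reduction dimensions, decomposes |
| /// convolutions to lower-dimensional ops, optionally peels, vectorizes, |
| /// bufferizes, and lowers the vector ops. |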
| void addConvTileAndDecomposeExpertPassPipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| // Convert forall to for before vectorization preparation. |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| if (pipelineOpt.enablePeeling) { |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| } |
| |
| { |
| funcPassManager.addPass(createTensorToVectorVectorizePadPass()); |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| // Eliminate redundant transfer_read/write to avoid stack allocations. |
| funcPassManager.addPass(createOptimizeVectorTransferPass( |
| OptimizeVectorTransferPassOptions{/*flatten=*/true})); |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "shuffle"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
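| /// Mmt4d pipeline: tiles, optionally lowers to mmt4d ukernel calls, |
| /// vectorizes, bufferizes, and runs the mmt4d-specific and generic vector |
| /// lowerings. |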
| void addMmt4dTilingExpertPassPipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| // The two passes below are no-ops if "mmt4d" is explicitly excluded from the |
| // ukernels attribute. |
| funcPassManager.addPass(createCPUPrepareUkernelsPass()); |
| funcPassManager.addPass( |
| createCPULowerToUKernelsPass(clSkipIntermediateRoundings)); |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Vector lowering of Mmt4d. |
| funcPassManager.addPass(createLLVMCPUMmt4dVectorLoweringPass( |
| LLVMCPUMmt4dVectorLoweringPassOptions{ |
| clEnableVectorContractCustomKernels})); |
| |
| // Generic vector lowering. |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| |
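| /// Data-tiling pipeline: optionally lowers pack/unpack to ukernel calls, |
| /// tiles the common parallel dimensions, decomposes pack/unpack ops, |
| /// vectorizes, bufferizes, and lowers the vector ops. |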
| void addCPUDataTilingPipeline(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| // The two passes below are no-ops if pack/unpack is not specified in the |
| // ukernels attribute. By default, they are disabled. |
| funcPassManager.addPass(createCPUPrepareUkernelsPass()); |
| funcPassManager.addPass( |
| createCPULowerToUKernelsPass(clSkipIntermediateRoundings)); |
| |
| funcPassManager.addPass(createLLVMCPUTilePass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false)); |
| if (pipelineOpt.decomposePackUnPackOps) { |
| funcPassManager.addPass(createDecomposePackUnPackOpsPass()); |
| } |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.vectorizePadding = true; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
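| /// Pipeline for LinalgExt-based dispatches (e.g., attention, winograd): |
| /// tiles, decomposes the LinalgExt ops, vectorizes, bufferizes, and lowers |
| /// the vector ops. |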
| void addCPULinalgExtTileAndVectorizePipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| funcPassManager.addPass( |
| IREE::LinalgExt::createConvertAttentionToOnlineAttentionPass()); |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass( |
| IREE::LinalgExt::createDecomposeWinogradTransformPass()); |
| funcPassManager.addPass(IREE::LinalgExt::createDecomposeAttentionPass()); |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
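| /// Fallback pipeline: distributes, tiles the last op at the common parallel |
| /// level, and bufferizes. No vectorization is performed. |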
| void addCPUDefaultPassPipeline(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| funcPassManager.addPass(createLLVMCPUTileLastOpAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| addCPUBufferizePasses(funcPassManager); |
| } |
| |
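| /// Lowers the bufferized module to the LLVM dialect: lowers remaining |
| /// LinalgExt/Linalg ops to loops, handles math approximations and bf16 |
| /// emulation, optionally legalizes for ArmSME, lowers vector transfers and |
| /// SCF to CF, and finally converts everything to LLVM and reconciles |
| /// unrealized casts. |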
| static void addLowerToLLVMPasses(OpPassManager &modulePassManager, |
| bool enableAArch64SME) { |
| // TODO: Remove the following pass and plumb support for #hal.descriptor_type |
| // memory space through the stack. |
| FunctionLikeNest(modulePassManager) |
| .addPass(createEraseHALDescriptorTypeFromMemRefPass); |
| |
| // Lower `ukernel.*` ops to function calls. |
| modulePassManager.addPass(createLowerUKernelOpsToCallsPass()); |
| |
| FunctionLikeNest(modulePassManager) |
| // LinalgExt -> SCF |
| .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass) |
| // Linalg -> SCF |
| .addPass(createMemrefCopyToLinalgPass) |
| .addPredicatedPass(clCheckLinalgVectorization, |
| createLLVMCPUEmitVectorizationRemarksPass) |
| .addPass(createConvertLinalgToLoopsPass) |
| .addPass(createConvertBf16ArithToF32Pass) |
| .addPass(createConvertBf16ToUInt16BuffersPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass); |
| |
| // Handle tensor-type constants. |
| modulePassManager.addPass(createIREEBufferizeConstantsPass()); |
| |
| FunctionLikeNest(modulePassManager) |
| .addPass(createFoldTensorExtractOpPass) |
| // Handle complex operation conversion. |
| .addPass(createConvertComplexToStandardPass) |
| // Math dialect ops rewrites, approximations, casts. |
| .addPass(createMathTransformPass) |
| .addPass(createHoistStaticallyBoundAllocationsPass) |
| // Use the faster `arith.minnumf/maxnumf` ops instead of |
| // `arith.minimumf/maximumf`. |
| .addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass); |
| |
| if (enableAArch64SME) { |
| modulePassManager.addPass(mlir::arm_sme::createVectorLegalizationPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPredicatedPass( |
| clForceArmStreaming, |
| [] { |
| // 1. Enable Armv9-A streaming mode without ZA (i.e., SSVE) for |
| // dispatch regions that contain scalable vectors when forced via |
| // the --iree-llvmcpu-force-arm-streaming flag. |
| return mlir::arm_sme::createEnableArmStreamingPass( |
| mlir::arm_sme::ArmStreamingMode::StreamingLocally, |
| mlir::arm_sme::ArmZaMode::Disabled, |
| /*ifRequiredByOps=*/false, |
| /*ifContainsScalableVectors=*/true); |
| }) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass(mlir::createArithToArmSMEConversionPass) |
| .addPass(mlir::createConvertVectorToArmSMEPass) |
| .addPass([] { |
| // 2. Enable ZA for dispatch regions that contain ArmSME ops (which |
| // all make use of the ZA state). |
| return mlir::arm_sme::createEnableArmStreamingPass( |
| mlir::arm_sme::ArmStreamingMode::StreamingLocally, |
| mlir::arm_sme::ArmZaMode::NewZA, |
| /*ifRequiredByOps=*/true); |
| }) |
| .addPass(mlir::createConvertArmSMEToSCFPass); |
| } |
| |
| VectorTransferLoweringPassOptions transferLoweringOptions; |
| if (!enableAArch64SME) { |
| // The ArmSME dialect has its own (more specific) lowerings for scalable |
| // vectors that occur later in the pipeline, so only enable the general |
| // lowerings if SME is not available. |
| transferLoweringOptions.enableScalableLowerings = true; |
| } |
| |
| FunctionLikeNest(modulePassManager) |
| // All structural buffer manipulations must conclude before this point. |
| |
| // The subview folding does not handle potentially out-of-bounds |
| // vector.transfer_read and vector.transfer_write ops, so lower them to |
| // loads and stores here. |
| .addPass([&]() { |
| return createVectorTransferLoweringPass(transferLoweringOptions); |
| }) |
| .addPass(memref::createFoldMemRefAliasOpsPass) |
| .addPass(createIREEExpandStridedMetadataPass) |
| .addPass(createCleanupBufferAllocViewPass) |
| // It is easier to check stack allocations before converting to the CF |
| // dialect. |
| .addPass([&]() { |
| return createLLVMCPUCheckIRBeforeLLVMConversionPass( |
| LLVMCPUCheckIRBeforeLLVMConversionPassOptions{ |
| clFailOnOutOfBoundsStackAllocation}); |
| }) |
| // SCF -> CF |
| .addPass(createSCFToControlFlowPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| // (HAL, IREE, Linalg, CF) -> LLVM |
| .addPass(memref::createFoldMemRefAliasOpsPass) |
| .addPass(affine::createAffineExpandIndexOpsPass) |
| .addPass([&]() { |
| arith::ArithExpandOpsPassOptions options; |
| options.includeBf16 = true; |
| options.includeF4E2M1 = true; |
| options.includeF8E8M0 = true; |
| return arith::createArithExpandOpsPass(options); |
| }) |
| .addPass(createEmulateNarrowTypePass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPredicatedPass(clInstrumentMemoryAccesses, |
| createInstrumentMemoryAccessesPass); |
| |
| if (enableAArch64SME) { |
| FunctionLikeNest(modulePassManager).addPass([&] { |
| return createConvertArmSMEToLLVMPass(); |
| }); |
| } |
| modulePassManager.addPass( |
| createConvertToLLVMPass(clEnableReassociateFpReductions)); |
| modulePassManager.addPass(createReconcileUnrealizedCastsPass()); |
| |
| // We rely on MLIR symbol visibility being correct after this point and need |
| // to mirror the LLVM linkage that was assigned during conversion. |
| modulePassManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass()); |
| |
| modulePassManager.addPass(createCanonicalizerPass()); |
| modulePassManager.addPass(createCSEPass()); |
| modulePassManager.addNestedPass<LLVM::LLVMFuncOp>( |
| createAddFastMathFlagsPass()); |
| } |
| |
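| /// Configuration pipeline: runs the common target preprocessing passes, |
| /// materializes user configs and device encodings, and selects the lowering |
| /// strategy for each function. |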
| void buildLLVMCPUCodegenConfigurationPassPipelineImpl( |
| OpPassManager &modulePassManager) { |
| { |
| FunctionLikeNest funcPassManager(modulePassManager); |
| addCommonTargetExecutablePreprocessingPasses(funcPassManager, |
| clUseSoftmaxInterFusion); |
| } |
| modulePassManager.addPass(createMaterializeUserConfigsPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPass(createRematerializeParallelOpsPass) |
| // TODO(#13888): This pass (createExpandF16OpToF32Pass) is being added way |
| // too late and should instead be done during lowering to LLVM. |
| .addPass(createExpandF16OpToF32Pass) |
| .addPass(createMaterializeDeviceEncodingPass) |
| .addPass(createCPUPropagateDataLayoutPass) |
| .addPass(createConvertAccGEMMToGEMMPass) |
| // TODO: Remove the following pass and plumb support for |
| // #hal.descriptor_type memory space through the stack. |
| .addPass(createEraseHALDescriptorTypeFromMemRefPass); |
| |
| modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass()); |
| LLVM_DEBUG({ |
| llvm::dbgs() << "LLVMCPU codegen configuration pass pipeline:\n"; |
| modulePassManager.printAsTextualPipeline(llvm::dbgs()); |
| llvm::dbgs() << "\n"; |
| }); |
| } |
| |
| void buildLLVMCPUCodegenConfigurationPassPipeline( |
| OpPassManager &variantPassManager) { |
| variantPassManager.addPass(createSpecializeExportsPass()); |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| buildLLVMCPUCodegenConfigurationPassPipelineImpl(modulePassManager); |
| } |
| |
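| /// Lowers each executable variant using the selected strategy (or a |
| /// transform dialect script), reconciles translation info, and then lowers |
| /// the module to the LLVM dialect. |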
| void buildLLVMCPUCodegenPassPipeline(OpPassManager &variantPassManager, |
| bool enableAArch64SME) { |
| |
| { |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPass(createLLVMCPULowerExecutableTargetPass) |
| .addPass(createVerifyWorkgroupDistributionPass); |
| if (clPatchFuncOps) { |
| modulePassManager.addPass(createPatchFuncOpsPass()); |
| } |
| } |
| |
| variantPassManager.addPass(createReconcileTranslationInfoPass()); |
| variantPassManager.addPass(createLowerAffinePass()); |
| variantPassManager.addPass(IREE::Util::createDropCompilerHintsPass()); |
| |
| // Run conversion to LLVM at `ModuleOp` granularity. |
| { |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| addLowerToLLVMPasses(modulePassManager, enableAArch64SME); |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "LLVMCPU codegen pass pipeline:\n"; |
| variantPassManager.printAsTextualPipeline(llvm::dbgs()); |
| llvm::dbgs() << "\n"; |
| }); |
| } |
| |
| // NOTE: this runs on the top-level program module containing all |
| // hal.executable ops. |
| void buildLLVMCPULinkingPassPipeline(OpPassManager &modulePassManager, |
| std::optional<std::string> target) { |
| // Link together executables. This may produce some IR duplication. |
| LLVMCPULinkExecutablesPassOptions linkOptions; |
| linkOptions.target = target.value_or(""); |
| modulePassManager.addPass(createLLVMCPULinkExecutablesPass(linkOptions)); |
| |
| // Cleanup IR duplication. |
| modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>( |
| mlir::createCanonicalizerPass()); |
| |
| // Assign final executable constant and import ordinals. |
| auto &variantPassManager = modulePassManager.nest<IREE::HAL::ExecutableOp>() |
| .nest<IREE::HAL::ExecutableVariantOp>(); |
| variantPassManager.addPass(createLLVMCPUAssignConstantOrdinalsPass()); |
| variantPassManager.addPass(createLLVMCPUAssignImportOrdinalsPass()); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Register LLVMCPU Passes |
| //===---------------------------------------------------------------------===// |
| |
| namespace { |
| #define GEN_PASS_REGISTRATION |
| #include "iree/compiler/Codegen/LLVMCPU/Passes.h.inc" |
| } // namespace |
| |
| void registerCodegenLLVMCPUPasses() { |
| // Generated. |
| registerPasses(); |
| |
| static PassPipelineRegistration<> LLVMCPUConfigPipeline( |
| "iree-codegen-llvmcpu-configuration-pipeline", |
| "Runs the translation strategy configuration pipeline on Linalg for CPU", |
| [](OpPassManager &modulePassManager) { |
| buildLLVMCPUCodegenConfigurationPassPipelineImpl(modulePassManager); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPUBufferizationPipeline( |
| "iree-codegen-llvmcpu-bufferization-pipeline", |
| "Runs the bufferization pipeline for CPU", |
| [](OpPassManager &funcPassManager) { |
| addCPUBufferizePasses(funcPassManager); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPUVectorLoweringPipeline( |
| "iree-codegen-llvmcpu-vector-lowering-pipeline", |
| "Runs the LLVMCPU vector lowering pipeline", |
| [](OpPassManager &funcPassManager) { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| }); |
| |
| struct LinalgToLLVMPipelineOptions |
| : public PassPipelineOptions<LinalgToLLVMPipelineOptions> { |
| Option<bool> enableArmSME{ |
| *this, "enable-arm-sme", |
| llvm::cl::desc("Enable the ArmSME lowering pipeline.")}; |
| }; |
| |
| static PassPipelineRegistration<LinalgToLLVMPipelineOptions> |
| LinalgLLVMPipeline( |
| "iree-codegen-linalg-to-llvm-pipeline", |
| "Runs the progressive lowering pipeline from Linalg to LLVM", |
| [](OpPassManager &variantPassManager, |
| LinalgToLLVMPipelineOptions const &options) { |
| buildLLVMCPUCodegenPassPipeline(variantPassManager, |
| options.enableArmSME); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPULinkingPipeline( |
| "iree-codegen-llvmcpu-linking-pipeline", |
| "Runs the LLVMCPU HAL executable linking pipeline", |
| [](OpPassManager &modulePassManager) { |
| buildLLVMCPULinkingPassPipeline(modulePassManager); |
| }); |
| } |
| |
| } // namespace mlir::iree_compiler |