| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree-dialects/Dialect/LinalgTransform/Passes.h" |
| #include "iree/compiler/Codegen/Common/CPU/Passes.h" |
| #include "iree/compiler/Codegen/Common/PassUtils.h" |
| #include "iree/compiler/Codegen/Common/Passes.h" |
| #include "iree/compiler/Codegen/Dialect/CPU/IR/IREECPUTypes.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" |
| #include "iree/compiler/Codegen/LLVMCPU/Passes.h" |
| #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h" |
| #include "iree/compiler/Dialect/Util/Transforms/Passes.h" |
| #include "iree/compiler/Utils/PassUtils.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" |
| #include "mlir/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.h" |
| #include "mlir/Conversion/ArmSMEToSCF/ArmSMEToSCF.h" |
| #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" |
| #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" |
| #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" |
| #include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" |
| #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" |
| #include "mlir/Dialect/Affine/Passes.h" |
| #include "mlir/Dialect/Arith/Transforms/Passes.h" |
| #include "mlir/Dialect/ArmSME/Transforms/Passes.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/MemRef/Transforms/Passes.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/IR/BuiltinTypeInterfaces.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| #define DEBUG_TYPE "iree-llvmcpu-pass-pipelines" |
| |
| namespace mlir::iree_compiler { |
| |
| /// Command line options used purely for development purposes. Not to be relied |
| /// on in any way. |
| static llvm::cl::opt<bool> clFailOnOutOfBoundsStackAllocation( |
| "iree-llvmcpu-fail-on-out-of-bounds-stack-allocation", |
| llvm::cl::desc("fail if the upper bound of a dynamic stack allocation " |
| "cannot be resolved"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clFailOnLargeVector( |
| "iree-llvmcpu-fail-on-large-vector", |
| llvm::cl::desc("fail if there are operations with large vectors"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clCheckLinalgVectorization( |
| "iree-llvmcpu-check-linalg-vectorization", |
| llvm::cl::desc( |
| "Runs the pass to check if all the Linalg ops are vectorized"), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clUseFastMinMaxOps( |
| "iree-llvmcpu-use-fast-min-max-ops", |
| llvm::cl::desc("Use the faster `arith.minnumf/maxnumf` ops instead of " |
| "`arith.minimumf/maximumf` ops"), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clEnableReassociateFpReductions( |
| "iree-llvmcpu-reassociate-fp-reductions", |
| llvm::cl::desc("Enables reassociation for FP reductions"), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clSkipIntermediateRoundings( |
| "iree-llvmcpu-skip-intermediate-roundings", |
| llvm::cl::desc( |
| "Allow skipping intermediate roundings. For example, in f16 matmul " |
| "kernels on targets with only f32 arithmetic, we have to perform each " |
| "multiply-accumulate in f32, and if this flag is false, then we have " |
| "to round those f32 accumulators to the nearest f16 every time, which " |
| "is slow."), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clInstrumentMemoryAccesses( |
| "iree-llvmcpu-instrument-memory-accesses", |
| llvm::cl::desc("Instruments memory accesses in dispatches when dispatch " |
| "instrumentation is enabled."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clUseSoftmaxInterFusion( |
| "iree-llvmcpu-use-decompose-softmax-fuse", |
| llvm::cl::desc("Enables inter-pass fusion for the DecomposeSoftmax pass."), |
| llvm::cl::init(true)); |
| |
| static llvm::cl::opt<bool> clEnableVectorContractCustomKernels( |
| "iree-llvmcpu-enable-vector-contract-custom-kernels", |
| llvm::cl::desc("Enables vector contract custom kernels for " |
| "LLVMCPUMmt4dVectorLowering pass."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clTileDispatchUsingForall( |
| "iree-llvmcpu-tile-dispatch-using-forall", |
| llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"), |
| llvm::cl::init(true)); |
| |
| // By default, IREE does not enable the Armv9-A streaming SVE mode in the |
| // presence of scalable vectors (even when using `+sme`), as currently there's |
| // no cost model for when it would be beneficial. This flag effectively makes |
| // IREE/LLVM switch from SVE to SSVE in dispatch regions with supported |
| // scalable vector operations. |
| static llvm::cl::opt<bool> clForceArmStreaming( |
| "iree-llvmcpu-force-arm-streaming", |
| llvm::cl::desc( |
| "Enables Armv9-A streaming SVE mode for any dispatch region that " |
| "contains supported scalable vector operations (i.e., use SSVE rather " |
| "than SVE). Requires the +sme feature flag."), |
| llvm::cl::init(false)); |
| |
| static llvm::cl::opt<bool> clPatchFuncOps( |
| "iree-llvmcpu-debug-patch-func-ops", |
| llvm::cl::desc( |
| "Patch func ops for debugging purposes. It should be used together " |
| "with `--iree-codegen-debug-patched-func-ops-file-name`."), |
| llvm::cl::init(false), llvm::cl::Hidden); |
| |
| // TODO: Enable `TileDispatchUsingForall` for every pipeline. |
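| /// Tiles the dispatch and distributes it to workgroups, using scf.forall by |
| /// default or the legacy scf.for-based flow otherwise, followed by |
| /// canonicalization and pad-handling cleanups. No-op when distribution is |
| /// disabled in the pipeline options. |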
| static void |
| addTileAndDistributePasses(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| if (pipelineOpt.disableDistribution) { |
| return; |
| } |
| if (clTileDispatchUsingForall) { |
| funcPassManager.addPass( |
| createTileAndDistributeToWorkgroupsUsingForallOpPass()); |
| funcPassManager.addPass(createBufferizeDispatchTensorLoadStorePass()); |
| funcPassManager.addPass(createCombineLayoutTransformationPass()); |
| } else { |
| funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createConvertToDestinationPassingStylePass()); |
| funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass()); |
| } |
| funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Codegen pipelines. |
| //===---------------------------------------------------------------------===// |
| |
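| /// Progressively lowers vector ops towards LLVM-compatible forms: drops unit |
| /// dims, lowers "virtual" vector ops and vector transfers, lowers transposes |
| /// (optionally to AVX2-friendly sequences), and finally lowers |
| /// vector.shape_cast ops. |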
| void buildLLVMCPUVectorLoweringPipeline( |
| OpPassManager &funcPassManager, |
| const LLVMCPUVectorLoweringPassOptions &options) { |
| funcPassManager.addPass(createDropVectorUnitDimsPass()); |
| funcPassManager.addPass(createLLVMCPUVirtualVectorLoweringPass( |
| LLVMCPUVirtualVectorLoweringPassOptions{options.splitVectorTransfersTo, |
| options.enableArmI8mm})); |
| |
| // Make sure we remove redundant vector ops (e.g., vector transposes) before |
| // they are lowered and can no longer be optimized away. |
| funcPassManager.addPass(createCanonicalizerPass()); |
| |
| VectorTransferLoweringPassOptions transferLoweringOptions{}; |
| if (!options.enableArmSME) { |
| // The ArmSME dialect has its own (more specific) lowerings for scalable |
| // vectors that occur later in the pipeline, so only enable the general |
| // lowerings if SME is not available. |
| transferLoweringOptions.enableScalableLowerings = true; |
| } |
| funcPassManager.addPass( |
| createVectorTransferLoweringPass(transferLoweringOptions)); |
| funcPassManager.addPass(createLLVMCPUVectorTransposeLoweringPass( |
| LLVMCPUVectorTransposeLoweringPassOptions{ |
| options.lowerVectorTransposeToAVX2})); |
| |
| // Potentially removes redundant shape_cast and unit-dim broadcast ops before |
| // the shape_cast lowering. |
| funcPassManager.addPass(createCanonicalizerPass()); |
| |
| // 'vector.shape_cast' ops are very expensive, and some of the lowerings |
| // above (e.g., the transpose lowering) even generate them. There is a chance |
| // to cancel them out if they are not lowered too early, so we lower them at |
| // the very end of the pipeline. |
| funcPassManager.addPass(createLLVMCPUVectorShapeCastLoweringPass()); |
| } |
| |
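| /// Pipeline for dispatches that already operate on buffers (e.g., copy ops): |
| /// tiles the common parallel dimensions, peels, vectorizes, and lowers the |
| /// resulting vector ops. |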
| void addCPUBufferOpsTileAndVectorizePipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| // Skip tiling reduction loops because this pipeline is expected to apply to |
| // copy ops only. |
| funcPassManager.addPass(createLLVMCPUTilePass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false)); |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
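| /// Generic multi-level tiling pipeline: walks every tiling level present in |
| /// the lowering config and applies the matching tile-and-fuse (and |
| /// split-reduction) passes, then vectorizes, bufferizes, and lowers the |
| /// vector ops. |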
| void addMultiTilingExpertPassPipeline( |
| OpPassManager &funcPassManager, |
| IREE::Codegen::LoweringConfigAttrInterface loweringConfig, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| for (int i = 0, e = IREE::CPU::TilingLevel::MaxNumTileLevels; i < e; ++i) { |
| auto level = static_cast<IREE::CPU::TilingLevel>(i); |
| if (!loweringConfig.hasTilingLevel(level)) { |
| continue; |
| } |
| |
| switch (level) { |
| case IREE::CPU::TilingLevel::CacheParallelTiles: |
| case IREE::CPU::TilingLevel::VectorCommonParallelTiles: |
| funcPassManager.addPass( |
| createLLVMCPUTileAndFuseProducerConsumerPass(level)); |
| break; |
| case IREE::CPU::TilingLevel::CacheReductionTiles: |
| funcPassManager.addPass( |
| createLLVMCPUTileRootAndFuseInputOperandsPass(level)); |
| break; |
| case IREE::CPU::TilingLevel::VectorReductionTiles: |
| // Run SplitReductionPass before the final reduction tile-and-fuse pass, |
| // because SplitReductionPass takes care of banked tiling. |
| funcPassManager.addPass( |
| createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions)); |
| funcPassManager.addPass( |
| createLLVMCPUTileRootAndFuseInputOperandsPass(level)); |
| // Tile all the reduction ops for target vector sizes, which ensures |
| // that all the dimensions are tiled in all the reduction ops. The root |
| // op is already tiled, so it is skipped in the pass. |
| funcPassManager.addPass( |
| createLLVMCPUTilePass(level, /*skipRootOp=*/true)); |
| break; |
| case IREE::CPU::TilingLevel::VectorInnerParallelTiles: |
| case IREE::CPU::TilingLevel::DistributionTiles: |
| case IREE::CPU::TilingLevel::MaxNumTileLevels: |
| case IREE::CPU::TilingLevel::InvalidLevel: |
| continue; |
| } |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| } |
| |
| // The `VectorInnerParallelTiles` level models the tiling and fusion of |
| // dimensions that are not captured by the root op, i.e., the root op may not |
| // have a config for this level. Thus, we use the last operation that has |
| // this tiling level as the anchor. |
| funcPassManager.addPass(createLLVMCPUTileLastOpAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorInnerParallelTiles)); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| funcPassManager.addPass(createForallToForPass()); |
| if (pipelineOpt.enablePeeling) { |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| } |
| |
| if (pipelineOpt.enableAArch64SME) { |
| funcPassManager.addPass(createLLVMCPU2DScalableTo1DScalablePass()); |
| } |
| |
| { |
| funcPassManager.addPass(createTensorToVectorVectorizePadPass()); |
| if (pipelineOpt.decomposePackUnPackOps) { |
| funcPassManager.addPass(createDecomposePackUnPackOpsPass()); |
| funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| } |
| |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
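| /// Convolution pipeline: tiles parallel and reduction dimensions, decomposes |
| /// convolutions to lower-dimensional ops, optionally peels, vectorizes, |
| /// bufferizes, and lowers the vector ops. |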
| void addConvTileAndDecomposeExpertPassPipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); |
| funcPassManager.addPass(createFuseTensorPadWithConsumerPass()); |
| funcPassManager.addPass(createConcretizePadResultShapePass()); |
| |
| // Convert forall to for before vectorization preparation. |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| if (pipelineOpt.enablePeeling) { |
| funcPassManager.addPass(createLLVMCPUPeelPass()); |
| } |
| |
| { |
| funcPassManager.addPass(createTensorToVectorVectorizePadPass()); |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| // Eliminate redundant transfer_read/write to avoid stack allocations. |
| funcPassManager.addPass(createOptimizeVectorTransferPass( |
| OptimizeVectorTransferPassOptions{/*flatten=*/true})); |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Run IREE specific passes before vector lowering expert. |
| funcPassManager.addPass(createPropagateDispatchSizeBoundsPass()); |
| funcPassManager.addPass(createRemoveSingleIterationLoopPass()); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "shuffle"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
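| /// Mmt4d pipeline: tiles, optionally lowers to mmt4d ukernel calls, |
| /// vectorizes, bufferizes, and runs the mmt4d-specific and generic vector |
| /// lowerings. |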
| void addMmt4dTilingExpertPassPipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| // The two passes below are no-ops if "mmt4d" is explicitly excluded from the |
| // ukernels attribute. |
| funcPassManager.addPass(createCPUPrepareUkernelsPass()); |
| funcPassManager.addPass( |
| createCPULowerToUKernelsPass(clSkipIntermediateRoundings)); |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| options.vectorizePadding = true; |
| options.vectorizeGatherAccesses = true; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| // Vector lowering of Mmt4d. |
| funcPassManager.addPass(createLLVMCPUMmt4dVectorLoweringPass( |
| LLVMCPUMmt4dVectorLoweringPassOptions{ |
| clEnableVectorContractCustomKernels})); |
| |
| // Generic vector lowering. |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| |
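| /// Data-tiling pipeline: optionally lowers pack/unpack to ukernel calls, |
| /// tiles the common parallel dimensions, decomposes pack/unpack ops, |
| /// vectorizes, bufferizes, and lowers the vector ops. |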
| void addCPUDataTilingPipeline(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| |
| // The two passes below are no-ops if pack/unpack is not specified in the |
| // ukernels attribute. By default, they are disabled. |
| funcPassManager.addPass(createCPUPrepareUkernelsPass()); |
| funcPassManager.addPass( |
| createCPULowerToUKernelsPass(clSkipIntermediateRoundings)); |
| |
| funcPassManager.addPass(createLLVMCPUTilePass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles, /*skipRootOp=*/false)); |
| if (pipelineOpt.decomposePackUnPackOps) { |
| funcPassManager.addPass(createDecomposePackUnPackOpsPass()); |
| } |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.vectorizePadding = true; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
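| /// Pipeline for LinalgExt-based dispatches (e.g., attention, winograd): |
| /// tiles, decomposes the LinalgExt ops, vectorizes, bufferizes, and lowers |
| /// the vector ops. |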
| void addCPULinalgExtTileAndVectorizePipeline( |
| OpPassManager &funcPassManager, const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| funcPassManager.addPass(createLLVMCPUTileAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| funcPassManager.addPass( |
| IREE::LinalgExt::createConvertAttentionToOnlineAttentionPass()); |
| funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperandsPass( |
| IREE::CPU::TilingLevel::VectorReductionTiles)); |
| funcPassManager.addPass( |
| IREE::LinalgExt::createDecomposeWinogradTransformPass()); |
| funcPassManager.addPass(IREE::LinalgExt::createDecomposeAttentionPass()); |
| funcPassManager.addPass(iree_compiler::createForallToForPass()); |
| |
| { |
| GenericVectorizationPassOptions options; |
| options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes; |
| options.enableVectorMasking = pipelineOpt.enableVectorMasking; |
| funcPassManager.addPass(createGenericVectorizationPass(options)); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); |
| funcPassManager.addPass(createCanonicalizerPass()); |
| funcPassManager.addPass(createCSEPass()); |
| if (clFailOnLargeVector) { |
| funcPassManager.addPass(createLLVMCPUVerifyVectorSizeLegalityPass()); |
| } |
| } |
| |
| addCPUBufferizePasses(funcPassManager); |
| |
| { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| options.enableArmI8mm = pipelineOpt.enableAArch64I8mm; |
| options.enableArmSME = pipelineOpt.enableAArch64SME; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| } |
| } |
| |
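| /// Fallback pipeline: distributes, tiles the last op at the common parallel |
| /// level, and bufferizes. No vectorization is performed. |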
| void addCPUDefaultPassPipeline(OpPassManager &funcPassManager, |
| const LLVMCPUPipelineOptions &pipelineOpt) { |
| addTileAndDistributePasses(funcPassManager, pipelineOpt); |
| funcPassManager.addPass(createLLVMCPUTileLastOpAndFuseProducerConsumerPass( |
| IREE::CPU::TilingLevel::VectorCommonParallelTiles)); |
| addCPUBufferizePasses(funcPassManager); |
| } |
| |
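| /// Lowers the bufferized module to the LLVM dialect: lowers remaining |
| /// LinalgExt/Linalg ops to loops, handles math approximations and bf16 |
| /// emulation, optionally legalizes for ArmSME, lowers vector transfers and |
| /// SCF to CF, and finally converts everything to LLVM and reconciles |
| /// unrealized casts. |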
| static void addLowerToLLVMPasses(OpPassManager &modulePassManager, |
| bool enableAArch64SME) { |
| // TODO: Remove the following pass and plumb support for #hal.descriptor_type |
| // memory space through the stack. |
| FunctionLikeNest(modulePassManager) |
| .addPass(createEraseHALDescriptorTypeFromMemRefPass); |
| |
| // Lower `ukernel.*` ops to function calls. |
| modulePassManager.addPass(createLowerUKernelOpsToCallsPass()); |
| |
| FunctionLikeNest(modulePassManager) |
| // LinalgExt -> SCF |
| .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass) |
| // Linalg -> SCF |
| .addPass(createMemrefCopyToLinalgPass) |
| .addPredicatedPass(clCheckLinalgVectorization, |
| createLLVMCPUEmitVectorizationRemarksPass) |
| .addPass(createConvertLinalgToLoopsPass) |
| .addPass(createConvertBf16ArithToF32Pass) |
| .addPass(createConvertBf16ToUInt16BuffersPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass); |
| |
| // Handle tensor-type constants. |
| modulePassManager.addPass(createIREEBufferizeConstantsPass()); |
| |
| FunctionLikeNest(modulePassManager) |
| .addPass(createFoldTensorExtractOpPass) |
| // Handle complex operation conversion. |
| .addPass(createConvertComplexToStandardPass) |
| // Math dialect ops rewrites, approximations, casts. |
| .addPass(createMathTransformPass) |
| .addPass(createHoistStaticallyBoundAllocationsPass) |
| // Use the faster `arith.minnumf/maxnumf` ops instead of |
| // `arith.minimumf/maximumf`. |
| .addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass); |
| |
| if (enableAArch64SME) { |
| modulePassManager.addPass(mlir::arm_sme::createVectorLegalizationPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPredicatedPass( |
| clForceArmStreaming, |
| [] { |
| // 1. Enable Armv9-A streaming mode without ZA (i.e., SSVE) for |
| // dispatch regions that contain scalable vectors when forced via |
| // the --iree-llvmcpu-force-arm-streaming flag. |
| return mlir::arm_sme::createEnableArmStreamingPass( |
| mlir::arm_sme::ArmStreamingMode::StreamingLocally, |
| mlir::arm_sme::ArmZaMode::Disabled, |
| /*ifRequiredByOps=*/false, |
| /*ifContainsScalableVectors=*/true); |
| }) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPass(mlir::createArithToArmSMEConversionPass) |
| .addPass(mlir::createConvertVectorToArmSMEPass) |
| .addPass([] { |
| // 2. Enable ZA for dispatch regions that contain ArmSME ops (which |
| // all make use of the ZA state). |
| return mlir::arm_sme::createEnableArmStreamingPass( |
| mlir::arm_sme::ArmStreamingMode::StreamingLocally, |
| mlir::arm_sme::ArmZaMode::NewZA, |
| /*ifRequiredByOps=*/true); |
| }) |
| .addPass(mlir::createConvertArmSMEToSCFPass); |
| } |
| |
| VectorTransferLoweringPassOptions transferLoweringOptions; |
| if (!enableAArch64SME) { |
| // The ArmSME dialect has its own (more specific) lowerings for scalable |
| // vectors that occur later in the pipeline, so only enable the general |
| // lowerings if SME is not available. |
| transferLoweringOptions.enableScalableLowerings = true; |
| } |
| |
| FunctionLikeNest(modulePassManager) |
| // All structural buffer manipulations must conclude before this point. |
| |
| // The subview folding does not handle potentially out-of-bounds |
| // vector.transfer_read and vector.transfer_write ops, so lower them to |
| // loads and stores here. |
| .addPass([&]() { |
| return createVectorTransferLoweringPass(transferLoweringOptions); |
| }) |
| .addPass(memref::createFoldMemRefAliasOpsPass) |
| .addPass(createIREEExpandStridedMetadataPass) |
| .addPass(createCleanupBufferAllocViewPass) |
| // It is easier to check stack allocations before converting to the CF |
| // dialect. |
| .addPass([&]() { |
| return createLLVMCPUCheckIRBeforeLLVMConversionPass( |
| LLVMCPUCheckIRBeforeLLVMConversionPassOptions{ |
| clFailOnOutOfBoundsStackAllocation}); |
| }) |
| // SCF -> CF |
| .addPass(createSCFToControlFlowPass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| // (HAL, IREE, Linalg, CF) -> LLVM |
| .addPass(memref::createFoldMemRefAliasOpsPass) |
| .addPass(affine::createAffineExpandIndexOpsPass) |
| .addPass([&]() { |
| arith::ArithExpandOpsPassOptions options; |
| options.includeBf16 = true; |
| options.includeF4E2M1 = true; |
| options.includeF8E8M0 = true; |
| return arith::createArithExpandOpsPass(options); |
| }) |
| .addPass(createEmulateNarrowTypePass) |
| .addPass(createCanonicalizerPass) |
| .addPass(createCSEPass) |
| .addPredicatedPass(clInstrumentMemoryAccesses, |
| createInstrumentMemoryAccessesPass); |
| |
| if (enableAArch64SME) { |
| FunctionLikeNest(modulePassManager).addPass([&] { |
| return createConvertArmSMEToLLVMPass(); |
| }); |
| } |
| modulePassManager.addPass( |
| createConvertToLLVMPass(clEnableReassociateFpReductions)); |
| modulePassManager.addPass(createReconcileUnrealizedCastsPass()); |
| |
| // We rely on MLIR symbol visibility being correct after this point and need |
| // to mirror the LLVM linkage that was assigned during conversion. |
| modulePassManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass()); |
| |
| modulePassManager.addPass(createCanonicalizerPass()); |
| modulePassManager.addPass(createCSEPass()); |
| modulePassManager.addNestedPass<LLVM::LLVMFuncOp>( |
| createAddFastMathFlagsPass()); |
| } |
| |
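| /// Configuration pipeline: runs the common target preprocessing passes, |
| /// materializes user configs and device encodings, and selects the lowering |
| /// strategy for each function. |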
| void buildLLVMCPUCodegenConfigurationPassPipelineImpl( |
| OpPassManager &modulePassManager) { |
| { |
| FunctionLikeNest funcPassManager(modulePassManager); |
| addCommonTargetExecutablePreprocessingPasses(funcPassManager, |
| clUseSoftmaxInterFusion); |
| } |
| modulePassManager.addPass(createMaterializeUserConfigsPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPass(createRematerializeParallelOpsPass) |
| // TODO(#13888): This pass (createExpandF16OpToF32Pass) is being added way |
| // too late and should instead be done during lowering to LLVM. |
| .addPass(createExpandF16OpToF32Pass) |
| .addPass(createMaterializeDeviceEncodingPass) |
| .addPass(createCPUPropagateDataLayoutPass) |
| .addPass(createConvertAccGEMMToGEMMPass) |
| // TODO: Remove the following pass and plumb support for |
| // #hal.descriptor_type memory space through the stack. |
| .addPass(createEraseHALDescriptorTypeFromMemRefPass); |
| |
| modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass()); |
| LLVM_DEBUG({ |
| llvm::dbgs() << "LLVMCPU codegen configuration pass pipeline:\n"; |
| modulePassManager.printAsTextualPipeline(llvm::dbgs()); |
| llvm::dbgs() << "\n"; |
| }); |
| } |
| |
| void buildLLVMCPUCodegenConfigurationPassPipeline( |
| OpPassManager &variantPassManager) { |
| variantPassManager.addPass(createSpecializeExportsPass()); |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| buildLLVMCPUCodegenConfigurationPassPipelineImpl(modulePassManager); |
| } |
| |
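| /// Lowers each executable variant using the selected strategy (or a |
| /// transform dialect script), reconciles translation info, and then lowers |
| /// the module to the LLVM dialect. |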
| void buildLLVMCPUCodegenPassPipeline(OpPassManager &variantPassManager, |
| bool enableAArch64SME) { |
| |
| { |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass()); |
| FunctionLikeNest(modulePassManager) |
| .addPass(createLLVMCPULowerExecutableTargetPass) |
| .addPass(createVerifyWorkgroupDistributionPass); |
| if (clPatchFuncOps) { |
| modulePassManager.addPass(createPatchFuncOpsPass()); |
| } |
| } |
| |
| variantPassManager.addPass(createReconcileTranslationInfoPass()); |
| variantPassManager.addPass(createLowerAffinePass()); |
| variantPassManager.addPass(IREE::Util::createDropCompilerHintsPass()); |
| |
| // Run conversion to LLVM at `ModuleOp` granularity. |
| { |
| OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>(); |
| addLowerToLLVMPasses(modulePassManager, enableAArch64SME); |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "LLVMCPU codegen pass pipeline:\n"; |
| variantPassManager.printAsTextualPipeline(llvm::dbgs()); |
| llvm::dbgs() << "\n"; |
| }); |
| } |
| |
| // NOTE: this runs on the top-level program module containing all |
| // hal.executable ops. |
| void buildLLVMCPULinkingPassPipeline(OpPassManager &modulePassManager, |
| std::optional<std::string> target) { |
| // Link together executables. This may produce some IR duplication. |
| LLVMCPULinkExecutablesPassOptions linkOptions; |
| linkOptions.target = target.value_or(""); |
| modulePassManager.addPass(createLLVMCPULinkExecutablesPass(linkOptions)); |
| |
| // Cleanup IR duplication. |
| modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>( |
| mlir::createCanonicalizerPass()); |
| |
| // Assign final executable constant and import ordinals. |
| auto &variantPassManager = modulePassManager.nest<IREE::HAL::ExecutableOp>() |
| .nest<IREE::HAL::ExecutableVariantOp>(); |
| variantPassManager.addPass(createLLVMCPUAssignConstantOrdinalsPass()); |
| variantPassManager.addPass(createLLVMCPUAssignImportOrdinalsPass()); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Register LLVMCPU Passes |
| //===---------------------------------------------------------------------===// |
| |
| namespace { |
| #define GEN_PASS_REGISTRATION |
| #include "iree/compiler/Codegen/LLVMCPU/Passes.h.inc" |
| } // namespace |
| |
| void registerCodegenLLVMCPUPasses() { |
| // Generated. |
| registerPasses(); |
| |
| static PassPipelineRegistration<> LLVMCPUConfigPipeline( |
| "iree-codegen-llvmcpu-configuration-pipeline", |
| "Runs the translation strategy configuration pipeline on Linalg for CPU", |
| [](OpPassManager &modulePassManager) { |
| buildLLVMCPUCodegenConfigurationPassPipelineImpl(modulePassManager); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPUBufferizationPipeline( |
| "iree-codegen-llvmcpu-bufferization-pipeline", |
| "Runs the bufferization pipeline for CPU", |
| [](OpPassManager &funcPassManager) { |
| addCPUBufferizePasses(funcPassManager); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPUVectorLoweringPipeline( |
| "iree-codegen-llvmcpu-vector-lowering-pipeline", |
| "Runs the LLVMCPU vector lowering pipeline", |
| [](OpPassManager &funcPassManager) { |
| LLVMCPUVectorLoweringPassOptions options; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| buildLLVMCPUVectorLoweringPipeline(funcPassManager, options); |
| }); |
| |
| struct LinalgToLLVMPipelineOptions |
| : public PassPipelineOptions<LinalgToLLVMPipelineOptions> { |
| Option<bool> enableArmSME{ |
| *this, "enable-arm-sme", |
| llvm::cl::desc("Enable the ArmSME lowering pipeline.")}; |
| }; |
| |
| static PassPipelineRegistration<LinalgToLLVMPipelineOptions> |
| LinalgLLVMPipeline( |
| "iree-codegen-linalg-to-llvm-pipeline", |
| "Runs the progressive lowering pipeline from Linalg to LLVM", |
| [](OpPassManager &variantPassManager, |
| LinalgToLLVMPipelineOptions const &options) { |
| buildLLVMCPUCodegenPassPipeline(variantPassManager, |
| options.enableArmSME); |
| }); |
| |
| static PassPipelineRegistration<> LLVMCPULinkingPipeline( |
| "iree-codegen-llvmcpu-linking-pipeline", |
| "Runs the LLVMCPU HAL executable linking pipeline", |
| [](OpPassManager &modulePassManager) { |
| buildLLVMCPULinkingPassPipeline(modulePassManager); |
| }); |
| } |
| |
| } // namespace mlir::iree_compiler |