// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h"
#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Sandbox/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Dialect/Arithmetic/Transforms/Passes.h"
#include "mlir/Dialect/Func/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
namespace mlir {
namespace iree_compiler {
/// Command line options used purely for development purposes. Not to be relied
/// on in any way.
static llvm::cl::opt<bool> clCheckIRBeforeLLVMConversion(
"iree-codegen-check-ir-before-llvm-conversion",
llvm::cl::desc("Runs the pass to check the IR generated from LLVMCPU "
"before conversion to LLVM IR"),
llvm::cl::init(true));
//===---------------------------------------------------------------------===//
// Default allocation functions for CPU backend
//===---------------------------------------------------------------------===//
// Default allocation function to use with IREE's bufferization; allocates
// on the stack via memref.alloca.
static Value cpuAllocationFunction(OpBuilder &builder, Location loc,
ArrayRef<int64_t> staticShape,
Type elementType,
ArrayRef<Value> dynamicSizes) {
MemRefType allocType = MemRefType::get(staticShape, elementType);
return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
}
// Allocation callbacks to use with upstream comprehensive bufferization.
static FailureOr<Value> cpuComprehensiveBufferizeAllocationFn(
OpBuilder &builder, Location loc, MemRefType memRefType,
ValueRange dynamicSizes, unsigned alignment) {
return builder
.create<memref::AllocaOp>(loc, memRefType, dynamicSizes,
builder.getI64IntegerAttr(alignment))
.getResult();
}
static LogicalResult cpuComprehensiveBufferizeDeallocationFn(OpBuilder &builder,
Location loc,
Value allocation) {
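// Nothing to deallocate: the matching allocation function above uses stack
// allocation (memref.alloca), which is freed automatically.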
return success();
}
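// Copy callback for comprehensive bufferization. Emitting the copy through
// createLinalgCopyOp keeps it in Linalg form so later passes can tile and
// vectorize it like any other linalg op.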
static LogicalResult cpuComprehensiveBufferizeCopyFn(OpBuilder &builder,
Location loc, Value from,
Value to) {
createLinalgCopyOp(builder, loc, from, to);
return success();
}
//===---------------------------------------------------------------------===//
// Codegen configuration verifications.
//===---------------------------------------------------------------------===//
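/// Returns true if `interchange` is empty (the identity) or covers every loop
/// index in [0, numLoops). For example, with numLoops = 3, {2, 0, 1} is valid
/// while {0, 2} is not (loop 1 is missing).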
static bool isValidInterchange(ArrayRef<int64_t> interchange, int numLoops) {
if (interchange.empty()) return true;
llvm::SmallDenseSet<int64_t> s;
s.insert(interchange.begin(), interchange.end());
for (int i = 0; i < numLoops; ++i) {
if (!s.contains(i)) return false;
}
return true;
}
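/// Verifies the codegen configuration for the CPUDoubleTilingExpert pipeline:
/// no workgroup size or workload_per_wg, one tile-size list per strategy
/// tiling level, parallel dims tiled only at the parallel level and reduction
/// dims only at the reduction level, valid interchange vectors, and an empty
/// native_vector_size.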
LogicalResult verifyDoubleTilingExpertPassPipelineConfig(
Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize) {
if (!workgroupSize.empty()) {
return op->emitOpError(
"expected workgroup size to be empty for CPU pipelines");
}
// Verify that the translation info is using the right pipeline.
auto pipeline =
IREE::Codegen::DispatchLoweringPassPipeline::CPUDoubleTilingExpert;
StringRef pipelineName = stringifyEnum(pipeline);
if (translationInfo.getDispatchLoweringPassPipeline() != pipeline) {
return op->emitOpError("expected pipeline in translation_info to be ")
<< pipelineName;
}
// Verify that the workload per workgroup is not set.
// TODO(ravishankarm): Remove workload_per_wg eventually.
SmallVector<int64_t> workloadPerWorkgroup =
translationInfo.getWorkloadPerWorkgroupVals();
if (!workloadPerWorkgroup.empty()) {
return op->emitOpError(
"workload_per_wg expected to be empty since it is an internal "
"compiler implementation detail");
}
if (loweringConfig.getTileSizes().size() !=
static_cast<unsigned>(StrategyTilingLevel::NumStrategyTileLevels)) {
return op->emitOpError("expected three tiling sizes for ")
<< pipelineName << ", got " << loweringConfig.getTileSizes().size();
}
IREE::Flow::PartitionableLoopsInterface interfaceOp =
dyn_cast_or_null<IREE::Flow::PartitionableLoopsInterface>(op);
if (interfaceOp) {
SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(
static_cast<unsigned>(StrategyTilingLevel::WorkGroupTiles));
// The first level of tiling must cover all loops so that all ops can be
// fused and distributed together.
if (firstLevelTileSizes.size() != interfaceOp.getNumLoops()) {
return op->emitOpError(
"mismatch between number of loops and first level of tiling");
}
llvm::SmallDenseSet<unsigned> pLoopsSet;
for (auto iteratorType : llvm::enumerate(interfaceOp.getIteratorTypes())) {
if (iteratorType.value() == getParallelIteratorTypeName()) {
pLoopsSet.insert(iteratorType.index());
}
}
SmallVector<int64_t> secondLevelTileSizes = loweringConfig.getTileSizeVals(
static_cast<unsigned>(StrategyTilingLevel::ParallelTiles));
for (auto en : llvm::enumerate(secondLevelTileSizes)) {
if (en.value() != 0 && !pLoopsSet.contains(en.index())) {
return op->emitOpError(
"expected only parallel dims to be set in the "
"second tiling sizes, got ")
<< en.index() << "-th tile size set";
}
}
SmallVector<int64_t> thirdLevelTileSizes = loweringConfig.getTileSizeVals(
static_cast<unsigned>(StrategyTilingLevel::ReductionTiles));
for (auto en : llvm::enumerate(thirdLevelTileSizes)) {
if (en.value() != 0 && pLoopsSet.contains(en.index())) {
return op->emitOpError(
"expected only reduction dims to be set in the third "
"tiling sizes, got ")
<< en.index() << "-th tile size set";
}
}
}
// Verify interchange
if (!loweringConfig.getTileInterchange().empty()) {
for (auto level : llvm::seq<unsigned>(
0, static_cast<unsigned>(
loweringConfig.getTileInterchange().size()))) {
auto tileSizes = loweringConfig.getTileSizeVals(level);
auto interchange = loweringConfig.getTileInterchangeVals(level);
if (!isValidInterchange(interchange, tileSizes.size())) {
return op->emitOpError("expected [0, ")
<< tileSizes.size()
<< ") to be set exactly once in interchange #" << level;
}
}
}
// Verify that native vector size is empty.
SmallVector<int64_t> nativeVectorSize =
loweringConfig.getNativeVectorSizeVals();
if (!nativeVectorSize.empty()) {
return op->emitOpError("native_vector_size must be empty");
}
return success();
}
//===---------------------------------------------------------------------===//
// Codegen pipelines.
//===---------------------------------------------------------------------===//
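// The pipelines below share a common structure: tile and distribute to
// workgroups, tile/fuse/vectorize within each workgroup, bufferize, and then
// lower the vector ops.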
void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
// Do first level of tiling and distribution.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
passManager.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass());
passManager.addPass(createCanonicalizerPass());
// Add the sandbox single tiling expert to tile and vectorize.
{
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<func::FuncOp>(
createLinalgSingleTilingExpertPass(options));
}
// TODO(ravishankarm): This is commented out because it is still WIP, to be
// enabled soon.
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<func::FuncOp>();
LinalgVectorLoweringPassOptions options;
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager) {
// Do first level of tiling and distribution.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
// Run IREE specific passes before vector lowering expert.
passManager.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<func::FuncOp>();
LinalgVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "linalg-copy";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager,
bool lowerToAVX2) {
passManager.addPass(createVerifyLinalgTransformLegalityPass());
// Do first level of tiling and distribution.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
passManager.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass());
// Run LinalgFusePass first in case we have fill + matmul + generic ops. At
// this stage we do not apply vectorization. The reduction dim won't get
// tiled when the case is matmul + generic op; in that case we have to tile
// along the reduction dim again, which requires the ops to still be in
// Linalg form.
{
LinalgFusePassOptions options;
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ParallelTiles);
passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
}
// Add the sandbox single tiling expert to tile and vectorize.
{
// The options are derived from the sandbox codegen driver. The hoistPadding
// option does not work for IREE cases. It is fine to omit it, since the
// generated IR already matches the sandbox.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.vectorizePadding = true;
// TODO(#8228): Enable the padding once we know how to deal with fusion. For
// now, we don't enable padding because alloca ops will be created in
// bufferization for some cases.
// options.pad = true;
// options.packPaddings = {1, 1, 0};
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ReductionTiles);
passManager.addNestedPass<func::FuncOp>(
createLinalgSingleTilingExpertPass(options));
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
}
BufferizationOptions::AllocationFn allocationFn =
cpuComprehensiveBufferizeAllocationFn;
BufferizationOptions::DeallocationFn deallocationFn =
cpuComprehensiveBufferizeDeallocationFn;
BufferizationOptions::MemCpyFn memcpyFn = cpuComprehensiveBufferizeCopyFn;
addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
memcpyFn);
// Run IREE specific passes before vector lowering expert.
passManager.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<func::FuncOp>();
LinalgVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createVerifyLinalgTransformLegalityPass());
// Do first level of tiling and distribution.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
passManager.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass());
// Run LinalgFusePass first in case we have fill + conv + generic ops. At
// this stage, we do not apply vectorization. The reduction dim won't get
// tiled when the case is conv + generic op; in that case we have to tile
// along the reduction dim again, which requires the ops to still be in
// Linalg form.
{
LinalgFusePassOptions options;
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ParallelTiles);
passManager.addNestedPass<func::FuncOp>(createLinalgFusePass(options));
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
}
// Add the sandbox single tiling expert to tile and vectorize.
{
LinalgSingleTilingExpertPassOptions options;
options.decomposeToLowerDimOp = true;
options.vectorize = true;
options.vectorizePadding = true;
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ReductionTiles);
passManager.addNestedPass<func::FuncOp>(
createLinalgSingleTilingExpertPass(options));
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
}
BufferizationOptions::AllocationFn allocationFn =
cpuComprehensiveBufferizeAllocationFn;
BufferizationOptions::DeallocationFn deallocationFn =
cpuComprehensiveBufferizeDeallocationFn;
BufferizationOptions::MemCpyFn memcpyFn = cpuComprehensiveBufferizeCopyFn;
addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
memcpyFn);
// Run IREE specific passes before vector lowering expert.
passManager.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<func::FuncOp>();
LinalgVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "shuffle";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
bool lowerToVectors) {
// Do first level of tile and distribute to workgroups.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
// Tile and vectorize linalg ops on tensors.
passManager.addNestedPass<func::FuncOp>(
createLLVMCPUTileFuseAndVectorizePass(lowerToVectors));
passManager.addNestedPass<func::FuncOp>(createCSEPass());
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
// Use stack allocation on CPU side.
// TODO(ravishankarm): This is commented out because it is still WIP, to be
// enabled soon.
//
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
passManager.addNestedPass<func::FuncOp>(createCSEPass());
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
passManager.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass());
}
void addCPUDefaultPassPipeline(OpPassManager &passManager) {
// Do first level of tile and distribute to workgroups.
passManager.addNestedPass<func::FuncOp>(createInsertDistributionInfoPass());
passManager.addNestedPass<func::FuncOp>(
createTileAndDistributeToWorkgroupsPass());
passManager.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
// Use stack allocation on CPU side.
addLinalgBufferizePasses(passManager, cpuAllocationFunction);
}
void addLinalgTransformInterpPasses(OpPassManager &passManager) {
// Give control to the linalg_transform dialect.
passManager.addPass(createLinalgTransformInterpreterPass());
// Dropping the schedule is only needed when the transform is embedded in the
// module: the schedule should be dropped once applied. This pass is a no-op
// when a separate policy is applied through a file.
passManager.addPass(createDropSchedulePass());
}
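/// Lowers what remains after tiling, vectorization, and bufferization all the
/// way to the LLVM dialect: LinalgExt and Linalg ops to loops, SCF to CF, and
/// finally everything to LLVM.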
static void addLowerToLLVMPasses(OpPassManager &passManager) {
// LinalgExt -> SCF
passManager.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createLinalgExtToLoopsPass());
// Linalg -> SCF
passManager.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
passManager.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
// SCF -> CF
passManager.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
if (clCheckIRBeforeLLVMConversion) {
passManager.addPass(createLLVMCPUCheckIRBeforeLLVMConversionPass());
}
// Handle tensor-type constants.
passManager.addPass(arith::createConstantBufferizePass());
passManager.addPass(createFoldTensorExtractOpPass());
// Math dialect elementary functions -> polynomial form.
passManager.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
// (HAL, IREE, Linalg, STD) -> LLVM
passManager.addNestedPass<func::FuncOp>(
arith::createArithmeticExpandOpsPass());
passManager.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
passManager.addPass(createConvertToLLVMPass());
passManager.addPass(createReconcileUnrealizedCastsPass());
// We rely on MLIR symbol visibility being correct after this point and need
// to mirror the LLVM linkage that was assigned during conversion.
passManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
}
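/// Entry point for CPU codegen: propagates types and bufferizes copy-only
/// dispatches, picks and applies one of the lowering strategies above via
/// LLVMCPULowerExecutableTarget, and then lowers the result to the LLVM
/// dialect.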
void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager) {
passManager.nest<ModuleOp>().nest<func::FuncOp>().addPass(
createTypePropagationPass());
passManager.nest<ModuleOp>().addPass(createBufferizeCopyOnlyDispatchesPass());
passManager.addPass(createLLVMCPULowerExecutableTargetPass());
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
addLowerToLLVMPasses(nestedModulePM);
}
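// A minimal usage sketch (illustrative only; assumes the caller owns a
// suitable MLIRContext `context` and the executable `module` to compile):
//   PassManager passManager(&context);
//   buildLLVMCPUCodegenPassPipeline(passManager);
//   if (failed(passManager.run(module))) {
//     // Handle pipeline failure.
//   }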
} // namespace iree_compiler
} // namespace mlir