| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/Passes.h" |
| |
| #include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h" |
| #include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h" |
| #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h" |
| #include "iree/compiler/Codegen/PassDetail.h" |
| #include "iree/compiler/Codegen/Sandbox/Passes.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" |
| #include "mlir/Dialect/Arithmetic/Transforms/Passes.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/StandardOps/Transforms/Passes.h" |
| #include "mlir/Pass/PassManager.h" |
| #include "mlir/Transforms/Passes.h" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| /// Command line options used purely for development purposes. Not to be relied |
| /// on in any way. |
| static llvm::cl::opt<bool> clCheckIRBeforeLLVMConversion( |
| "iree-codegen-check-ir-before-llvm-conversion", |
| llvm::cl::desc("Runs the pass to check the IR generated from LLVMCPU " |
| "before conversion to LLVM IR"), |
| llvm::cl::init(false)); |
| |
| //===---------------------------------------------------------------------===// |
| // Default allocation functions for CPU backend |
| //===---------------------------------------------------------------------===// |
| |
| // Default allocation function to use with IREE's bufferization. Allocates |
| // buffers on the stack via memref.alloca. |
| static Value cpuAllocationFunction(OpBuilder &builder, Location loc, |
| ArrayRef<int64_t> staticShape, |
| Type elementType, |
| ArrayRef<Value> dynamicSizes) { |
| MemRefType allocType = MemRefType::get(staticShape, elementType); |
| return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes); |
| } |
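| |
| // For example, with staticShape = [?, 16], elementType = f32, and a single |
| // dynamic size %d0, the allocation above would materialize as (illustrative |
| // names): |
| //   %buffer = memref.alloca(%d0) : memref<?x16xf32> |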
| |
| // Allocation callbacks to use with the upstream comprehensive bufferization. |
| static Optional<Value> cpuComprehensiveBufferizeAllocationFn( |
| OpBuilder &builder, Location loc, MemRefType memRefType, |
| ArrayRef<Value> dynamicSizes) { |
| return builder.create<memref::AllocaOp>(loc, memRefType, dynamicSizes) |
| .getResult(); |
| } |
| |
| static void cpuComprehensiveBufferizeDeallocationFn(OpBuilder &builder, |
| Location loc, |
| Value allocation) { |
| // Stack allocations created by the allocation callback above need no |
| // explicit deallocation, so this is intentionally a no-op. |
| } |
| |
| static void cpuComprehensiveBufferizeCopyFn(OpBuilder &builder, Location loc, |
| Value from, Value to) { |
| builder.create<linalg::CopyOp>(loc, from, to); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| // Codegen configuration verifications. |
| //===---------------------------------------------------------------------===// |
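| |
| // A rough sketch of a configuration this verifier accepts (the attribute |
| // syntax below is illustrative, not exact): |
| //   translation.info : passPipeline = "CPUTensorToVectors", |
| //                      workloadPerWorkgroup = [64, 64] |
| //   lowering.config  : tileSizes = [[64, 64], [8, 32], [1, 4]], |
| //                      nativeVectorSize = [1, 4] |
| // i.e. three tiling levels, first-level tile sizes that match |
| // workload_per_wg on the partitioned loops, a native_vector_size (if set) |
| // equal to the last tiling level, and an empty workgroup size. |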
| |
| LogicalResult verifyTensorToVectorsPassPipelineConfig( |
| Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig, |
| IREE::Codegen::TranslationInfoAttr translationInfo, |
| ArrayRef<int64_t> workgroupSize) { |
| if (!workgroupSize.empty()) { |
| return op->emitOpError( |
| "expected workgroup size to be empty for CPU pipelines"); |
| } |
| |
| // Verify that the translation info is using the right pipeline. |
| auto pipeline = |
| IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors; |
| StringRef pipelineName = stringifyEnum(pipeline); |
| if (translationInfo.getDispatchLoweringPassPipeline() != pipeline) { |
| return op->emitOpError("expected pipeline in translation.info to be ") |
| << pipelineName; |
| } |
| |
| // Verify the workload_per_wg values: at most kNumMaxParallelDims entries, |
| // one per partitioned loop, and none of them zero. |
| SmallVector<int64_t> workloadPerWorkgroup = |
| translationInfo.getWorkloadPerWorkgroupVals(); |
| if (workloadPerWorkgroup.size() > kNumMaxParallelDims) { |
| return op->emitOpError("workload_per_wg size should be less than ") |
| << kNumMaxParallelDims; |
| } |
| if (isa<linalg::LinalgOp, IREE::LinalgExt::TiledOpInterface>(op)) { |
| SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op); |
| if (workloadPerWorkgroup.size() != partitionedLoops.size()) { |
| return op->emitOpError("expected ") |
| << partitionedLoops.size() |
| << " entries for workload_per_wg, but got " |
| << workloadPerWorkgroup.size(); |
| } |
| } |
| if (llvm::any_of(workloadPerWorkgroup, |
| [](int64_t val) { return val == 0; })) { |
| return op->emitOpError("invalid to use 0 in workload_per_wg"); |
| } |
| |
| if (loweringConfig.getTileSizes().size() != 3) { |
| return op->emitOpError("expected three levels of tile sizes for ") |
| << pipelineName << ", got " << loweringConfig.getTileSizes().size(); |
| } |
| SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(0); |
| if (!firstLevelTileSizes.empty()) { |
| // Verify that if the first-level tile sizes are set, they are the same as |
| // workload_per_wg for the partitioned loops. |
| SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op); |
| size_t minElements = |
| (partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1); |
| if (firstLevelTileSizes.size() < minElements) { |
| return op->emitOpError("expected at least ") |
| << minElements |
| << " size for first level tiling to get the distribution fully " |
| "specified."; |
| } |
| llvm::SmallDenseSet<unsigned> partitionedLoopsSet; |
| partitionedLoopsSet.insert(partitionedLoops.begin(), |
| partitionedLoops.end()); |
| SmallVector<int64_t> partitionedTileSizes; |
| for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) { |
| if (!partitionedLoopsSet.count(tileSize.index())) { |
| continue; |
| } |
| partitionedTileSizes.push_back(tileSize.value()); |
| } |
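| // workload_per_wg is specified in the reverse order of the partitioned |
| // loops, hence the llvm::reverse below when lining it up against the |
| // distributed (first-level) tile sizes collected above. |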
| for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) { |
| if (val.value() != partitionedTileSizes[val.index()]) { |
| return op->emitOpError("mismatch in distributed tile size value ") |
| << partitionedTileSizes[val.index()] << " at position " |
| << val.index() << " and workload_per_wg value " << val.value(); |
| } |
| } |
| } |
| |
| // Verify that the native vector size is either empty or, if set, the same |
| // as the last level of tile sizes. |
| SmallVector<int64_t> nativeVectorSize = |
| loweringConfig.getNativeVectorSizeVals(); |
| if (!nativeVectorSize.empty()) { |
| if (nativeVectorSize != |
| loweringConfig.getTileSizeVals( |
| static_cast<unsigned>(TilingLevel::VectorTiles))) { |
| return op->emitOpError( |
| "native_vector_size must be same as the last level of tiling"); |
| } |
| } |
| return success(); |
| } |
| |
| void addTensorToVectorsPassPipeline(OpPassManager &passManager, |
| bool lowerToVectors) { |
| passManager.addPass(createCanonicalizerPass()); |
| |
| // Tile and vectorize linalg ops on tensors. |
| passManager.addNestedPass<FuncOp>( |
| createLLVMCPUTileAndVectorizePass(lowerToVectors)); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| |
| // Use stack allocation on the CPU side. |
| addLinalgBufferizePasses(passManager, cpuAllocationFunction); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| |
| passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass()); |
| |
| passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass()); |
| } |
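| |
| // The pipelines below index into the three-level tiling scheme checked by |
| // the verifier above: level 0 distributes work to workgroups, |
| // TilingLevel::L1Tiles produces cache-friendly tiles, and |
| // TilingLevel::VectorTiles produces vector-sized tiles. |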
| |
| void addSingleTilingExpertPassPipeline(OpPassManager &passManager) { |
| passManager.addPass(createCanonicalizerPass()); |
| // Add the sandbox single tiling expert to tile and vectorize. |
| { |
| LinalgSingleTilingExpertPassOptions options; |
| options.vectorize = true; |
| options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles); |
| passManager.addNestedPass<FuncOp>( |
| createLinalgSingleTilingExpertPass(options)); |
| } |
| |
| // TODO(ravishankarm): This is commented out because it is still a work in |
| // progress; to be enabled soon. |
| // auto callbacks = |
| // std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>( |
| // cpuComprehensiveBufferizeAllocationFn, |
| // cpuComprehensiveBufferizeDeallocationFn, |
| // cpuComprehensiveBufferizeCopyFn); |
| // addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks)); |
| addLinalgBufferizePasses(passManager, cpuAllocationFunction); |
| |
| // Add the vector lowering expert. |
| { |
| OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>(); |
| LinalgVectorLoweringPassOptions options; |
| addLowerToVectorTransforms(nestedFuncPassManager, options); |
| } |
| } |
| |
| void addDoubleTilingExpertPassPipeline(OpPassManager &passManager) { |
| passManager.addPass(createCanonicalizerPass()); |
| { |
| passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass()); |
| LinalgSingleTilingExpertPassOptions options; |
| options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles); |
| options.tileInterchange = {0, 2, 1}; |
| passManager.addNestedPass<FuncOp>( |
| createLinalgSingleTilingExpertPass(options)); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| } |
| |
| // Add the sandbox single tiling expert to tile and vectorize. |
| { |
| // The options are derived from the sandbox codegen driver. The |
| // hoistPaddings option does not work for IREE's cases, but it is fine to |
| // leave it disabled since the generated IR already matches what the |
| // sandbox produces. |
| LinalgSingleTilingExpertPassOptions options; |
| options.vectorize = true; |
| options.vectorizePadding = true; |
| options.pad = true; |
| options.packPaddings = {1, 1, 0}; |
| // options.hoistPaddings = {5, 6, 0}; |
| options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles); |
| options.tileInterchange = {0, 1, 2}; |
| passManager.addNestedPass<FuncOp>( |
| createLinalgSingleTilingExpertPass(options)); |
| } |
| |
| // TODO(ravishankarm): This is commented out because it is still a work in |
| // progress; to be enabled soon. |
| // auto callbacks = |
| // std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>( |
| // cpuComprehensiveBufferizeAllocationFn, |
| // cpuComprehensiveBufferizeDeallocationFn, |
| // cpuComprehensiveBufferizeCopyFn); |
| // addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks)); |
| addLinalgBufferizePasses(passManager, cpuAllocationFunction); |
| |
| // Add the vector lowering expert. |
| { |
| OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>(); |
| LinalgVectorLoweringPassOptions options; |
| options.splitVectorTransfersTo = "linalg-copy"; |
| addLowerToVectorTransforms(nestedFuncPassManager, options); |
| } |
| } |
| |
| void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager, |
| bool lowerToVectors) { |
| passManager.addPass(createCanonicalizerPass()); |
| |
| // Tile and vectorize linalg ops on tensors. |
| passManager.addNestedPass<FuncOp>( |
| createLLVMCPUTileFuseAndVectorizePass(lowerToVectors)); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| |
| // Use stack allocation on the CPU side. |
| |
| // TODO(ravishankarm): This is commented out because it is still a work in |
| // progress; to be enabled soon. |
| // |
| // auto callbacks = |
| // std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>( |
| // cpuComprehensiveBufferizeAllocationFn, |
| // cpuComprehensiveBufferizeDeallocationFn, |
| // cpuComprehensiveBufferizeCopyFn); |
| // addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks)); |
| |
| addLinalgBufferizePasses(passManager, cpuAllocationFunction); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| |
| passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass()); |
| passManager.addNestedPass<FuncOp>(createOptimizeVectorTransferPass()); |
| } |
| |
| void addCPUDefaultPassPipeline(OpPassManager &passManager) { |
| passManager.addPass(createCanonicalizerPass()); |
| // Use stack allocation on the CPU side. |
| addLinalgBufferizePasses(passManager, cpuAllocationFunction); |
| } |
| |
| static void addLowerToLLVMPasses(OpPassManager &passManager) { |
| // LinalgExt -> SCF |
| passManager.addNestedPass<FuncOp>( |
| IREE::LinalgExt::createLinalgExtToLoopsPass()); |
| |
| // Linalg -> SCF |
| passManager.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| |
| // SCF -> STD |
| passManager.addNestedPass<FuncOp>(createLowerToCFGPass()); |
| passManager.addNestedPass<FuncOp>(createCanonicalizerPass()); |
| passManager.addNestedPass<FuncOp>(createCSEPass()); |
| |
| if (clCheckIRBeforeLLVMConversion) { |
| passManager.addPass(createLLVMCPUCheckIRBeforeLLVMConversionPass()); |
| } |
| // Handle tensor-type constants. |
| passManager.addPass(createTensorConstantBufferizePass()); |
| passManager.addPass(createFoldTensorExtractOpPass()); |
| |
| // (HAL, IREE, Linalg, STD) -> LLVM |
| passManager.addNestedPass<FuncOp>(arith::createArithmeticExpandOpsPass()); |
| passManager.addNestedPass<FuncOp>(createStdExpandOpsPass()); |
| passManager.addPass(createConvertToLLVMPass()); |
| |
| // We rely on MLIR symbol visibility being correct after this point and need |
| // to mirror the LLVM linkage that was assigned during conversion. |
| passManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass()); |
| |
| passManager.addPass(createCanonicalizerPass()); |
| passManager.addPass(createCSEPass()); |
| } |
| |
| void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager) { |
| passManager.addPass(createLLVMCPULowerExecutableTargetPass()); |
| OpPassManager &nestedModulePM = passManager.nest<ModuleOp>(); |
| addLowerToLLVMPasses(nestedModulePM); |
| } |
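| |
| // A minimal sketch of how this pipeline could be exposed as a named pass |
| // pipeline (illustrative only; the registration string below is made up): |
| //   static PassPipelineRegistration<> llvmCPUPipeline( |
| //       "iree-llvmcpu-codegen-pipeline", |
| //       "Runs the IREE LLVMCPU codegen pipeline", |
| //       [](OpPassManager &passManager) { |
| //         buildLLVMCPUCodegenPassPipeline(passManager); |
| //       }); |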
| |
| } // namespace iree_compiler |
| } // namespace mlir |