| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #ifndef IREE_COMPILER_CODEGEN_PASSES_H_ |
| #define IREE_COMPILER_CODEGEN_PASSES_H_ |
| |
| #include <memory> |
| |
| #include "iree/compiler/Dialect/HAL/IR/HALOps.h" |
| #include "iree/compiler/Dialect/HAL/IR/LoweringConfig.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/Pass/PassOptions.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| // Registers all conversion passes in this directory. |
| void registerCodegenPasses(); |
| |
| //------------------------------------------------------------------------------ |
| // Misc/common conversions |
| //------------------------------------------------------------------------------ |
| |
/// Alias for a callback function that allocates workgroup-level memory. The
/// callback expects the static shape and element type of the result memref
/// type. It also expects values for the dynamic dimensions of the allocated
/// memref, where each dynamic dimension corresponds to a
/// `ShapedType::kDynamicSize` value in `staticShape`.
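///
/// A minimal sketch of such a callback; the name `allocateWorkgroupMemory`
/// and the choice of memref.alloc are illustrative, not a required
/// implementation:
///
///   static Value allocateWorkgroupMemory(OpBuilder &builder, Location loc,
///                                        ArrayRef<int64_t> staticShape,
///                                        Type elementType,
///                                        ArrayRef<Value> dynamicSizes) {
///     // Each value in `dynamicSizes` provides, in order, one
///     // ShapedType::kDynamicSize entry of `staticShape`.
///     auto allocType = MemRefType::get(staticShape, elementType);
///     return builder.create<memref::AllocOp>(loc, allocType, dynamicSizes);
///   }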
| using WorkgroupMemoryAllocationFn = std::function<Value( |
| OpBuilder &builder, Location loc, ArrayRef<int64_t> staticShape, |
| Type elementType, ArrayRef<Value> dynamicSizes)>; |
| |
| /// Adds passes to convert tiled+distributed linalg on tensors code to linalg on |
| /// buffers. |
| void addLinalgBufferizePasses( |
| OpPassManager &passManager, |
| WorkgroupMemoryAllocationFn allocationFn = nullptr); |
| |
| /// Pass to perform canonicalizations/cleanups related to HAL interface/buffer |
| /// allocations and view operations. |
| std::unique_ptr<OperationPass<FuncOp>> createCleanupBufferAllocViewPass(); |
| |
/// Creates a pass to convert a model using the f32 type to an equivalent one
/// using f16.
| std::unique_ptr<OperationPass<ModuleOp>> createDemoteF32ToF16Pass(); |
| |
| /// Flattens n-D MemRef subspan ops to 1-D MemRef and folds the byte offsets on |
| /// subspan ops to the consumer load/store ops, in preparation for lowering to |
| /// backends that require linearized access. |
| std::unique_ptr<OperationPass<ModuleOp>> createFlattenMemRefSubspanPass(); |
| |
| /// After running the upstream TensorConstantBufferize pass, remove tensor_loads |
| /// introduced for use only in tensor_extract. These can be folded to use a load |
| /// of the created memref object that holds the constant values. |
| std::unique_ptr<OperationPass<>> createFoldTensorExtractOpPass(); |
| |
/// An ad-hoc pass to canonicalize selected loop-carried dependencies on
/// scf.for.
| std::unique_ptr<OperationPass<FuncOp>> createForOpCanonicalizationPass(); |
| |
/// Pass to perform linalg-on-tensors bufferization. The function passed into
/// the pass through the `allocationFn` argument is invoked whenever a new
/// buffer is to be created. The callback is passed the Values for the dynamic
/// dimensions in the memref type that is to be allocated and is expected to
/// return a Value of MemRefType. When no `allocationFn` is specified, the
/// default allocator generates an `std.alloc` instruction with the allocated
/// MemRefType having no stride map (i.e. default row-major striding) and
/// default memory space.
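///
/// For example, to run this pass with the `allocateWorkgroupMemory` callback
/// sketched above (assuming `passManager` is an OpPassManager anchored on the
/// surrounding module):
///
///   passManager.nest<FuncOp>().addPass(
///       createLinalgBufferizePass(allocateWorkgroupMemory));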
| std::unique_ptr<OperationPass<FuncOp>> createLinalgBufferizePass( |
| WorkgroupMemoryAllocationFn allocationFn = nullptr); |
| |
| /// Creates a pass to vectorize a very specific form of linalg.conv ops. |
| std::unique_ptr<OperationPass<FuncOp>> createLinalgToVectorVectorizeConvPass(); |
| |
/// Creates a pass to vectorize a very specific form of linalg.mmt4d ops.
| std::unique_ptr<OperationPass<FuncOp>> createLinalgToVectorVectorizeMMT4dPass(); |
| |
| /// Pass to optimize vector transfer_read and transfer_write. |
| std::unique_ptr<OperationPass<FuncOp>> createOptimizeVectorTransferPass(); |
| |
| /// Sets the number of workgroups to use for each entry point in the dispatch |
| /// region. |
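///
/// A usage sketch (`variantPM` is assumed to be an OpPassManager anchored on
/// the hal.executable.variant; the size values are illustrative only):
///
///   variantPM.addPass(createSetNumWorkgroupsPass({8, 8, 1}));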
| std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>> |
| createSetNumWorkgroupsPass(ArrayRef<int64_t> workgroupSize = {}); |
| |
| //----------------------------------------------------------------------------// |
| // Common codegen patterns. |
| //----------------------------------------------------------------------------// |
| |
/// Populates `patterns` with a very specific pattern that vectorizes a
/// linalg.conv op for a single thread. The linalg.conv should compute on
/// static-sized subviews. To match, the output shape must be 1x1xWoxCo, where
/// Co is a multiple of 4, and the filter shape must be 1x1x4xCo.
| void populateLinalgToVectorVectorizeConvPatterns( |
| MLIRContext *context, OwningRewritePatternList &patterns); |
| |
| /// Populates `patterns` to convert linalg.mmt4d to vector.contract. |
| void populateLinalgToVectorVectorizeMMT4dPatterns( |
| MLIRContext *context, OwningRewritePatternList &patterns); |
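
// The populate functions above follow the usual MLIR pattern-application
// idiom. A sketch of driving them (assuming `context` and `funcOp` are in
// scope):
//
//   OwningRewritePatternList patterns(context);
//   populateLinalgToVectorVectorizeConvPatterns(context, patterns);
//   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));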
| |
| //------------------------------------------------------------------------------ |
| // LLVMCPU |
| //------------------------------------------------------------------------------ |
| |
| /// Performs the final conversion to LLVM dialect. |
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToLLVMPass(); |
| |
/// Pass to lower the module within a hal.executable.variant operation to an
/// external dialect. Currently this pass lowers to the LLVM dialect, but it
/// could be generalized to lower to any "final" dialect like SPIR-V/NVVM,
/// etc.
| std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>> |
| createLLVMCPULowerExecutableTargetPass(bool lowerToVectors = true); |
| |
| /// Synchronizes LLVM linkage with MLIR symbol visibility. |
| std::unique_ptr<OperationPass<ModuleOp>> |
| createLLVMCPUSynchronizeSymbolVisibilityPass(); |
| |
| /// Multi-level tiling and vectorization of linalg ops on tensors. |
| std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUTileAndVectorizePass( |
| bool lowerToVectors = true); |
| |
| /// Vectorizes linalg ops executed in the same hal.interface.workgroup. |
| std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass( |
| bool lowerToVectors = true); |
| |
| /// Replaces llvm.intr.fma with its unfused mul and add ops. |
| std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUUnfuseFMAOpsPass(); |
| |
/// A pass that converts vector dialect operations to AArch64 inline assembly.
| std::unique_ptr<OperationPass<FuncOp>> |
| createVectorToAArch64InlineAssemblyPass(); |
| |
| //------------------------------------------------------------------------------ |
| // LLVMCPU Codegen specific patterns. |
| //------------------------------------------------------------------------------ |
| |
/// Populates `patterns` to convert a vector.contract op to a sequence of
/// AArch64 inline assembly operations.
| void populateVectorContractToAArch64InlineAsm( |
| OwningRewritePatternList &patterns, MLIRContext *context); |
| |
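/// Populates `patterns` to replace llvm.intr.fma ops with their unfused mul
/// and add counterparts.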
| void populateUnfusedFMAOpsPassPatterns(MLIRContext *context, |
| OwningRewritePatternList &patterns); |
| |
| //----------------------------------------------------------------------------// |
| // LLVMCPU backend Pass Pipelines. |
| //----------------------------------------------------------------------------// |
| |
/// Populates the passes to lower to scalar operations for linalg-based
/// code generation. This pipeline does not vectorize; it only converts to
/// memrefs.
| void addCPUDefaultPassPipeline(OpPassManager &passManager); |
| |
/// Populates the passes needed to lower to vector operations using
/// linalg-based progressive lowering with vectorization after bufferization.
| void addCPUVectorizationPassPipeline(OpPassManager &passManager, |
| bool lowerToVectors = true); |
| |
/// Populates the passes needed to multi-level tile and lower linalg ops on
/// tensors to vector operations.
| void addTensorToVectorsPassPipeline(OpPassManager &passManager, |
| bool lowerToVectors = true); |
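
// A sketch of wiring up one of the pipelines above (assuming `passManager` is
// anchored on the module inside the hal.executable.variant being compiled):
//
//   addCPUVectorizationPassPipeline(passManager, /*lowerToVectors=*/true);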
| |
| //----------------------------------------------------------------------------// |
| // LLVMCPU Pass Pipelines for lowering to LLVM dialect. |
| //----------------------------------------------------------------------------// |
| |
/// Populates passes needed to lower an XLA HLO op to the LLVM dialect via the
/// structured ops path. The pass manager should operate on the module within
/// the IREE::HAL::ExecutableOp.
| void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager); |
| |
| //------------------------------------------------------------------------------ |
| // LLVMGPU |
| //------------------------------------------------------------------------------ |
| |
/// Lowering pipeline that applies vectorization patterns. Expects `pm` to be
/// a module-level pass manager.
| void addGPUVectorizationPassPipeline(OpPassManager &pm); |
| |
/// Lowering pipeline that applies vectorization patterns for matmul ops
/// (SIMT path).
| void addGPUMatmulSimtPassPipeline(OpPassManager &pm); |
| |
/// Simple lowering that only distributes linalg ops to blocks and threads;
/// this results in scalar operations. Expects the pass manager to be a
/// module-level pass manager.
| void addGPUSimpleDistributePassPipeline(OpPassManager &pm); |
| |
/// Populates passes needed to lower an XLA HLO op to the NVVM/ROCDL dialect
/// via the structured ops path. The pass manager `pm` should operate on the
/// module within the IREE::HAL::ExecutableOp.
| void buildLLVMGPUTransformPassPipeline(OpPassManager &pm, bool useROCM); |
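
// For example (a sketch; `pm` anchored as described above):
//
//   buildLLVMGPUTransformPassPipeline(pm, /*useROCM=*/false);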
| |
/// Performs the final conversion to NVVM+LLVM dialect.
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass(); |
| |
| /// Performs the final conversion to ROCDL+LLVM dialect. |
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToROCDLPass(); |
| |
/// Performs tiling and distribution to threads.
| std::unique_ptr<OperationPass<FuncOp>> |
| createLLVMGPUTileAndDistributeToThreads(); |
| |
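/// Creates a pass to remove loops that are known to execute a single
/// iteration.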
| std::unique_ptr<OperationPass<FuncOp>> |
| createLLVMGPURemoveSingleIterationLoopPass(); |
| |
/// Creates a pass calling the dynamic pipeline for LLVMGPU.
| std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>> |
| createLLVMGPULowerExecutableTargetPass(); |
| |
/// Converts Linalg ops to vector ops.
| std::unique_ptr<OperationPass<FuncOp>> createLLVMGPUVectorizationPass(); |
| |
/// Lowers vector ops before conversion to LLVM.
| std::unique_ptr<OperationPass<FuncOp>> createLLVMGPUVectorLoweringPass(); |
| |
/// Converts shared memory copies to distributed transfer_read/transfer_write
/// ops.
| std::unique_ptr<OperationPass<FuncOp>> |
| createLLVMGPUDistributeSharedMemoryCopy(); |
| |
/// Applies software pipelining.
| std::unique_ptr<OperationPass<FuncOp>> createLLVMGPUPipeliningPass(); |
| |
| //------------------------------------------------------------------------------ |
| // SPIR-V Passes |
| //------------------------------------------------------------------------------ |
| |
| /// Pass pipeline to lower IREE HAL executables with workgroup tiled and |
| /// distributed Linalg ops to SPIR-V scalar code. Additionally performs |
| /// distribution to threads without vectorization. |
| void addSPIRVTileAndDistributePassPipeline(OpPassManager &pm); |
| |
| /// Pass pipeline to lower IREE HAL executables that contain Linalg ops that are |
| /// not tiled/distributed. Performs distribution to global invocations. |
| void addSPIRVDistributeToGlobalIDPassPipeline(OpPassManager &pm); |
| |
| /// Pass pipeline to lower IREE HAL executables with workgroup tiled and |
| /// distributed Linalg ops to SPIR-V scalar and vector code. Additionally |
| /// performs distribution to threads with vectorization. |
| void addSPIRVTileAndVectorizePassPipeline(OpPassManager &pm); |
| |
| /// Pass to perform the final conversion to SPIR-V dialect. |
| /// |
/// This pass converts remaining interface ops into SPIR-V global variables,
/// GPU processor ID ops into SPIR-V global variables, and loop/standard ops
/// into corresponding SPIR-V ops.
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToSPIRVPass(); |
| |
| /// Pass to distribute Linalg ops with buffer semantics to global invocations. |
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVDistributeToGlobalIDPass(); |
| |
| /// Creates a pass to fold processor ID uses where possible. |
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVFoldProcessorIDUsesPass(); |
| |
| /// Main pass to lower executables to scalar + vector code on SPIR-V path. |
| /// Invokes one of the pass pipelines that translate the executable to |
| /// scalar + vector code. |
| std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>> |
| createSPIRVLowerExecutableTargetPass(); |
| |
/// Pass to remove loops generated at the Flow level for tiled and distributed
/// Linalg ops when the loop is known to have a trip count of one.
/// WARNING: DO NOT USE. This is a legacy pass that is to be deprecated.
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVRemoveOneTripTiledLoopPass(); |
| |
| /// Pass to tile and distribute Linalg ops with buffer semantics to subgroups |
| /// and invocations. |
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVTileAndDistributePass(); |
| |
| /// Pass to convert vector read/write/arithmetic operations to the corresponding |
| /// cooperative matrix ops when possible. |
| std::unique_ptr<OperationPass<FuncOp>> |
| createSPIRVVectorToCooperativeMatrixPass(); |
| |
| /// Pass to lower linalg.copy for copying data to workgroup memory. |
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVCopyToWorkgroupMemoryPass(); |
| |
| /// Pass to vectorize Linalg ops with buffer semantics. |
| std::unique_ptr<OperationPass<FuncOp>> createSPIRVVectorizePass(); |
| |
/// Converts a memref of scalars to a memref of vectors of an efficient size.
/// This allows converting memory accesses to vector load/store ops in SPIR-V
/// without needing pointer bitcasts.
| std::unique_ptr<OperationPass<ModuleOp>> createSPIRVVectorizeLoadStore(); |
| |
| //----------------------------------------------------------------------------// |
| // SPIRV Codegen Pass Pipelines. |
| //----------------------------------------------------------------------------// |
| |
/// Populates passes needed to lower an XLA HLO op to the SPIR-V dialect via
/// the structured ops path. The pass manager `pm` should operate on the
/// module within the IREE::HAL::ExecutableOp. The pass pipeline will set an
/// appropriate workgroup size.
/// TODO: Are both of these needed and does this one still work on HLO?
| void buildSPIRVCodegenPassPipeline(OpPassManager &pm); |
| |
| //----------------------------------------------------------------------------// |
| // SPIRV Codegen specific patterns. |
| //----------------------------------------------------------------------------// |
| |
/// Populates patterns to fold processor ID uses by using processor count
/// information where possible.
| void populateFoldGPUProcessorIDUsesPatterns(MLIRContext *context, |
| OwningRewritePatternList &patterns); |
| |
| //------------------------------------------------------------------------------ |
| // Test passes |
| //------------------------------------------------------------------------------ |
| |
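/// Creates a pass used to test LLVMGPU legalization.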
| std::unique_ptr<OperationPass<ModuleOp>> createTestLLVMGPULegalizePass(); |
| |
| } // namespace iree_compiler |
| } // namespace mlir |
| |
| #endif // IREE_COMPILER_CODEGEN_PASSES_H_ |