| // Copyright 2023 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| //===----------------------------------------------------------------------===// |
| // |
// This file declares the LLVMGPU passes and pass pipelines.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_ |
| #define IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_ |
| |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" |
| #include "mlir/Pass/Pass.h" |
| |
| namespace mlir::iree_compiler { |
| |
| //===----------------------------------------------------------------------===// |
| // Passes |
| //===----------------------------------------------------------------------===// |
| |
| /// Lowering using SIMT CUDA core operations. |
| void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager); |
| |
| /// Lowering using mma.sync Tensor Core operations. |
| void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &funcPassManager, |
| unsigned pipelineDepth); |
| |
| /// Lowering using wmma Tensor Core operations. |
| void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager, |
| unsigned pipelineDepth); |
| |
/// Lowering for tensor pack/unpack ops.
void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
| |
/// Simple lowering that only distributes linalg ops onto blocks and threads.
/// This will result in scalar operations.
| void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager); |
| |
| /// Transform dialect-based path. |
| void addGPUTransformDialectPasses(OpPassManager &funcPassManager, |
| StringRef entryPoint); |
| |
| /// Lowering transpose using shared memory. |
| void addGPUTransposePassPipeline(OpPassManager &funcPassManager); |
| |
/// Lowering that calls vectorization patterns.
| void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager); |
| |
| /// Lowering based on vector distribution patterns. |
| void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager); |
| |
| /// Lowering reductions to warp reductions. |
| void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager); |
| |
| /// Default pass pipeline on GPU, currently used only for the ukernel path. |
| void addGPUDefaultPassPipeline(OpPassManager &funcPassManager, |
| bool enableMicrokernels); |
| |
| /// Pass pipeline to lower IREE HAL executables without tiling and distribution. |
| void addGPUBaseLoweringPassPipeline(OpPassManager &pm); |
| |
| /// Populates passes needed to preprocess and select the translation strategy. |
| void buildLLVMGPUCodegenConfigurationPassPipeline( |
    OpPassManager &variantPassManager);
| |
/// Populates passes needed to lower an XLA HLO op to NVVM/ROCDL dialect via
/// the structured ops path. The `variantPassManager` passed in here should
/// operate on the IREE::HAL::ExecutableVariantOp.
void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManager,
| bool useROCM); |
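
// A minimal usage sketch (illustrative assumptions, not guaranteed by this
// header: the pass manager is anchored on the hal.executable.variant op and
// the NVVM path is selected):
//
//   OpPassManager variantPassManager(
//       IREE::HAL::ExecutableVariantOp::getOperationName());
//   buildLLVMGPUCodegenConfigurationPassPipeline(variantPassManager);
//   buildLLVMGPUCodegenPassPipeline(variantPassManager, /*useROCM=*/false);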
| |
/// Performs the final conversion to NVVM+LLVM dialect.
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass(); |
| |
| /// Performs the final conversion to ROCDL+LLVM dialect. |
| std::unique_ptr<OperationPass<ModuleOp>> createConvertToROCDLPass(); |
| |
/// Cast address spaces to generic in CallOp and FuncOp.
| std::unique_ptr<OperationPass<ModuleOp>> |
| createLLVMGPUCastAddressSpaceFunction(); |
| |
| /// Perform type extension/truncation over vector.contract types to target GPU |
| /// MMA intrinsics. |
| std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>> |
| createLLVMGPUCastTypeToFitMMAPass(); |
| |
/// Create pass distributing work onto GPU blocks and threads.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUDistribute();
| |
| /// Create pass selecting the lowering strategy for LLVMGPU. |
| std::unique_ptr<OperationPass<ModuleOp>> |
| createLLVMGPUSelectLoweringStrategyPass(); |
| |
| /// Create pass calling the dynamic pipeline for LLVMGPU. |
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPULowerExecutableTargetPass(); |
| |
/// Pass to pack shared memory allocations in order to reduce shared memory
/// usage.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUPackSharedMemoryAlloc(); |
| |
/// Create pass to prefetch shared memory accesses ahead of their use.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPrefetchSharedMemoryPass();
| |
| enum class GPUTensorCoreType { |
| WMMA = 0, |
| MMA_SYNC = 1, |
| }; |
| |
/// Convert Linalg ops to Vector and prepare conversion to GPU MMA ops.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUTensorCoreVectorizationPass( |
| GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA); |
| |
/// Pass to pad out tensors to static dimensions.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUTensorPadPass(); |
| |
| /// Perform tiling and distribution to threads. |
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUTileAndDistribute(bool distributeToWarp = false); |
| |
/// Pass to distribute vectorized functions.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUVectorDistribute(); |
| |
/// Lower vector ops before conversion to LLVM.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createLLVMGPUVectorLoweringPass(); |
| |
| /// Converts vector ops to gpu dialect. |
| std::unique_ptr<InterfacePass<FunctionOpInterface>> createLLVMGPUVectorToGPU( |
| GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA); |
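
// Sketch of targeting the mma.sync path instead of the default WMMA path
// (hypothetical wiring; assumes `funcPassManager` is a caller-owned
// function-level pass manager):
//
//   funcPassManager.addPass(createLLVMGPUTensorCoreVectorizationPass(
//       GPUTensorCoreType::MMA_SYNC));
//   funcPassManager.addPass(
//       createLLVMGPUVectorToGPU(GPUTensorCoreType::MMA_SYNC));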
| |
/// Verify that the lowering configuration, translation info, and workgroup
/// size are compatible with the GPU matmul pass pipelines.
| LogicalResult |
| verifyGPUMatmulPipeline(Operation *op, |
| IREE::Codegen::LoweringConfigAttr loweringConfig, |
| IREE::Codegen::TranslationInfoAttr translationInfo, |
| ArrayRef<int64_t> workgroupSize); |
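
// Illustrative check (hypothetical caller state: `op`, `loweringConfig`,
// `translationInfo`, and `workgroupSize` have already been gathered):
//
//   if (failed(verifyGPUMatmulPipeline(op, loweringConfig, translationInfo,
//                                      workgroupSize))) {
//     return failure();
//   }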
| |
/// Given a chain of matmuls with zero or more operations
/// in between, like
| /// |
| /// d = matmul_transpose_b(a, b) + c |
| /// ... |
| /// e = matmul_transpose_b(d, f) + g |
| /// |
| /// this pattern transforms the above IR to |
| /// |
| /// c.t = transpose c |
| /// d = matmul_transpose_b(b, a) + c.t |
| /// d.t = transpose d |
| /// ... |
| /// g.t = transpose g |
| /// e = matmul_transpose_b(f, d.t) + g.t |
| /// e.t = transpose e |
| /// |
/// On CDNA architectures, where the layouts of the RHS and result
/// are the same and transposed from the LHS layout, this type of
/// transformation can avoid round trips through shared memory and shuffle
/// instructions in operators like Flash Attention.
| std::unique_ptr<InterfacePass<FunctionOpInterface>> |
| createAMDGPUPrepareForChainedMatmulPass(); |
| |
| //----------------------------------------------------------------------------// |
| // Register LLVMGPU Passes |
| //----------------------------------------------------------------------------// |
| |
| void registerCodegenLLVMGPUPasses(); |
| |
| //------------------------------------------------------------------------------ |
| // Test passes |
| //------------------------------------------------------------------------------ |
| |
| std::unique_ptr<OperationPass<ModuleOp>> createTestLLVMGPULegalizePass(); |
| |
| } // namespace mlir::iree_compiler |
| |
| #endif // IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_ |