blob: f99cfec4a8af5dd9bb5211e9b96c4cd997cfc14e [file] [log] [blame]
// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===//
//
// This file includes the LLVMGPU Passes.
//
//===----------------------------------------------------------------------===//
#ifndef IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_
#define IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "mlir/Pass/Pass.h"
namespace mlir::iree_compiler {
//===----------------------------------------------------------------------===//
// Passes
//===----------------------------------------------------------------------===//
/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager);
/// Lowering using mma.sync Tensor Core operations.
void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth);
/// Lowering using wmma Tensor Core operations.
void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth);
void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
/// Simple lowering only distributute linalg ops on blocks and threads. This
/// will result in scalar operations. Expects pass manager to be a
/// module-level pass manager.
void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager);
/// Transform dialect-based path.
void addGPUTransformDialectPasses(OpPassManager &funcPassManager,
StringRef entryPoint);
/// Lowering transpose using shared memory.
void addGPUTransposePassPipeline(OpPassManager &funcPassManager);
/// Lowering calling vectorization patterns. Expects pass manager to be a
/// module-level pass manager.
void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager);
/// Lowering based on vector distribution patterns.
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager);
/// Lowering reductions to warp reductions.
void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager);
/// Default pass pipeline on GPU, currently used only for the ukernel path.
void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
bool enableMicrokernels);
/// Pass pipeline to lower IREE HAL executables without tiling and distribution.
void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
/// Populates passes needed to preprocess and select the translation strategy.
void buildLLVMGPUCodegenConfigurationPassPipeline(
OpPassManager &variantPassManagery);
/// Populates passes needed to lower a XLA HLO op to NVVM/ROCDL dialect via
/// the structured ops path. The pass manager `pm` in here should operate on
/// the module within the IREE::HAL::ExecutableOp.
void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManagery,
bool useROCM);
/// Performs the final conversion to NNVM+LLVM dialect.
std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass();
/// Performs the final conversion to ROCDL+LLVM dialect.
std::unique_ptr<OperationPass<ModuleOp>> createConvertToROCDLPass();
/// Cast address space to generic in CallOp and FuncOp
std::unique_ptr<OperationPass<ModuleOp>>
createLLVMGPUCastAddressSpaceFunction();
/// Perform type extension/truncation over vector.contract types to target GPU
/// MMA intrinsics.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUCastTypeToFitMMAPass();
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUDistribute();
/// Create pass selecting the lowering strategy for LLVMGPU.
std::unique_ptr<OperationPass<ModuleOp>>
createLLVMGPUSelectLoweringStrategyPass();
/// Create pass calling the dynamic pipeline for LLVMGPU.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPULowerExecutableTargetPass();
// Pass to pack shared memory allocations in order to reduce shared memory
// usage.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUPackSharedMemoryAlloc();
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPrefetchSharedMemoryPass();
enum class GPUTensorCoreType {
WMMA = 0,
MMA_SYNC = 1,
};
/// Convert Linalg ops to Vector and prepare converstion to GPU MMA ops.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTensorCoreVectorizationPass(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
//. Pass to pad out tensors up to static dimensions.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTensorPadPass();
/// Perform tiling and distribution to threads.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTileAndDistribute(bool distributeToWarp = false);
// Pass to distribute vectorized functions.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUVectorDistribute();
/// Lower vector ops before convertion to LLVM.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUVectorLoweringPass();
/// Converts vector ops to gpu dialect.
std::unique_ptr<InterfacePass<FunctionOpInterface>> createLLVMGPUVectorToGPU(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
/// Lowering calling vectorization patterns.
LogicalResult
verifyGPUMatmulPipeline(Operation *op,
IREE::Codegen::LoweringConfigAttr loweringConfig,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize);
/// Given a chain of matmuls with some or no operations
/// in between, like
///
/// d = matmul_transpose_b(a, b) + c
/// ...
/// e = matmul_transpose_b(d, f) + g
///
/// this pattern transforms the above IR to
///
/// c.t = transpose c
/// d = matmul_transpose_b(b, a) + c.t
/// d.t = transpose d
/// ...
/// g.t = transpose g
/// e = matmul_transpose_b(f, d.t) + g.t
/// e.t = transpose e
///
/// On CDNA architectures, where the layouts of the RHS and result
/// are the same and transposed from the LHS layout, this type
/// of transformation can avoid trips to shared memory/shuffle instructions
/// on operators like Flash Attention.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createAMDGPUPrepareForChainedMatmulPass();
//----------------------------------------------------------------------------//
// Register LLVMGPU Passes
//----------------------------------------------------------------------------//
void registerCodegenLLVMGPUPasses();
//------------------------------------------------------------------------------
// Test passes
//------------------------------------------------------------------------------
std::unique_ptr<OperationPass<ModuleOp>> createTestLLVMGPULegalizePass();
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_