// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===----------------------------------------------------------------------===//
//
// This file declares the LLVMGPU passes and pass pipelines.
//
//===----------------------------------------------------------------------===//
#ifndef IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_
#define IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
#include "mlir/Pass/Pass.h"
namespace mlir::iree_compiler {
using IREE::GPU::GPUPipelineOptions;
//----------------------------------------------------------------------------//
// LLVMGPU backend Pass Pipelines.
//----------------------------------------------------------------------------//
/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options);
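// A minimal usage sketch (illustrative, not from the IREE sources; assumes
// `funcPassManager` is an OpPassManager nested on the function being
// compiled):
//
//   GPUPipelineOptions options;  // default-constructed pipeline options
//   addGPUMatmulSimtPassPipeline(funcPassManager, options);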
/// Lowering using mma.sync Tensor Core operations.
void addGPUMatmulTensorCoreMmaSyncPassPipeline(
OpPassManager &funcPassManager, const GPUPipelineOptions &options,
unsigned pipelineDepth);
/// Lowering using wmma Tensor Core operations.
void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options,
unsigned pipelineDepth);
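// Illustrative sketch (hypothetical values): `pipelineDepth` selects the
// software pipelining depth applied to the main loop, e.g.
//
//   addGPUMatmulTensorCorePassPipeline(funcPassManager, options,
//                                      /*pipelineDepth=*/4);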
void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
/// Simple lowering that only distributes linalg ops to blocks and threads.
/// This will result in scalar operations. Expects the pass manager to be a
/// module-level pass manager.
void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager);
/// Lowering config driven pipeline that uses greedy tile + fuse to distribute
/// to threads.
void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &pipelineOptions);
/// Transform dialect-based path.
void addGPUTransformDialectPasses(OpPassManager &funcPassManager,
StringRef entryPoint);
/// Lowering transpose using shared memory.
void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options);
/// Lowering that calls vectorization patterns. Expects the pass manager to be
/// a module-level pass manager.
void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager);
/// Lowering for winograd transform ops. Follows `VectorizationPassPipeline`
/// with different tiling and distribution passes.
void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager);
/// Lowering based on vector distribution patterns.
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options,
bool usePadToModelSharedMemcpy);
/// Lowering reductions to warp reductions.
void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager);
/// Default pass pipeline on GPU, currently used only for the ukernel path.
void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
const GPUPipelineOptions &options);
/// Pass pipeline to lower IREE HAL executables without tiling and distribution.
void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
/// Populates passes needed to preprocess and select the translation strategy.
void buildLLVMGPUCodegenConfigurationPassPipeline(
OpPassManager &variantPassManager);
/// Populates passes needed to lower an XLA HLO op to the NVVM/ROCDL dialect
/// via the structured ops path. The pass manager `pm` in here should operate
/// on the module within the IREE::HAL::ExecutableOp.
void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManager,
bool useROCM);
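// A minimal sketch of driving both phases in order (illustrative; assumes
// `variantPassManager` operates on a hal.executable.variant op):
//
//   buildLLVMGPUCodegenConfigurationPassPipeline(variantPassManager);
//   buildLLVMGPUCodegenPassPipeline(variantPassManager, /*useROCM=*/false);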
/// Verifies that the lowering config, translation info, and workgroup size
/// form a valid configuration for the GPU matmul pipelines.
LogicalResult
verifyGPUMatmulPipeline(Operation *op,
IREE::Codegen::LoweringConfigAttr loweringConfig,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize);
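// Typical use follows the standard LogicalResult idiom (illustrative):
//
//   if (failed(verifyGPUMatmulPipeline(op, loweringConfig, translationInfo,
//                                      workgroupSize)))
//     return failure();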
//----------------------------------------------------------------------------//
// Wrappers that do not use tablegen options.
//----------------------------------------------------------------------------//
enum class LLVMGPUMatmulPadOption { ParallelDims, ReductionDims };
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPromoteMatmulToFitMMAPass(LLVMGPUMatmulPadOption option);
enum class GPUTensorCoreType {
WMMA = 0,
MMA_SYNC = 1,
};
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType);
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUVectorToGPUPass(GPUTensorCoreType tensorCoreType);
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTileAndDistributePass(bool distributeToWarp);
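// Illustrative sketch (assumes a function-level pass manager): the enum
// arguments stand in for the tablegen-generated pass options, e.g.
//
//   funcPassManager.addPass(createLLVMGPUPromoteMatmulToFitMMAPass(
//       LLVMGPUMatmulPadOption::ParallelDims));
//   funcPassManager.addPass(
//       createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType::WMMA));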
//----------------------------------------------------------------------------//
// Register LLVMGPU Passes
//----------------------------------------------------------------------------//
#define GEN_PASS_DECL
#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc" // IWYU pragma: keep
void registerCodegenLLVMGPUPasses();
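// Typically invoked once during tool initialization (illustrative) so that
// textual pipelines, e.g. `--pass-pipeline=...`, can refer to these passes
// by name:
//
//   registerCodegenLLVMGPUPasses();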
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_CODEGEN_LLVMGPU_PASSES_H_