// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- Passes.cpp - Pipelines from Linalg ops to SPIR-V -------------------===//
//
// This file contains various pipelines to lower IREE HAL executables containing
// Linalg ops to SPIR-V.
//
//===----------------------------------------------------------------------===//
#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Codegen/SPIRV/Passes.h"
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "llvm/Support/Debug.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h"
#include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h"
#include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.h"
#include "mlir/Conversion/TosaToArith/TosaToArith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/Passes.h"
#define DEBUG_TYPE "iree-spirv-lowering-pass-pipeline"
namespace mlir::iree_compiler {
static llvm::cl::opt<int> clSPIRVIndexingBits(
"iree-spirv-index-bits",
llvm::cl::desc("Set the bit width of indices in SPIR-V."),
llvm::cl::init(32));
//===----------------------------------------------------------------------===//
// Bufferization Configuration
//===----------------------------------------------------------------------===//
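/// Allocation callback for bufferization that creates a memref.alloc in the
/// GPU workgroup (shared) memory address space.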
static FailureOr<Value> gpuAllocateWorkgroupMemoryFn(OpBuilder &builder,
Location loc,
MemRefType memRefType,
ValueRange dynamicSizes,
unsigned alignment) {
auto workgroupSpace = gpu::AddressSpaceAttr::get(
builder.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
MemRefType allocType =
MemRefType::get(memRefType.getShape(), memRefType.getElementType(),
AffineMap(), workgroupSpace);
auto allocOp = builder.create<memref::AllocOp>(
loc, allocType, dynamicSizes, builder.getI64IntegerAttr(alignment));
return allocOp.getResult();
}
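/// Allocation callback for bufferization that creates a memref.alloca mapped
/// to the SPIR-V/Vulkan Function (private) storage class.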
static FailureOr<Value> gpuAllocateFunctionMemoryFn(OpBuilder &builder,
Location loc,
MemRefType memRefType,
ValueRange dynamicSizes,
unsigned alignment) {
std::optional<unsigned> space =
spirv::mapVulkanStorageClassToMemorySpace(spirv::StorageClass::Function);
MemRefType allocType = MemRefType::get(
memRefType.getShape(), memRefType.getElementType(), {}, *space);
auto allocaOp = builder.create<memref::AllocaOp>(
loc, allocType, dynamicSizes, builder.getI64IntegerAttr(alignment));
return allocaOp.getResult();
}
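/// Memory copy callback for bufferization. When either side lives in
/// workgroup (shared) memory, wraps the copy with gpu.barrier ops and tags it
/// with the copy-to-workgroup-memory marker.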
static LogicalResult gpuCopyFn(OpBuilder &builder, Location loc, Value from,
Value to) {
auto fromType = llvm::cast<MemRefType>(from.getType());
auto toType = llvm::cast<MemRefType>(to.getType());
bool needsBarrier = hasSharedMemoryAddressSpace(fromType) ||
hasSharedMemoryAddressSpace(toType);
if (needsBarrier)
builder.create<gpu::BarrierOp>(loc);
Operation *copy = builder.create<memref::CopyOp>(loc, from, to);
if (needsBarrier) {
setMarker(copy, getCopyToWorkgroupMemoryMarker());
builder.create<gpu::BarrierOp>(loc);
}
return success();
}
//===----------------------------------------------------------------------===//
// Common Pass Recipes
//===----------------------------------------------------------------------===//
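/// Adds passes to tile ops and distribute them to workgroups, and to convert
/// the result to destination-passing style, followed by cleanups.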
static void addTileAndDistributeToWorkgroupsPasses(
OpPassManager &passManager, bool useFuseTensorPadWithConsumerPass = false,
bool useWARForCooperativeMatrixCodegen = false) {
passManager.addPass(createTileAndDistributeToWorkgroupsPass(
kNumMaxParallelDims,
linalg::DistributionMethod::CyclicNumProcsEqNumIters));
auto &nestedModulePM = passManager.nest<ModuleOp>();
if (useFuseTensorPadWithConsumerPass) {
nestedModulePM.addNestedPass<func::FuncOp>(
createFuseTensorPadWithConsumerPass());
}
nestedModulePM.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass(
useWARForCooperativeMatrixCodegen));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
/// Adds passes to lower vector ops to meet SPIR-V requirements.
static void addSPIRVVectorLoweringPasses(OpPassManager &modulePM) {
modulePM.addNestedPass<func::FuncOp>(createSPIRVInitialVectorLoweringPass());
modulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());
modulePM.addNestedPass<func::FuncOp>(createSPIRVFinalVectorLoweringPass());
}
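/// Adds IREE comprehensive bufferization passes using the given allocation
/// callback and the GPU copy callback defined above.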
static void addBufferizePasses(OpPassManager &passManager,
BufferizationOptions::AllocationFn fn) {
BufferizationOptions::AllocationFn allocationFn = fn;
BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn;
addIREEComprehensiveBufferizePasses(passManager, allocationFn, memcpyFn);
}
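/// Adds bufferization passes followed by SPIR-V distribution to invocations
/// and buffer-level cleanups.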
static void
addSPIRVBufferizePasses(OpPassManager &passManager,
BufferizationOptions::AllocationFn allocationFn) {
// Resolve dim ops first so that compute Linalg ops don't linger around merely
// because of dim op usage. This avoids bufferizing those compute ops just for
// their shape dimensions.
passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
addBufferizePasses(passManager, allocationFn);
// Distribute immediately after bufferization to avoid losing attribute
// annotations in subsequent transformations. This is a bit fragile right now,
// but we expect upstream for loops to eventually recognize distribution as a
// first-class attribute, at which point this won't be needed.
passManager.addNestedPass<func::FuncOp>(createSPIRVDistributePass());
passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<func::FuncOp>(createCSEPass());
passManager.addNestedPass<func::FuncOp>(createCleanupBufferAllocViewPass());
}
/// Adds passes to materialize structured ops as loops. This replaces structured
/// ops with loop nests containing payloads, so it should be invoked after
/// tiling and vectorization and before buffer transformations.
static void addLoopMaterializationPasses(OpPassManager &pm) {
pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
pm.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
pm.addNestedPass<func::FuncOp>(createRemoveSingleIterationLoopPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
}
/// Adds passes to lower MemRefs. This folds MemRef subviews, flattens n-D
/// MemRefs into 1-D ones, vectorizes load/store ops when possible, and performs
/// cross-loop-nest optimizations. This should be invoked after structured op
/// lowering and before the final SPIR-V conversion.
static void addMemRefLoweringPasses(OpPassManager &pm) {
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addNestedPass<func::FuncOp>(createConvertComplexToStandardPass());
// Math dialect elementary functions -> polynomial form.
pm.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
pm.addNestedPass<func::FuncOp>(createPadDynamicAlloc());
// Check to make sure we are not exceeding the shared memory usage limit.
auto getSharedMemoryLimit = [](func::FuncOp func) {
auto moduleOp = func->getParentOfType<ModuleOp>();
spirv::TargetEnvAttr target = getSPIRVTargetEnvAttr(moduleOp);
return target.getResourceLimits().getMaxComputeSharedMemorySize();
};
// TODO: query this from the target.
auto getIndexBitwidth = [](func::FuncOp) { return 32; };
pm.addPass(
createGPUCheckResourceUsagePass(getSharedMemoryLimit, getIndexBitwidth));
// Fold loads/stores from/to subview ops into the original memref when
// possible. In SPIR-V we don't use memref descriptors, so subview ops cannot
// be handled directly.
pm.addPass(memref::createFoldMemRefAliasOpsPass());
pm.addPass(createEmulateNarrowTypePass());
pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
// Turn scalar loads/stores from memrefs into vectorized ones where possible.
// This gives better memory access patterns, which is very important for
// performance.
pm.addPass(createSPIRVVectorizeLoadStore());
// Perform optimizations that need to look across the scf.for region boundary.
pm.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
pm.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
pm.addNestedPass<func::FuncOp>(createSPIRVBreakDownLargeVectorPass());
// Perform optimizations that need to look across the scf.for region boundary.
pm.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
// Turn multi-dimensional memrefs into one-dimensional ones. This is needed for
// SPIR-V because we don't use upstream memref descriptors.
pm.addPass(createFlattenMemRefSubspanPass());
pm.addNestedPass<func::FuncOp>(
createSPIRVEraseStorageBufferStaticShapePass());
}
/// Adds passes to perform the final SPIR-V conversion.
static void addSPIRVLoweringPasses(OpPassManager &pm, bool enableFastMath) {
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addPass(createLowerAffinePass());
// Lower tosa.apply_scale before the i64 emulation pass so that newly created
// 64-bit ops are also emulated if not supported by the target.
pm.addPass(tosa::createTosaToArith(/*includeApplyRescale=*/true,
/*use32BitApplyRescale=*/true));
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addNestedPass<func::FuncOp>(createSPIRVMapMemRefStorageClassPass());
pm.addPass(createSPIRVEmulateI64Pass());
pm.addPass(createConvertBf16ArithToF32Pass());
pm.addPass(createConvertBf16ToUInt16BuffersPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
pm.addPass(createConvertToSPIRVPass(enableFastMath, clSPIRVIndexingBits));
auto getTargetEnv = [](spirv::ModuleOp moduleOp) {
return getSPIRVTargetEnvAttr(moduleOp);
};
OpPassManager &spirvPM = pm.nest<spirv::ModuleOp>();
spirvPM.addPass(spirv::createUnifyAliasedResourcePass(getTargetEnv));
spirvPM.addPass(spirv::createSPIRVLowerABIAttributesPass());
spirvPM.addPass(createCanonicalizerPass());
spirvPM.addPass(createCSEPass());
spirvPM.addPass(spirv::createSPIRVRewriteInsertsPass());
spirvPM.addPass(spirv::createSPIRVCanonicalizeGLPass());
spirvPM.addPass(spirv::createSPIRVUpdateVCEPass());
}
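/// Adds passes to run the transform dialect interpreter and then drop the
/// applied schedule.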
void addSPIRVTransformDialectPasses(OpPassManager &passManager) {
passManager.addPass(
mlir::iree_compiler::createTransformDialectInterpreterPass());
// Dropping the schedule is needed:
// 1. if we want to embed the transform in the module, the schedule should be
// dropped once applied;
// 2. if transform.do_not_dce_operands ops are introduced.
passManager.addPass(createDropSchedulePass());
}
//===----------------------------------------------------------------------===//
// Pass Pipelines
//===----------------------------------------------------------------------===//
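/// Base lowering pipeline: converts to destination-passing style, bufferizes
/// (allocating in workgroup memory), and materializes structured ops as loops,
/// without tiling or vectorization.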
void addSPIRVBaseLoweringPassPipeline(OpPassManager &pm) {
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createConvertToDestinationPassingStylePass(
/*useWARForCooperativeMatrixCodegen=*/false));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
addLoopMaterializationPasses(nestedModulePM);
}
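/// Base distribution pipeline: tiles and distributes to workgroups and GPU
/// invocations and distributes shared memory copies, without vectorization.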
void addSPIRVBaseDistributePassPipeline(OpPassManager &pm) {
addTileAndDistributeToWorkgroupsPasses(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
// Tile and distribute to GPU invocations.
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVTileAndDistributePass());
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
addLoopMaterializationPasses(nestedModulePM);
}
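/// Vectorization pipeline: tiles to workgroups and GPU invocations, creates
/// fast/slow paths around padding, vectorizes, and then bufferizes and
/// distributes.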
void addSPIRVBaseVectorizePassPipeline(OpPassManager &pm) {
addTileAndDistributeToWorkgroupsPasses(
pm, /*useFuseTensorPadWithConsumerPass=*/true);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
nestedModulePM.addPass(memref::createResolveShapedTypeResultDimsPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Tile to GPU invocations and vectorize.
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVCreateFastSlowPathPass());
nestedModulePM.addNestedPass<func::FuncOp>(createSPIRVTilePass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
{
GenericVectorizationPassOptions options;
options.vectorizeGatherAccesses = true;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
addSPIRVVectorLoweringPasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Bufferize and distribute.
addSPIRVBufferizePasses(nestedModulePM, gpuAllocateFunctionMemoryFn);
// Generate loop nests for all remaining ops and remove trivial loops.
addLoopMaterializationPasses(nestedModulePM);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
}
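/// Winograd variant of the vectorization pipeline: tiles and decomposes
/// Winograd transform ops and annotates their loops before vectorizing,
/// bufferizing, and distributing.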
void addSPIRVWinogradVectorizePassPipeline(OpPassManager &pm) {
addTileAndDistributeToWorkgroupsPasses(
pm, /*useFuseTensorPadWithConsumerPass=*/true);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createTileAndDecomposeWinogradTransformPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createFoldAffineMinInDistributedLoopsPass());
nestedModulePM.addPass(memref::createResolveShapedTypeResultDimsPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Tile to GPU invocations and vectorize.
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVAnnotateWinogradLoopsPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
{
GenericVectorizationPassOptions options;
options.vectorizeGatherAccesses = true;
options.enableCleanup = true;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
addSPIRVVectorLoweringPasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Bufferize and distribute.
addSPIRVBufferizePasses(nestedModulePM, gpuAllocateFunctionMemoryFn);
// Generate loop nests for all remaining ops and remove trivial loops.
addLoopMaterializationPasses(nestedModulePM);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
}
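/// Cooperative matrix pipeline: tiles to workgroups and subgroups, promotes
/// operands to workgroup memory, vectorizes to cooperative matrix / subgroup
/// MMA ops, and optionally software-pipelines the main loop based on
/// pipelineDepth and storeStage.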
void addSPIRVCooperativeMatrixVectorizePassPipeline(OpPassManager &pm,
unsigned pipelineDepth,
unsigned storeStage) {
addTileAndDistributeToWorkgroupsPasses(
pm, /*useFuseTensorPadWithConsumerPass=*/false,
/*useWARForCooperativeMatrixCodegen=*/true);
auto &nestedModulePM = pm.nest<ModuleOp>();
addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
// Tile to GPU workgroups and promote.
nestedModulePM.addNestedPass<func::FuncOp>(createSPIRVTileAndPromotePass(
/*promoteCMatrix=*/true, /*skipThreadLevel=*/true));
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Run canonicalization patterns to propagate constant shape sizes after
// removing trip-one loops.
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Tile and distribute to GPU subgroups.
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVTileToCooperativeOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Multi-buffer shared memory allocations depending on the pipeline depth and
// distribute shared memory copies.
if (pipelineDepth > 0) {
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUMultiBuffering(pipelineDepth + 1));
}
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
// Reduce bank conflicts by padding.
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts(
detail::bankConflictReductionPaddingBits));
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
{
GenericVectorizationPassOptions options;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
// With subview ops, vector hoisting won't kick in. So fold memref subview ops
// before performing vector unrolling and hoisting.
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
// Vectorize to cooperative ops.
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVVectorizeToCooperativeOpsPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createRemoveSingleIterationLoopPass());
// Run canonicalization patterns to propagate constant shape sizes after
// removing trip-one loops.
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createSPIRVVectorToGPUSubgroupMMAOpsPass());
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
addSPIRVVectorLoweringPasses(nestedModulePM);
if (pipelineDepth > 0) {
PipeliningSchedulingStrategy schedule =
storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0
: PipeliningSchedulingStrategy::loadGlobalStage0;
nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
/*epiloguePeeling=*/true, pipelineDepth, schedule));
}
}
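/// Matmul promotion pipeline: promotes operands to workgroup memory, tiles to
/// threads, vectorizes, bufferizes, and software-pipelines the main loop based
/// on pipelineDepth and storeStage.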
void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &topPM,
unsigned pipelineDepth,
unsigned storeStage) {
// Guard against 0 for consistency with older user-provided tuning configs.
pipelineDepth = pipelineDepth ? pipelineDepth : 1;
LLVM_DEBUG(llvm::dbgs() << "Non-zero Pipeline Depth: " << pipelineDepth
<< "\n";);
addTileAndDistributeToWorkgroupsPasses(
topPM, /*useFuseTensorPadWithConsumerPass=*/false,
/*useWARForCooperativeMatrixCodegen=*/true);
// Promote to workgroups and tile to threads.
auto &nestedPM = topPM.nest<ModuleOp>();
nestedPM.addNestedPass<func::FuncOp>(createGPUTensorAlloc());
nestedPM.addNestedPass<func::FuncOp>(
createGPUTensorTile(/*distributeToWarp=*/false));
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
{
GenericVectorizationPassOptions options;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.maxVectorSize = 4096;
nestedPM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
nestedPM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());
nestedPM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedPM.addNestedPass<func::FuncOp>(createCSEPass());
}
// Bufferize.
addBufferizePasses(nestedPM, gpuAllocateWorkgroupMemoryFn);
// Distribute scf.forall to GPU threads.
nestedPM.addNestedPass<func::FuncOp>(createGPUDistribute());
if (pipelineDepth > 1 || storeStage == 0) {
nestedPM.addNestedPass<func::FuncOp>(createGPUMultiBuffering(
storeStage == 0 ? pipelineDepth + 1 : pipelineDepth));
}
nestedPM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedPM.addNestedPass<func::FuncOp>(createGPUDistributeSharedMemoryCopy());
nestedPM.addPass(createCanonicalizerPass());
nestedPM.addPass(createCSEPass());
nestedPM.addNestedPass<func::FuncOp>(createGPUReduceSharedMemoryBankConflicts(
detail::bankConflictReductionPaddingBits));
// With subview ops, vector hoisting won't kick in. So fold memref subview ops
// before performing vector unrolling and hoisting.
nestedPM.addNestedPass<func::FuncOp>(memref::createFoldMemRefAliasOpsPass());
nestedPM.addNestedPass<func::FuncOp>(createSPIRVInitialVectorLoweringPass());
nestedPM.addPass(createCSEPass());
nestedPM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());
nestedPM.addNestedPass<func::FuncOp>(createSPIRVFinalVectorLoweringPass());
nestedPM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedPM.addPass(createCanonicalizerPass());
nestedPM.addPass(createCSEPass());
nestedPM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
// Hoist loop invariant code to avoid pipelining it.
nestedPM.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
PipeliningSchedulingStrategy schedule =
storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0
: PipeliningSchedulingStrategy::loadGlobalStage0;
nestedPM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
/*epiloguePeeling=*/true, pipelineDepth, schedule));
addLoopMaterializationPasses(nestedPM);
}
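/// Subgroup reduction pipeline: fuses parallel producers into reductions,
/// tiles reductions, vectorizes with masking, bufferizes at the workgroup
/// level, and lowers vector reductions to GPU subgroup operations.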
void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm) {
addTileAndDistributeToWorkgroupsPasses(
pm, /*useFuseTensorPadWithConsumerPass=*/true);
auto &nestedModulePM = pm.nest<ModuleOp>();
// Fuse input parallel ops into the reduction op so that we don't need to
// create temporary allocations during bufferization.
nestedModulePM.addNestedPass<func::FuncOp>(
createRematerializeParallelOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createGPUTileReductionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
{
GenericVectorizationPassOptions options;
options.enableVectorMasking = true;
options.useConfiguredVectorSizes = false;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.generateContract = false;
options.maxVectorSize = 32768;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
}
nestedModulePM.addNestedPass<func::FuncOp>(
createLoopInvariantCodeMotionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
// Bufferize and distribute.
// We bufferize before distributing to threads here, so we are still at the
// workgroup (block) level and therefore need to allocate workgroup memory.
addSPIRVBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
// Simplify the IR for vector distribution.
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createLoopInvariantCodeMotionPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
auto getWarpSize = [](func::FuncOp func) {
auto moduleOp = func->getParentOfType<ModuleOp>();
spirv::TargetEnvAttr target = getSPIRVTargetEnvAttr(moduleOp);
return target.getResourceLimits().getSubgroupSize();
};
// Handle vector reduction operations specifically.
nestedModulePM.addNestedPass<func::FuncOp>(
createConvertVectorReductionToGPUPass(/*expandSubgroupReduction=*/false,
getWarpSize));
// Perform normal vector unrolling and lowering transformations. This breaks
// vectors down to native machine size.
addSPIRVVectorLoweringPasses(nestedModulePM);
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
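/// Transform dialect pipeline: applies a transform dialect schedule and then
/// runs generic vectorization and SPIR-V vector lowering.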
void addSPIRVTransformDialectPassPipeline(OpPassManager &pm) {
addSPIRVTransformDialectPasses(pm);
// Additionally run the GenericVectorization pass to convert vectors into the
// forms needed for SPIR-V.
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(createGenericVectorizationPass());
addSPIRVVectorLoweringPasses(nestedModulePM);
}
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
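/// Configuration pipeline: runs common target preprocessing, generalizes named
/// Linalg ops, and selects a SPIR-V lowering strategy.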
void buildSPIRVCodegenConfigurationPassPipeline(OpPassManager &pm) {
addCommonTargetExecutablePreprocessingPasses(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(createGPUGeneralizeNamedOpsPass());
pm.addPass(createSPIRVSelectLoweringStrategyPass());
}
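/// Full codegen pipeline: lowers the executable target per the selected
/// strategy, then lowers MemRefs and converts to SPIR-V.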
void buildSPIRVCodegenPassPipeline(OpPassManager &pm, bool enableFastMath) {
pm.addPass(createSPIRVLowerExecutableTargetPass());
addMemRefLoweringPasses(pm.nest<ModuleOp>());
addSPIRVLoweringPasses(pm.nest<ModuleOp>(), enableFastMath);
LLVM_DEBUG({
llvm::dbgs() << "Using SPIR-V pass pipeline:\n";
pm.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
// NOTE: this runs on the top-level program module containing all hal.executable
// ops.
void buildSPIRVLinkingPassPipeline(OpPassManager &passManager) {
auto &nestedExecutablePM = passManager.nest<IREE::HAL::ExecutableOp>();
// Trim the allowed target environment (version/capability/extension/etc.) to
// the minimal requirement needed by compiled spirv.module ops. This helps to
// increase the chance of linking different variant ops together.
nestedExecutablePM.addNestedPass<IREE::HAL::ExecutableVariantOp>(
createSPIRVTrimExecutableTargetEnvPass());
// Materialize the minimal required target environment into proper device
// queries to be executed at runtime.
nestedExecutablePM.addNestedPass<IREE::HAL::ExecutableVariantOp>(
createSPIRVMaterializeExecutableConditionsPass());
// Link together executables. This may produce some IR duplication.
passManager.addPass(createSPIRVLinkExecutablesPass());
// Cleanup IR duplication.
passManager.addNestedPass<IREE::HAL::ExecutableOp>(
mlir::createCanonicalizerPass());
}
//===---------------------------------------------------------------------===//
// Register SPIR-V Passes
//===---------------------------------------------------------------------===//
namespace {
#define GEN_PASS_REGISTRATION
#include "iree/compiler/Codegen/SPIRV/Passes.h.inc"
} // namespace
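/// Registers the generated SPIR-V passes plus the two pass pipelines below.
/// Once registered, the pipelines can be invoked by name from mlir-opt-style
/// tools, e.g. (hypothetical invocation; the input file name is illustrative):
///   iree-opt --iree-codegen-linalg-to-spirv-pipeline dispatch.mlir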
void registerCodegenSPIRVPasses() {
// Generated.
registerPasses();
static PassPipelineRegistration<> SPIRVConfigPipeline(
"iree-codegen-spirv-configuration-pipeline",
"Runs the pipeline for configuring the lowering from linalg to SPIR-V",
[](OpPassManager &passManager) {
buildSPIRVCodegenConfigurationPassPipeline(passManager);
});
static PassPipelineRegistration<> LinalgSPIRVPipeline(
"iree-codegen-linalg-to-spirv-pipeline",
"Runs the progressive lowering pipeline from linalg to SPIR-V",
[](OpPassManager &passManager) {
buildSPIRVCodegenPassPipeline(passManager, /*enableFastMath=*/false);
});
}
} // namespace mlir::iree_compiler