blob: 53940bc45b232ff210f8969214adc05b45201dbc [file] [log] [blame] [edit]
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Dialect/Stream/Transforms/Passes.h"
#include <memory>
#include "iree/compiler/Dialect/Util/Transforms/Passes.h"
#include "iree/compiler/Utils/PassUtils.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassOptions.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/Passes.h"
// Debugging flag (off by default). When set, buildStreamTensorPassPipeline
// adds the AnnotateAffinities pass ahead of the conversion into the stream
// dialect.
static llvm::cl::opt<bool> clAnnotateInputAffinities(
"iree-stream-annotate-input-affinities",
llvm::cl::desc("Annotates all tensor/resource affinities on the input to "
"the pipeline for debugging."),
llvm::cl::init(false));
namespace mlir::iree_compiler::IREE::Stream {
// Shorthand for a MultiOpNest over func::FuncOp, IREE::Util::InitializerOp and
// IREE::Util::FuncOp. The pipeline builders below construct one from a pass
// manager and chain addPass() calls on it; presumably each pass is then nested
// on all three op types - see the MultiOpNest definition to confirm.
using FunctionLikeNest =
MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
//===----------------------------------------------------------------------===//
// --iree-stream-cleanup-pipeline
//===----------------------------------------------------------------------===//
// Appends the cleanup sequence that is re-run between the stages of the stream
// pipeline to |passManager|. |transformOptions| is accepted so the signature
// lines up with the other pipeline builders; nothing in the body reads it.
static void buildStreamCleanupPassPipeline(
    OpPassManager &passManager,
    const IREE::Stream::TransformOptions &transformOptions) {
  FunctionLikeNest functionNest(passManager);

  // Baseline MLIR cleanup.
  functionNest.addPass(mlir::createCanonicalizerPass);
  functionNest.addPass(mlir::createCSEPass);

  // Integer optimizations. They work best on canonical IR, both for
  // performance (less to analyze once simplifications have run) and because
  // the pattern matching stays simpler.
  functionNest.addPass(IREE::Util::createOptimizeIntArithmeticPass);

  // Simplifying util.global accesses removes redundant store-load pairs, which
  // can help data flow tracking.
  functionNest.addPass(IREE::Util::createSimplifyGlobalAccessesPass);

  // Aggressive cleanup.
  functionNest.addPass(IREE::Util::createApplyPatternsPass);

  // Cleanup and canonicalization of util.global (and other util ops). These
  // are added to |passManager| directly rather than to the nest above.
  passManager.addPass(IREE::Util::createFoldGlobalsPass());
  passManager.addPass(IREE::Util::createFuseGlobalsPass());

  // Large IPO pass. It can introduce a significant amount of duplication and
  // inlined constants, so cleanup needs to run again afterwards; this entire
  // set of patterns is run in a fixed-point iteration to get that.
  passManager.addPass(IREE::Util::createIPOPass());
}
//===----------------------------------------------------------------------===//
// --iree-stream-tensor-transformation-pipeline
//===----------------------------------------------------------------------===//
// Populates |passManager| with the tensor-level stage of the stream pipeline:
// input verification and cleanup, conversion into the stream dialect, and
// post-conversion cleanup. The order in which passes are added is significant
// (see the notes inline). |transformOptions| is only forwarded to the cleanup
// pipeline builder.
void buildStreamTensorPassPipeline(OpPassManager &passManager,
const TransformOptions &transformOptions) {
//----------------------------------------------------------------------------
// Input cleanup and simplification
//----------------------------------------------------------------------------
// Verify we support the program.
passManager.addPass(IREE::Stream::createVerifyInputPass());
// Cleanup the program prior to outlining constants in case there is
// propagation or fusion that needs to happen first.
buildStreamCleanupPassPipeline(passManager, transformOptions);
//----------------------------------------------------------------------------
// Conversion
//----------------------------------------------------------------------------
// TODO(benvanik): cache the affinity analysis - if this pass does nothing (or
// once it converges) the analysis will be usable by AnnotateAffinities and
// ConvertToStream.
//
// Clone operations to consumers when the operations opt-in to such behavior.
//
// NOTE: CSE must not be run between this and ConvertToStream - doing so will
// undo the clones.
passManager.addPass(IREE::Stream::createCloneToConsumersPass());
// Annotate all ops/resources with the analyzed affinities.
// This should have no behavioral changes during conversion but allows for
// debugging of analysis errors in end-user tooling.
// Only added when the iree-stream-annotate-input-affinities flag is set (it
// defaults to false).
if (clAnnotateInputAffinities) {
passManager.addPass(IREE::Stream::createAnnotateAffinitiesPass());
}
// Converts from all input dialects into various levels of the stream dialect.
// Tensor-like things go to stream.tensor.* ops while lower level buffer-like
// things will go to stream.async.* ops.
passManager.addPass(IREE::Stream::createConvertToStreamPass());
// No more tensor.*/etc ops are allowed. This is conservative - there may be
// a lot of ops we convert but this will catch the majority of stragglers.
passManager.addPass(IREE::Stream::createVerifyLoweringToTensorsPass());
//----------------------------------------------------------------------------
// Constant/variable optimization
//----------------------------------------------------------------------------
// Run inlining after having baked out affinities.
passManager.addPass(mlir::createInlinerPass());
// Elide any redundant transfers now that affinities are baked out and we know
// where resources are located.
//
// TODO(benvanik): enable this pass after updating usage refinement: today
// the clones are not handled correctly and will result in usage analysis
// failing. This seems to be caused by transfers having some non-trivial logic
// during analysis that clone does not have and just applying the same logic
// to clones results in other errors around lifetime changes. The resource
// analysis and refinement logic likely needs a larger reworking.
//
// passManager.addPass(IREE::Stream::createElideAsyncTransfersPass());
// Cleanup globals that were created during conversion.
buildStreamCleanupPassPipeline(passManager, transformOptions);
// Bring all initializers together so that we can schedule them.
passManager.addPass(IREE::Util::createCombineInitializersPass());
//----------------------------------------------------------------------------
// Stream affinity/assignment
//----------------------------------------------------------------------------
// No passes are added in this section yet: everything below is either a TODO
// or a verifier that is intentionally left disabled.
//
// TODO(benvanik): pin based on target backends here.
// TODO(benvanik): compute affinities for executables.
// TODO(benvanik): annotate all dispatches with preferred executable affinity.
// TODO(benvanik): DFA to specify all value affinities and pin dispatches.
// TODO(multi-device): it's really nice to be able to verify here but it
// prevents compiling to stream without devices specified or continuation at
// various phases. It'd be nice to find a way to enable this when the user
// expects it to work and otherwise not.
//
// Verify that all ops that may require affinities have them assigned or
// available (on a parent scope, etc). This allows subsequent passes to trust
// that an affinity lookup will always return a valid affinity.
// passManager.addPass(IREE::Stream::createVerifyAffinitiesPass());
}
//===----------------------------------------------------------------------===//
// --iree-stream-async-transformation-pipeline
//===----------------------------------------------------------------------===//
// Populates |passManager| with the async stage of the stream pipeline. Passes
// are added in two groups: lowering of stream.tensor.* ops plus resource
// management (copy-on-write, copy elision, usage refinement), then formation
// and scheduling of execution regions, ending with a cleanup and a verifier.
// |transformOptions| decides whether the SyncInitializers pass is added and is
// forwarded to the cleanup pipeline builder.
void buildStreamAsyncPassPipeline(OpPassManager &passManager,
                                  const TransformOptions &transformOptions) {
  //----------------------------------------------------------------------------
  // Tensor lowering and resource management
  //----------------------------------------------------------------------------

  // Encodings get specialized before the stream tensor ops are lowered.
  passManager.addPass(IREE::Stream::createSpecializeEncodingsPass());

  // Lower stream.tensor.* ops to stream.async.* ops based on the
  // affinity/configuration assigned during placement.
  FunctionLikeNest hostEncodingNest(passManager);
  hostEncodingNest.addPass(IREE::Stream::createEncodeHostTensorsPass);
  passManager.addNestedPass<IREE::Stream::ExecutableOp>(
      IREE::Stream::createEncodeDeviceTensorsPass());
  passManager.addPass(IREE::Stream::createMaterializeEncodingsPass());

  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // At this point everything must be in stream.async.* form, though lifetimes
  // have not been assigned yet.
  passManager.addPass(
      IREE::Stream::createVerifyLoweringToAsyncResourcesPass());

  // Make copy-on-write behavior explicit as stream.async.* ops. Doing so
  // inserts a lot of copies, so a separate pass then elides the ones that are
  // not needed; two passes are easier to verify than one attempting to do
  // both. Copy-on-write materialization is required for correct execution
  // while copy elision is for performance only (though it's critical enough
  // that it is not optional).
  FunctionLikeNest copyOnWriteNest(passManager);
  copyOnWriteNest.addPass(IREE::Stream::createMaterializeCopyOnWritePass);
  copyOnWriteNest.addPass(mlir::createCanonicalizerPass);
  passManager.addPass(IREE::Stream::createElideAsyncCopiesPass());
  FunctionLikeNest emplaceNest(passManager);
  emplaceNest.addPass(mlir::createCanonicalizerPass);
  emplaceNest.addPass(IREE::Stream::createEmplaceAllocationsPass);

  // Refine lifetime of all resources across the module.
  // We do this after scheduling execution so that we know how the resources
  // move across devices. We do it before scheduling waves as lifetime doesn't
  // change and it makes the IR cleaner.
  // NOTE(review): the ScheduleExecution pass is only added further down in
  // this function, so the "after scheduling execution" wording above looks
  // stale - confirm the intended ordering.
  passManager.addPass(IREE::Stream::createRefineUsagePass());

  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // Verify every stream.async.* op access range that can be checked using
  // statically available information or what data flow analysis can infer.
  // This may need a global analysis, so it lives in a pass instead of the
  // individual op verifiers. The pass could be run more often above, or the
  // simpler checks moved into op verifiers, to catch errors earlier, but it
  // is mostly a guard before going into the stream.cmd.* layer.
  passManager.addPass(IREE::Stream::createVerifyAsyncAccessRangesPass());

  //----------------------------------------------------------------------------
  // Stream formation and scheduling
  //----------------------------------------------------------------------------

  FunctionLikeNest schedulingNest(passManager);
  // Combine async work into execution regions.
  schedulingNest.addPass(IREE::Stream::createScheduleExecutionPass);
  // Group concurrently executable work into waves.
  schedulingNest.addPass(IREE::Stream::createScheduleConcurrencyPass);

  // With synchronous initialization requested, any work behind a timepoint in
  // the initializer has to be separated from the consumers of that timepoint.
  const bool wantsSynchronousInit =
      transformOptions.initializationMode ==
      IREE::Stream::InitializationMode::Synchronous;
  if (wantsSynchronousInit) {
    passManager.addPass(IREE::Stream::createSyncInitializersPass());
  }

  // Materialize timepoints across the entire module. This simplifies
  // scheduling of the timeline as we can shake the IR and see which timepoints
  // are still left.
  passManager.addPass(IREE::Stream::createPropagateTimepointsPass());

  // Expand builtins to dispatches; this may introduce new executables. It
  // happens after scheduling so that the semantics of the ops are preserved
  // for partitioning/placement before they turn into opaque dispatches.
  passManager.addPass(IREE::Stream::createMaterializeBuiltinsPass());

  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // Everything must now be in stream.async.* form.
  passManager.addPass(IREE::Stream::createVerifyLoweringToAsyncPass());
}
//===----------------------------------------------------------------------===//
// --iree-stream-cmd-transformation-pipeline
//===----------------------------------------------------------------------===//
// Populates |passManager| with the cmd stage of the stream pipeline:
// allocation scheduling, constant packing and slice layout, subrange
// propagation, cleanup, and a final check that the IR is in stream.cmd.* form.
// |transformOptions| is only forwarded to the cleanup pipeline builder.
void buildStreamCmdPassPipeline(OpPassManager &passManager,
                                const TransformOptions &transformOptions) {
  // Schedule fine-grained allocations and insert placeholders for
  // larger/longer lifetime allocations.
  passManager.addPass(IREE::Stream::createScheduleAllocationPass());

  FunctionLikeNest packingNest(passManager);

  // TODO(benvanik): passes to convert alloc to alloca and thread through
  // streams. Ideally all transient allocs become stream-ordered allocas.
  // createPropagateTransientsPass()

  // Allocate backing storage for fused constant resources. Packed constants
  // are expanded into explicit forms with partitioned storage buffers and
  // upload logic.
  packingNest.addPass(IREE::Stream::createPackConstantsPass);

  // Lay out packed slices to emit the arithmetic required for all resource
  // offsets, which is what lets the subviews be propagated across the program
  // below.
  packingNest.addPass(IREE::Stream::createLayoutSlicesPass);

  // Canonicalize to clean up subview ops before subranges are propagated.
  packingNest.addPass(mlir::createCanonicalizerPass);

  // Propagate subviews throughout the program to unify resource storage
  // access. Afterwards many resource SSA values can be deduped or folded by
  // the cleanup patterns.
  passManager.addPass(IREE::Util::createPropagateSubrangesPass());

  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // TODO(benvanik): outline streams (ala dispatch regions). Note that we may
  // want to do this earlier to enable better deduplication but that makes the
  // above passes trickier. Outlining may be more like "find chunks of streams
  // useful to move into secondary command buffers."

  // Everything must now be in explicit stream.cmd.* form.
  passManager.addPass(IREE::Stream::createVerifyLoweringToCmdPass());
}
//===----------------------------------------------------------------------===//
// --iree-stream-optimization-pipeline
//===----------------------------------------------------------------------===//
// Populates |passManager| with the stream optimization stage: cleanup, removal
// of leftover scf ops, a fixed-point iteration of cleanup + timepoint elision,
// optional dispatch binding fusion, and dispatch operand
// annotation/packing/folding. |transformOptions.optimizeBindings| gates the
// binding fusion; |transformOptions| is otherwise forwarded to the cleanup
// pipeline builder.
void buildStreamOptimizationPassPipeline(
    OpPassManager &passManager, const TransformOptions &transformOptions) {
  // Forming streams involves a fair amount of subgraph stitching, which can
  // cause duplication. Run the cleanup pipeline (it includes CSE) to collapse
  // it.
  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // Get rid of any scf ops that crept in. They should be supportable all the
  // way through the stream dialect but some passes are not currently set up
  // to handle them (such as elide timepoints).
  FunctionLikeNest controlFlowNest(passManager);
  controlFlowNest.addPass(mlir::createSCFToControlFlowPass);

  //----------------------------------------------------------------------------
  // Whole-program scheduling optimization
  //----------------------------------------------------------------------------

  {
    // These run under a fixed-point iteration so that inter-procedural,
    // intra-procedural, and canonicalization work can stay separably
    // verifiable/reusable passes alongside the custom stream ones. IPO will
    // fold duplicate arguments/results and inline constants to allow the
    // local optimizations to work more effectively.
    OpPassManager iterationPipeline(mlir::ModuleOp::getOperationName());

    // IPO and other cleanups.
    buildStreamCleanupPassPipeline(iterationPipeline, transformOptions);

    // TODO(#9747): elide timepoints that are known-reached due to host
    // synchronization via stream.timepoint.await.

    // Elide timepoints in dependency chains where one is known to have been
    // reached by the time another is (A -> B -> A|C).
    iterationPipeline.addPass(IREE::Stream::createElideTimepointsPass());

    // Hand the nested pipeline over to the fixed-point iterator.
    passManager.addPass(IREE::Util::createFixedPointIteratorPass(
        std::move(iterationPipeline)));
  }

  //----------------------------------------------------------------------------
  // Binding optimization
  //----------------------------------------------------------------------------

  if (transformOptions.optimizeBindings) {
    // Fuse bindings together and add operands for their subview ranges.
    passManager.addPass(IREE::Stream::createFuseDispatchBindingsPass());

    // TODO(benvanik): canonicalize bindings: we should sort the bindings by
    // the block argument order of the parent stream.cmd.execute. This will get
    // us more regular descriptor set layouts. We could also use some other
    // heuristics (all constant bindings -> transients -> external etc) to
    // make partitioning the bindings easier. Note we need to update both the
    // dispatches and the dispatch function argument order.
  }

  // Annotate dispatch region arguments based on the operands passed at
  // dispatch sites, so that codegen can see the potential operand values when
  // it operates locally on executables.
  passManager.addPass(IREE::Stream::createAnnotateDispatchArgumentsPass());
  passManager.addPass(IREE::Stream::createAnnotateDispatchAssumptionsPass());

  // Pack dispatch operands on stream.executable into i32 values. This is done
  // before leaving the pipeline because operands can still be added/removed
  // easily here.
  passManager.addPass(IREE::Stream::createPackDispatchOperandsPass());

  // Folding operands requires that canonicalization/CSE has folded the inputs
  // that get checked.
  buildStreamCleanupPassPipeline(passManager, transformOptions);
  passManager.addPass(IREE::Stream::createFoldUniformOperandsPass());

  // Only want to specialize after we've added all the operands we need above.
  // TODO(benvanik): make codegen more efficient with the specialized
  // constants. The lookup tables inserted are currently extremely slow on
  // some backends.
  // passManager.addPass(IREE::Stream::createSpecializeDispatchesPass());

  // TODO(benvanik): when we spill push constants spill to staging buffers.
  // Need to know push constant limit but that could be specified as a stream
  // option (max operand count).
}
//===----------------------------------------------------------------------===//
// --iree-stream-transformation-pipeline
//===----------------------------------------------------------------------===//
// Populates |passManager| with the full stream pipeline: the tensor, async and
// cmd stages, an optional statistics dump, the optimization stage, and a final
// cleanup followed by symbol DCE. |transformOptions| is forwarded to every
// stage; its dumpStatisticsFormat/dumpStatisticsFile fields control the
// statistics dump.
void buildStreamTransformPassPipeline(
    OpPassManager &passManager, const TransformOptions &transformOptions) {
  //----------------------------------------------------------------------------
  // Primary pipeline stages (required)
  //----------------------------------------------------------------------------

  buildStreamTensorPassPipeline(passManager, transformOptions);
  buildStreamAsyncPassPipeline(passManager, transformOptions);
  buildStreamCmdPassPipeline(passManager, transformOptions);

  // Statistics are dumped before the deeper optimizations happen, because
  // optimizations such as dispatch operand fusion remove information that can
  // be used to determine memory usage by dispatches.
  const bool statisticsRequested =
      transformOptions.dumpStatisticsFormat != DumpOutputFormat::None;
  if (statisticsRequested) {
    DumpStatisticsPassOptions statisticsOptions;
    statisticsOptions.outputFormat = transformOptions.dumpStatisticsFormat;
    statisticsOptions.outputFile = transformOptions.dumpStatisticsFile;
    passManager.addPass(
        IREE::Stream::createDumpStatisticsPass(statisticsOptions));
  }

  //----------------------------------------------------------------------------
  // Optimizations (may be required by some targets)
  //----------------------------------------------------------------------------

  buildStreamOptimizationPassPipeline(passManager, transformOptions);

  //----------------------------------------------------------------------------
  // Post-pipeline cleanup
  //----------------------------------------------------------------------------

  // Final cleanup once dispatches are optimized and operands and bindings are
  // fused.
  buildStreamCleanupPassPipeline(passManager, transformOptions);

  // Symbol DCE any remaining variables/functions that are no longer required.
  passManager.addPass(mlir::createSymbolDCEPass());
}
//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//
// Expands the registration section of the generated pass declarations inside
// an anonymous namespace, so whatever it defines stays local to this file.
// registerStreamPasses() below calls registerPasses(), which is presumably
// provided by this include - confirm against the generated header.
namespace {
#define GEN_PASS_REGISTRATION
#include "iree/compiler/Dialect/Stream/Transforms/Passes.h.inc" // IWYU pragma: export
} // namespace
void registerStreamPasses() {
// Generated.
registerPasses();
// Pipelines.
PassPipelineRegistration<TransformOptions> cleanupPassPipeline(
"iree-stream-cleanup-pipeline",
"Runs the cleanup passes that are performed between stages of the full "
"stream pipeline.",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamCleanupPassPipeline(passManager, transformOptions);
});
PassPipelineRegistration<TransformOptions> tensorPassPipeline(
"iree-stream-tensor-transformation-pipeline",
"Lowers source dialects into stream.tensor.* IR.",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamTensorPassPipeline(passManager, transformOptions);
});
PassPipelineRegistration<TransformOptions> asyncPassPipeline(
"iree-stream-async-transformation-pipeline",
"Lowers stream.tensor.* to stream.async.* IR.",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamAsyncPassPipeline(passManager, transformOptions);
});
PassPipelineRegistration<TransformOptions> cmdPassPipeline(
"iree-stream-cmd-transformation-pipeline",
"Lowers stream.async.* to stream.cmd.* IR.",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamCmdPassPipeline(passManager, transformOptions);
});
PassPipelineRegistration<TransformOptions> optimizationPassPipeline(
"iree-stream-optimization-pipeline",
"Optimizes stream commands and resources (may be required for some "
"targets).",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamOptimizationPassPipeline(passManager, transformOptions);
});
PassPipelineRegistration<TransformOptions> transformPassPipeline(
"iree-stream-transformation-pipeline",
"Runs the full IREE stream dialect transformation pipeline.",
[](OpPassManager &passManager, const TransformOptions &transformOptions) {
buildStreamTransformPassPipeline(passManager, transformOptions);
});
}
} // namespace mlir::iree_compiler::IREE::Stream