// blob: bc97a7f4f0b86ef129e34682eb59b588d5118981 [file] [log] [blame] [edit]
// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_DIALECT_STREAM_PASSES
#define IREE_DIALECT_STREAM_PASSES
include "mlir/Pass/PassBase.td"
//===----------------------------------------------------------------------===//
// Tensor lowering and resource management
//===----------------------------------------------------------------------===//
// Pass `iree-stream-conversion`, anchored on `mlir::ModuleOp`: converts flow
// and other supported input dialects into the stream dialect. The listed
// dependent dialects are declared up front for the pass.
def ConvertToStreamPass :
Pass<"iree-stream-conversion", "mlir::ModuleOp"> {
let summary = "Converts from flow and other input dialects into the stream dialect.";
let description = [{
Converts supported input dialects (`flow`, `tensor`, `util`, and various
upstream dialects like `cf`/`scf`) into the stream dialect and adds
additional metadata. After conversion all supported operations will act on
`!stream.resource<*>` types and track resource storage sizes symbolically.
Though the conversion requires that the program be in an implicitly
synchronized form (SSA use-def chains on immutable tensor-like objects)
limited support is available for a subset of the `hal` dialect ops that are
used on the program ABI boundary for interoperating with external buffers
and fences. These ops, such as `hal.tensor.import` and `hal.tensor.barrier`,
will be converted to their `stream` dialect form and preserve the implicit
synchronization guarantees required for proper analysis.
Dispatched executables are allowed to be in one of the supported input
dialects (like `flow.executable`), already be lowered into
`stream.executable` ops, or be the final `hal.executable` ops. The amount of
analysis and optimization that can be performed on `hal.executable` ops is
limited and no retargetability is available when directly providing them.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::func::FuncDialect",
"mlir::tensor::TensorDialect",
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
// Pass `iree-stream-encode-host-tensors`, declared with an empty anchor op
// name: lowers `stream.tensor.*` ops into encoding-erased `stream.async.*` ops
// and resolves symbolic encoding queries where possible.
def EncodeHostTensorsPass :
Pass<"iree-stream-encode-host-tensors", ""> {
let summary = "Encodes tensors into storage formats based on affinity and target support.";
let description = [{
Encodes `stream.tensor.*` ops on tensor-like objects into encoding-erased
asynchronous `stream.async.*` ops and resolves (if possible) symbolic
encoding ops such as `stream.tensor.sizeof` into their final values.
Dense tensors are trivially lowerable but other encodings may require
additional transfer and dispatch operations. For example, computing the
minimal fixed storage size of an unblocked sparse tensor may require the
pass to insert a dispatch that traverses the index tables to discover how
many elements are present while a blocked sparse tensor may be able to
resolve to a simpler calculation based solely on the number of fixed-size
blocks.
Sub-byte tensor types or those with non-trivial packing/encoding are also
resolved here such as by calculating that a `tensor<Nxi4>` requires `N*4/8`
bytes of storage. Some operations like slicing subranges of elements without
known alignment may also require additional transfer and dispatch operations
to preserve behavior while lowering into the type-erased forms.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::complex::ComplexDialect",
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-encode-device-tensors`, declared with an empty anchor op
// name: the within-executable counterpart of `iree-stream-encode-host-tensors`
// that handles `stream.binding.*` ops.
//
// TODO(benvanik): remove this pass and instead specify the encoding with
// attributes such that codegen can deterministically match the host behavior.
// This pass only exists today because we don't have a way to ensure all codegen
// backends do the "right" (or at least consistent) thing beyond convention.
def EncodeDeviceTensorsPass :
Pass<"iree-stream-encode-device-tensors", ""> {
let summary = "Encodes tensors into binary formats based on affinity and target support.";
let description = [{
Encodes `stream.binding.*` ops on tensor-like objects while handling packing
and encoding as with the `iree-stream-encode-host-tensors` pass but within
executables.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::complex::ComplexDialect",
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-materialize-builtins`, anchored on `mlir::ModuleOp`:
// replaces operations unsupported by lower layers with dispatches to builtin
// executables that are merged into the module.
def MaterializeBuiltinsPass :
Pass<"iree-stream-materialize-builtins", "mlir::ModuleOp"> {
let summary = "Materialize dispatches to builtin executables where required.";
let description = [{
Materializes dispatches to builtin executables when operations are not
supported by lower layers of the stack. For example, a `stream.async.fill`
op with an i64 pattern will be converted to a `stream.async.dispatch` of
`__builtin_fill_i64` and the `stream.executable` will be merged into the
module.
Though in many cases this kind of emulation happens more naturally
during the global optimization phase of the compiler and is more efficient
as there is opportunity for fusion into existing dispatches sometimes it's
not possible to statically know at the time such phases operate whether the
operations are required and this catches those cases.
Since it's often less efficient to materialize a builtin dispatch instead of
having fused it with others or to have been able to make use of a pure
transfer operation the materialization is seen as a pessimization that
should be avoided. Generally builtins are only added to ensure correct
execution and are not used to try to optimize the program.
}];
let dependentDialects = [
// We need to include all dialects that the builtin modules use.
"mlir::arith::ArithDialect",
"mlir::func::FuncDialect",
"mlir::linalg::LinalgDialect",
"mlir::memref::MemRefDialect",
"mlir::scf::SCFDialect",
"mlir::vector::VectorDialect",
"IREE::Flow::FlowDialect",
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
// Pass `iree-stream-materialize-copy-on-write`, declared with an empty anchor
// op name: turns the implicit immutability of `!stream.resource<*>` values
// into explicit clones or rematerialized source values.
def MaterializeCopyOnWritePass :
Pass<"iree-stream-materialize-copy-on-write", ""> {
let summary = "Materializes copy-on-write (🐄) behavior as explicit ops.";
let description = [{
Materializes copy-on-write behavior in the program by analyzing usage of
`!stream.resource<*>` types by stream ops. Prior to this pass resources are
implicitly immutable and follow SSA semantics while after the pass any cases
where such implicit behavior is assumed has been expanded into appropriate
clones of the resources or rematerialization of source values.
As an example attempting to update the same immutable tensor will result in
the original tensor being cloned such that each update sees a unique copy:
```mlir
%init = stream.async.splat %c0
%fill0 = stream.async.fill %c123, %init[...] -> %init
%fill1 = stream.async.fill %c456, %init[...] -> %init
->
%init = stream.async.splat %c0
%clone0 = stream.async.clone %init
%fill0 = stream.async.fill %c123, %clone0[...] -> %clone0
%clone1 = stream.async.clone %init
%fill1 = stream.async.fill %c456, %clone1[...] -> %clone1
```
A subsequently run `iree-stream-elide-async-copies` pass can often elide or
simplify some of the copies such as above where splatting and then cloning
the splat twice is not required. The passes are split to allow for simple
local analysis here and for the elision pass to catch input that may already
have contained unneeded copies.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-materialize-encodings`, anchored on `mlir::ModuleOp`:
// replaces `stream.tensor.encode` ops with dispatches to uniqued executables.
def MaterializeEncodingsPass :
Pass<"iree-stream-materialize-encodings", "mlir::ModuleOp"> {
let summary = "Materialize stream.tensor.encode ops to dispatches and executables.";
let description = [{
Materializes uniqued executables for `stream.tensor.encode` ops and replaces
them with dispatches to those executables.
}];
let dependentDialects = [
"mlir::func::FuncDialect",
"IREE::Encoding::IREEEncodingDialect",
// TODO(#20249): Drop the Flow dependency once the needed operations and
// types are moved to other dialects like TensorExt.
"IREE::Flow::FlowDialect",
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-clone-to-consumers`, anchored on `mlir::ModuleOp`: clones
// ops used on multiple affinities per-affinity when they opt in through the
// `StreamableOp` interface's `preferCloneToConsumers` query.
def CloneToConsumersPass :
Pass<"iree-stream-clone-to-consumers", "mlir::ModuleOp"> {
let summary = "Clones operations that opt-in to consumer affinities.";
let description = [{
Performs whole-program analysis to identify operations that are used on
multiple affinities that can be cloned per-affinity. The `StreamableOp`
interface's `preferCloneToConsumers` query is used and any ops implementing
the interface may opt-in to the cloning.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-elide-async-copies`, anchored on `mlir::ModuleOp`: removes
// copies that whole-program analysis shows are not needed for correctness or
// concurrency. Declares the stream dialect as a dependency, consistent with
// `ElideAsyncTransfersPass` and `ElideTimepointsPass` in this file.
def ElideAsyncCopiesPass :
Pass<"iree-stream-elide-async-copies", "mlir::ModuleOp"> {
let summary = "Elides copies when they are not performing meaningful work.";
let description = [{
Performs whole-program analysis to identify copies that are not required for
program correctness or enabling concurrency, such as clones of the last use
of a value. This eliminates copies both from input programs and those
materialized by the `iree-stream-materialize-copy-on-write` pass.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-elide-async-transfers`, anchored on `mlir::ModuleOp`:
// removes transfers that whole-program analysis shows are not required for
// correctness (for example transfers to/from the same device).
def ElideAsyncTransfersPass :
Pass<"iree-stream-elide-async-transfers", "mlir::ModuleOp"> {
let summary = "Elides transfers when they are not performing meaningful work.";
let description = [{
Performs whole-program analysis to identify transfers that are not required
for program correctness (transfers to/from the same device, etc).
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-emplace-allocations`, declared with an empty anchor op
// name: places operation results directly into existing resources when that
// is determined to be safe. Intended to run after copy-on-write
// materialization.
def EmplaceAllocationsPass :
Pass<"iree-stream-emplace-allocations", ""> {
let summary = "Emplaces transient tensor allocations to remove copies.";
let description = [{
Identifies opportunities for placing operation results directly into
existing resources when analysis determines it is safe to do so. This is
intended to run after copy-on-write materialization when such analysis can
be performed local to the operations. The common case this helps with is
insertions of produced results into larger resources such as performed by
tensor concatenation.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-refine-usage`, anchored on `mlir::ModuleOp`: assigns
// lifetime and usage attributes to `!stream.resource<*>` types that have not
// yet been fixed. Must be re-run if new unspecified-lifetime resources are
// introduced afterwards.
def RefineUsagePass :
Pass<"iree-stream-refine-usage", "mlir::ModuleOp"> {
let summary = "Refines resource usage bits and inserts transfers where appropriate.";
let description = [{
Performs whole-program analysis to assign lifetime and usage attributes to
`!stream.resource<*>` types that have not yet been fixed. Resources are
tracked across global loads/stores, function calls, control flow, and
operations acting on them to determine how they are used (transfers, host
staging, constants, etc). Upon completion all resources have a fixed
lifetime and any new resources introduced into the program with an
unspecified lifetime (`!stream.resource<*>`) will require the pass to be
run again prior to continued lowering.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
//===----------------------------------------------------------------------===//
// Stream formation and scheduling
//===----------------------------------------------------------------------===//
// Interface pass `iree-stream-schedule-execution`, declared over
// `mlir::CallableOpInterface`: partitions `stream.async.*` ops into
// `stream.async.execute` regions ordered by `!stream.timepoint` values.
def ScheduleExecutionPass :
InterfacePass<"iree-stream-schedule-execution", "mlir::CallableOpInterface"> {
let summary = "Identifies and groups asynchronous operations into executable regions within function-like regions.";
let description = [{
Partitions `stream.async.*` operations into execution regions that are
executed atomically on a single device. The partitioning algorithm uses the
operations being performed and the affinity assigned to them (if any) to
determine which are allowed to execute together and is allowed to produce
any number of partitions to cover the workload. Original execution ordering
is preserved by the resulting `stream.async.execute` operations using
`!stream.timepoint` to maintain explicit SSA use-def-based wait-on and
signal-to behavior. Scheduling may insert host waits on device work that can
be later avoided by timepoint propagation and elision.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Interface pass `iree-stream-schedule-concurrency`, declared over
// `mlir::CallableOpInterface`: groups ops inside `stream.async.execute`
// regions that may run concurrently under `stream.async.concurrent` ops.
def ScheduleConcurrencyPass :
InterfacePass<"iree-stream-schedule-concurrency", "mlir::CallableOpInterface"> {
let summary = "Identifies and groups asynchronous operations within executable regions that can run concurrently and groups them into streams.";
let description = [{
Partitions operations that can execute concurrently within
`stream.async.execute` regions into a tree with `stream.async.concurrent`
ops indicating two or more operations that are allowed to execute
concurrently even if resources may alias.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-sync-initializers`, anchored on `mlir::ModuleOp`: converts
// the global timepoint stores within each initializer into a single
// synchronous host wait. Does not look into called functions (see the NOTE in
// the description).
def SyncInitializersPass :
Pass<"iree-stream-sync-initializers", "mlir::ModuleOp"> {
let summary = "Makes all initializer-produced timepoints synchronously wait before proceeding.";
let description = [{
Gathers all global timepoint stores within each initializer and converts
them to a single synchronous host wait.
NOTE: this does not currently find timepoints in called functions. To handle
that we would need to analyze the call graph to find functions called only
from initializers and duplicate any function that is called from both
initializers and non-initializer roots. At the point in the pipeline where
this pass runs most internal function calls return timepoints and the
initializer is the place where they are stored into globals so it happens to
work out.
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
// Pass `iree-stream-propagate-timepoints`, anchored on `mlir::ModuleOp`:
// propagates `!stream.timepoint` values across global stores and loads,
// function calls, and control flow so waits can be sunk to consumers.
def PropagateTimepointsPass :
Pass<"iree-stream-propagate-timepoints", "mlir::ModuleOp"> {
let summary = "Materializes timepoints and sinks them to consumers throughout the whole program.";
let description = [{
Propagates `!stream.timepoint` values across the whole program in order to
avoid host-device and device-device waits where possible without changing
correct execution ordering. For example a host wait on a timepoint via a
`stream.timepoint.await` op guarding a resource passed to a function call
will be changed to pass the timepoint to the callee and have the wait occur
in there thus allowing it to be chained with subsequent device operations
that may consume the resource. Such propagation happens across global stores
and loads, function calls, and control flow.
}];
let dependentDialects = [
"mlir::cf::ControlFlowDialect",
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
// Pass `iree-stream-elide-timepoints`, anchored on `mlir::ModuleOp`: drops
// waits on timepoints that are already covered by a dependent timepoint, and
// keeps them whenever coverage cannot be guaranteed.
def ElideTimepointsPass :
Pass<"iree-stream-elide-timepoints", "mlir::ModuleOp"> {
let summary = "Elides timepoints that are known to be covered by dependent timepoints.";
let description = [{
Elides waits on timepoints that are known to be reached by a dependent
timepoint. Errs on the side of preserving timepoints if analysis can't
guarantee that a particular wait is covered.
Example:
```mlir
%timepoint0 = ...
%timepoint1 = ... await(%timepoint0)
%timepoint2 = stream.timepoint.join max(%timepoint0, %timepoint1)
->
%timepoint0 = ...
%timepoint1 = ... await(%timepoint0)
%timepoint2 = stream.timepoint.join max(%timepoint1)
-> (canonicalization) ->
%timepoint0 = ...
%timepoint1 = ... await(%timepoint0)
%timepoint2 = %timepoint1
```
}];
let dependentDialects = [
"IREE::Stream::StreamDialect",
];
}
//===----------------------------------------------------------------------===//
// Allocation and command issuing
//===----------------------------------------------------------------------===//
// Pass `iree-stream-schedule-allocation`, anchored on `mlir::ModuleOp`:
// converts the implicit resource management of `stream.async.*` ops into the
// explicit `stream.cmd.*` form. Per the description, the program cannot be
// raised again after this conversion.
def ScheduleAllocationPass :
Pass<"iree-stream-schedule-allocation", "mlir::ModuleOp"> {
let summary = "Allocates resources and converts to explicit stream commands.";
let description = [{
Schedules allocation of resources and converts the program from the implicit
resource management scheme of the `stream.async.*` ops into the explicit
resource management scheme of the `stream.cmd.*` ops. After conversion the
program cannot be raised as aliasing is introduced and local liveness ranges
are erased.
Allocations are performed by asynchronous operations like
`stream.resource.alloca` (and the matching `stream.resource.dealloca`) and
sequenced in the device timeline by `!stream.timepoint` values.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"IREE::Stream::StreamDialect",
];
}
// Interface pass `iree-stream-pack-constants`, declared over
// `mlir::CallableOpInterface`: packs slices of `stream.resource.constants`
// ops and materializes the operations that initialize them.
def PackConstantsPass :
InterfacePass<"iree-stream-pack-constants", "mlir::CallableOpInterface"> {
let summary = "Packs and allocates backing storage for fused constant resources.";
let description = [{
Packs slices of `stream.resource.constants` ops and materializes operations
to initialize them based on their contents. Embedded constants are turned
into inline host buffers with operations that try to map them into device
memory or perform device-accelerated file I/O asynchronously with other
initialization code. Parameters are expanded based on the device memory
model to be loads (which may allow mapping memory on devices with unified
memory) or gathers (that require allocation and staging on devices with
discrete memory).
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::scf::SCFDialect",
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
// Interface pass `iree-stream-layout-slices`, declared over
// `mlir::CallableOpInterface`: lays out the packed slices of
// `stream.resource.pack` ops and produces the offset arithmetic for them.
def LayoutSlicesPass :
InterfacePass<"iree-stream-layout-slices", "mlir::CallableOpInterface"> {
let summary = "Lays out packed slices and produces arithmetic required for all offsets.";
let description = [{
Performs target-aware layout of packed slices in `stream.resource.pack` ops.
Alignment, padding, and static/dynamic offset calculation of the slices
within larger allocated resources happens with awareness of both the
resource slices being packed and where they will be consumed.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"IREE::Stream::StreamDialect",
"IREE::Util::UtilDialect",
];
}
//===----------------------------------------------------------------------===//
// Memoization
//===----------------------------------------------------------------------===//
// TODO(benvanik): outline streams (ala dispatch regions).
// TODO(benvanik): deduplicate outlined streams.
//===----------------------------------------------------------------------===//
// Dispatch optimization
//===----------------------------------------------------------------------===//
// Pass `iree-stream-fold-uniform-operands`, anchored on `mlir::ModuleOp`:
// folds dispatch operands that are duplicated across all dispatch sites and
// inlines operands that are the same constant at every site.
def FoldUniformOperandsPass :
Pass<"iree-stream-fold-uniform-operands", "mlir::ModuleOp"> {
let summary = "Folds redundant and uniformly constant dispatch operands.";
let description = [{
Performs whole-program analysis to find all dispatch sites to each dispatch
and fold or inline operands that are uniformly passed. For example if
multiple dispatch sites pass the same SSA value for two operands (even if
dynamically computed) they will be folded into a single value, and if
multiple dispatch sites pass the same constant value for the same operand
the constant value will be inlined and the operand removed.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
];
}
// Pass `iree-stream-fuse-dispatch-bindings`, anchored on `mlir::ModuleOp`:
// erases dispatch binding subranges and fuses bindings that come from the
// same resources across all dispatch sites.
def FuseDispatchBindingsPass :
Pass<"iree-stream-fuse-dispatch-bindings", "mlir::ModuleOp"> {
let summary = "Fuses bindings to the same underlying storage to reduce binding count.";
let description = [{
Erases dispatch binding subranges and attempts to fuse bindings that
originate from the same resources across all dispatch sites.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"IREE::Stream::StreamDialect",
];
}
// Pass `iree-stream-specialize-dispatches`, anchored on `mlir::ModuleOp`:
// replaces groups of per-site operand values with a single dispatch site
// identifier that indexes a lookup table placed inside the dispatch.
def SpecializeDispatchesPass :
Pass<"iree-stream-specialize-dispatches", "mlir::ModuleOp"> {
let summary = "Specializes executables by inlining/fusing operands based on dispatch sites.";
let description = [{
Reduces the number of operands passed to dispatches by identifying common
patterns at dispatch sites across the program that can be compressed into
unique dispatch site identifiers. For example, if a dispatch takes several
operands that are [0, 1, ...] at one dispatch site and [10, 11, ...] at
another the dispatch will be changed to take a single value indicating which
set of operands to use and the operands themselves will be placed into a
lookup table within the dispatch.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::tensor::TensorDialect",
];
}
// Pass `iree-stream-specialize-encodings`, anchored on `mlir::ModuleOp`:
// attaches layouts to encodings and duplicates executables according to the
// encoding layout analysis. The preconditions are listed under
// "Requirements" in the description.
def SpecializeEncodingsPass :
Pass<"iree-stream-specialize-encodings", "mlir::ModuleOp"> {
let summary = "Specializes serializable encodings based on layout analysis.";
let description = [{
Attaches layouts to encodings and duplicates executables based on the
encoding layout analysis.
Some executables can be launched by different devices. It can produce
wrong codegen artifacts when binding types are encoded (i.e., the
tensor type has an encoding attribute). Because they can result in
different layouts, especially when multi-device is involved. E.g., say
that device_a and device_b interpret a tensor type with encodings in
different layouts, and there is an executable that can be launched with
resources from either device_a or device_b. It is confusing what the
input layouts for the executable are because there are two possibilities. In
this case, we have to duplicate the executable with updated encoding,
and modify the dispatch to launch proper executable based on device
analysis.
The pass resolves the layouts based on Stream affinity analysis. It updates
the encodings of all the Stream tensor ops with resolved layouts, duplicates
executables based on the set of incoming layouts and result layouts, and
updates bindings with resolved layouts.
Requirements:
- At least one of the dialects implements the AffinityAnalysisDialectInterface
dialect interface, because Stream does not need to know any dialect other
than itself.
- The binding types have to implement IREE::Encoding::EncodingTypeInterface,
so it can update the types without accessing any other dialects.
- All the encodings attached on the types have to implement
SerializableEncodingAttrInterface. Because the pass updates the encodings
using interfaces.
}];
let dependentDialects = [
"IREE::Encoding::IREEEncodingDialect"
];
}
// Pass `iree-stream-annotate-dispatch-arguments`, anchored on
// `mlir::ModuleOp`: annotates dispatch executables with `stream.values` and
// `stream.alignment` attributes derived from the dispatch sites. No
// `dependentDialects` are declared for this pass.
def AnnotateDispatchArgumentsPass :
Pass<"iree-stream-annotate-dispatch-arguments", "mlir::ModuleOp"> {
let summary = "Annotates dispatch arguments with potential values derived from dispatch sites.";
let description = [{
Uses data flow analysis to identify potential value sets and alignments
(or divisibility) for dispatch operands and bindings. Upon successful
analysis the dispatch executables are annotated such that further lowering
in codegen has the analysis results locally without needing to inspect the
entire program.
Operands are annotated with `stream.values` and/or `stream.alignment`
attributes indicating all known constant values at all dispatch sites and/or
their divisibility. `stream.values` is only added when only statically-known
values are passed and `stream.alignment` is added in cases where some
minimum divisibility is identified even if the values are dynamic (such as
all values passed in going through `util.align` or `arith.muli` prior).
Bindings are annotated with `stream.alignment` attributes indicating their
base alignment prior to the offset specified on the binding op itself. Note
that just because the base alignment is some value does not mean the offset
is always known to be aligned in the same way.
}];
}
// Pass `iree-stream-annotate-dispatch-assumptions`, anchored on
// `mlir::ModuleOp`: records per-dispatch-site integer range and divisibility
// facts as `util.assume.int` ops inside the executable. No
// `dependentDialects` are declared for this pass.
def AnnotateDispatchAssumptionsPass :
Pass<"iree-stream-annotate-dispatch-assumptions", "mlir::ModuleOp"> {
let summary = "Adds util.assume.* op to executables from all dispatch sites.";
let description = [{
Uses dataflow analysis to determine integer range and divisibility,
propagating that as `util.assume.int` ops within the executable with an
assumption row for each dispatch site. This effectively transports the
per-dispatch level analyses to the executable so that the backend can
act on it as it sees fit.
Note that this pass largely replaces the `AnnotateDispatchArgumentsPass`
above and can eventually subsume it entirely. However, as the mechanism is
new and needs to be phased in, both exist in parallel for the moment.
}];
}
// Pass `iree-stream-pack-dispatch-operands`, anchored on `mlir::ModuleOp`:
// packs dispatch operands of other types into the `i32` values required on
// the dispatch ABI.
def PackDispatchOperandsPass :
Pass<"iree-stream-pack-dispatch-operands", "mlir::ModuleOp"> {
let summary = "Packs stream dispatch operands into i32 push constants.";
let description = [{
Packs dispatch operands (such as `i2`, `i64`, `complex<f32>`, etc) into the
required `i32` values on the dispatch ABI. May optimize multiple wider
bit-width operands with known ranges or alignments into or across fewer
operands to reduce the total operand count.
}];
let dependentDialects = [
"mlir::arith::ArithDialect",
"mlir::complex::ComplexDialect",
];
}
//===----------------------------------------------------------------------===//
// Diagnostics
//===----------------------------------------------------------------------===//
// Diagnostic pass `iree-stream-annotate-affinities`, anchored on
// `mlir::ModuleOp`: annotates affinities on all ops for debugging. Only a
// summary is provided; there is no long description or dependent dialects.
def AnnotateAffinitiesPass :
Pass<"iree-stream-annotate-affinities", "mlir::ModuleOp"> {
let summary = "Annotates affinities on all ops for debugging.";
}
// Diagnostic pass `iree-stream-dump-statistics`, anchored on `mlir::ModuleOp`:
// dumps stream dialect usage information. Exposes two options: the output
// format and the output file path.
def DumpStatisticsPass :
Pass<"iree-stream-dump-statistics", "mlir::ModuleOp"> {
let summary = "Dumps stream dialect usage information to a file.";
let options = [
// `output-format`: one of `pretty` (the default), `verbose`, or `csv`.
Option<
"outputFormat", "output-format",
"IREE::Stream::DumpOutputFormat",
"IREE::Stream::DumpOutputFormat::Pretty",
"Specifies the output format to produce.",
[{::llvm::cl::values(
clEnumValN(IREE::Stream::DumpOutputFormat::Pretty, "pretty", "Human-readable pretty printed output."),
clEnumValN(IREE::Stream::DumpOutputFormat::Verbose, "verbose", "Pretty printed output with additional IR."),
clEnumValN(IREE::Stream::DumpOutputFormat::CSV, "csv", "Comma separated values.")
)}]
>,
// `output-file`: defaults to an empty string, which selects stderr; `-`
// selects stdout.
Option<
"outputFile", "output-file",
"std::string",
/*default=*/"std::string()",
"File path to write to; or `` for stderr or `-` for stdout."
>,
];
}
// Verification pass `iree-stream-verify-input`, anchored on `mlir::ModuleOp`:
// checks that the input dialects are supported by the stream dialect.
def VerifyInputPass :
Pass<"iree-stream-verify-input", "mlir::ModuleOp"> {
let summary = "Verifies that input dialects are supported by the stream dialect.";
}
// Verification pass `iree-stream-verify-affinities`, anchored on
// `mlir::ModuleOp`: checks that every operation has an affinity assigned,
// directly or indirectly.
def VerifyAffinitiesPass :
Pass<"iree-stream-verify-affinities", "mlir::ModuleOp"> {
let summary = "Verifies that all operations have affinities assigned (directly or indirectly).";
}
// Verification pass `iree-stream-verify-lowering-to-tensors`, anchored on
// `mlir::ModuleOp`: checks that input dialects have been converted to
// `stream.tensor.*` ops.
def VerifyLoweringToTensorsPass :
Pass<"iree-stream-verify-lowering-to-tensors", "mlir::ModuleOp"> {
let summary = "Verifies that input dialects are converted to stream.tensor.* ops.";
}
// Verification pass `iree-stream-verify-lowering-to-async-resources`, anchored
// on `mlir::ModuleOp`: checks that all `stream.tensor.*` ops and types have
// been lowered to `stream.async.*` resource ops.
def VerifyLoweringToAsyncResourcesPass :
Pass<"iree-stream-verify-lowering-to-async-resources", "mlir::ModuleOp"> {
let summary = "Verifies that all stream.tensor.* ops and types are fully lowered to stream.async.* resource ops.";
}
// Verification pass `iree-stream-verify-lowering-to-async`, anchored on
// `mlir::ModuleOp`: checks that all `stream.tensor.*` ops and types have been
// lowered to `stream.async.*` ops and that every resource has an assigned
// lifetime.
def VerifyLoweringToAsyncPass :
Pass<"iree-stream-verify-lowering-to-async", "mlir::ModuleOp"> {
let summary = "Verifies that all stream.tensor.* ops and types are fully lowered to stream.async.* ops and all resources have an assigned lifetime.";
}
// Verification pass `iree-stream-verify-async-access-ranges`, anchored on
// `mlir::ModuleOp`: checks, where possible, that `stream.async.*` access
// ranges are in bounds.
def VerifyAsyncAccessRangesPass :
Pass<"iree-stream-verify-async-access-ranges", "mlir::ModuleOp"> {
let summary = "Verifies that stream.async.* access ranges are in bounds where possible.";
}
// Verification pass `iree-stream-verify-lowering-to-cmd`, anchored on
// `mlir::ModuleOp`: checks that all `stream.async.*` ops and types have been
// lowered to `stream.cmd.*` ops.
def VerifyLoweringToCmdPass :
Pass<"iree-stream-verify-lowering-to-cmd", "mlir::ModuleOp"> {
let summary = "Verifies that all stream.async.* ops and types are fully lowered to stream.cmd.* ops.";
}
#endif // IREE_DIALECT_STREAM_PASSES