// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Dialect/Stream/IR/StreamDialect.h"
#include "iree/compiler/Dialect/Stream/IR/StreamOps.h"
#include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
#include "iree/compiler/Dialect/Stream/Transforms/PassDetail.h"
#include "iree/compiler/Dialect/Stream/Transforms/Passes.h"
#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
#include "iree/compiler/Utils/IndexSet.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#define DEBUG_TYPE "iree-stream-pack-constants"
namespace mlir {
namespace iree_compiler {
namespace IREE {
namespace Stream {
namespace {
//===----------------------------------------------------------------------===//
// Pool packing and storage assignment
//===----------------------------------------------------------------------===//
struct ConstantSlice {
// Result SSA value from the pooling op.
Value result;
// Size, in bytes, of the encoded constant value as an SSA value.
Value resultSize;
// Constant value being encoded.
Attribute value;
};
struct PackedSpan {
// Original slice this span represents.
ConstantSlice slice;
// Byte offset within the storage buffer.
uint64_t offset = 0;
// Length, in bytes, of the valid data in this span.
// This accounts only for the valid data itself and not any additional
// padding for other spans within the buffer (like start offset alignment).
uint64_t length = 0;
};
struct StorageResource {
// Fused location of all spans that make up this storage buffer.
Location loc;
// Total size in bytes (including padding).
uint64_t totalSize = 0;
// Constant spans packed into this resource.
SmallVector<PackedSpan, 8> spans;
// Packed byte data that must be embedded in the final module.
// It must be written with an alignment as required by the constraints.
IREE::Util::CompositeAttr data;
};
// Buckets |slices| into 1+ storage resources based on |resourceConfig|.
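// As a worked example of the arithmetic below (illustrative numbers, not a
// real configuration): with minBufferOffsetAlignment = 64 and
// minBufferRangeAlignment = 16, three 100-byte slices are placed at offsets
// 0, 128, and 256 (starts aligned up to 64) with padded lengths of
// align(100, 16) = 112 bytes, yielding a totalSize of 256 + 112 = 368 bytes.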
static SmallVector<StorageResource, 8> bucketValuesIntoStorageResources(
ArrayRef<ConstantSlice> slices,
IREE::Stream::ResourceConfigAttr resourceConfig) {
// TODO(benvanik): replace with a better strategy (best-fit, etc).
SmallVector<StorageResource, 8> storageBuffers;
storageBuffers.push_back({UnknownLoc::get(resourceConfig.getContext())});
StorageResource *currentBuffer = &storageBuffers.back();
for (auto slice : slices) {
uint64_t offset = IREE::Util::align(
currentBuffer->totalSize, resourceConfig.getMinBufferOffsetAlignment());
uint64_t unpaddedLength =
slice.value.cast<DenseElementsAttr>().getRawData().size();
uint64_t paddedLength = IREE::Util::align(
unpaddedLength, resourceConfig.getMinBufferRangeAlignment());
if (offset + unpaddedLength > resourceConfig.getMaxAllocationSize()) {
// Spilling buffer; make a new one.
storageBuffers.push_back({UnknownLoc::get(resourceConfig.getContext())});
currentBuffer = &storageBuffers.back();
offset = 0;
}
currentBuffer->spans.push_back({slice, offset, unpaddedLength});
currentBuffer->totalSize =
std::max(currentBuffer->totalSize, offset + paddedLength);
}
if (storageBuffers.back().spans.empty()) {
storageBuffers.pop_back();
}
return storageBuffers;
}
// Packs all span data into a single data attribute we can tag on the buffer.
// The data produced will contain all spans at their specified offsets, with
// zero bytes filling any gaps between them.
//
// NOTE: spans may overlap, so do not assume that the data between spans is
// contiguous or that offsets always increase! Always seek!
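// For example (an illustrative layout, not tied to any particular target):
// spans at [0, 100) and [128, 228) in a 256-byte storage buffer serialize as
//   [span 0 data: 100B][zeros: 28B][span 1 data: 100B][zero tail: 28B]
// where the zero runs are the padding values inserted below.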
static void packStorageResourceData(StorageResource &storageBuffer,
MLIRContext *context) {
// Fuse the locations of all constants rolled into the buffer. This neat bit
// of info would be useful if we wanted to attribute final module size back
// through the data blobs: with the buffer <-> constant mapping it's possible
// to build a treemap of contributions in the source. TBD ;)
storageBuffer.loc =
FusedLoc::get(context, llvm::to_vector<8>(llvm::map_range(
storageBuffer.spans, [](PackedSpan &span) {
return span.slice.result.getLoc();
})));
// Construct a composite attribute that contains references to the original
// uniqued storage values. This avoids needing to reallocate/unique/copy
// the entire constant storage. Since padding may have been inserted between
// values to meet the target buffer constraints we insert zero bytes between
// the elements to keep the composite dense.
auto i8Type = IntegerType::get(context, 8);
auto zeroAttr = IntegerAttr::get(i8Type, 0);
SmallVector<Attribute> values;
int64_t offset = 0;
for (auto &constantSpan : storageBuffer.spans) {
if (constantSpan.length == 0) continue;
int64_t start = constantSpan.offset;
int64_t end = start + constantSpan.length;
// TODO(benvanik): when we start overlapping data we'll want to have a way
// to build subranges here by slicing out parts of the attributes we have
// coming in. This could be done with a #util.slice<> attr or something
// that when serialized performs the offsetting.
assert(start >= offset && "expect ordered spans");
int64_t spanPadding = start - offset;
if (spanPadding > 0) {
// 0-pad until the start of this span.
values.push_back(DenseElementsAttr::get(
VectorType::get({spanPadding}, i8Type), zeroAttr));
offset += spanPadding;
}
values.push_back(constantSpan.slice.value);
offset = end;
}
// Add tail padding. Unfortunate but some implementations will require the
// bytes and it's possible (and in some cases intentional) that the storage
// buffers fall at the end of mapped memory and need the valid bytes for
// page-granularity access.
int64_t tailPadding = storageBuffer.totalSize - offset;
if (tailPadding > 0) {
// 0-pad out to the total storage buffer size.
values.push_back(DenseElementsAttr::get(
VectorType::get({tailPadding}, i8Type), zeroAttr));
offset += tailPadding;
}
storageBuffer.data = IREE::Util::CompositeAttr::get(context, values);
}
// Returns zero or more storage resources along with the packed spans that map
// each value into them.
// Assumes that |slices| have been ordered by prior passes and that the order
// may be performance-sensitive (constants are grouped by
// locality/lifetime/etc).
static SmallVector<StorageResource, 8> computePackingMap(
ArrayRef<ConstantSlice> slices,
IREE::Stream::ResourceConfigAttr resourceConfig, MLIRContext *context) {
// This is deliberately simplistic for now. The ideal here is that we have a
// basic static (and ideally profile-guided) sorting pass that keeps constant
// values that are accessed together sorted near each other.
//
// We want good spatial locality as being in the same storage buffer means
// that constants are likelier to be pulled into memory together (by disk
// prefetcher pulling in mapped pages, TLB cache being hot, etc). We want
// good temporal locality because then we have a higher chance of the
// constants being placed into the same runtime buffer and that reduces the
// amount of buffer swapping/bindings we need to manage when recording
// commands.
//
// <story time> Funnily enough, the same reasons we care here (and this same
// algorithm) are the same that console developers encountered in the early
// days of CD-ROMs; inserting padding to force alignment on block
// boundaries, ensuring that temporally related content was together even if
// it meant repeating things, etc - like always physically duplicate the
// music near each level that uses it and potentially even interleave those
// together in block order on the disc, as being able to stream the music
// and still seek to blocks in level content was worth the % of lost space.
// You could tell how well-optimized a game was by the noise level of
// the read head! (incidentally, the same was true of tapes and floppies,
// however the space limitations were almost always the top concern there -
// it wasn't until CD-ROM and beyond that there was enough space to shuffle
// things around and waste on silly things like loading times).
//
// Here it's all descriptor sets and mapped pages but same thing pretty
// much, and passes earlier on may duplicate constants in the pool if it
// means they can improve locality at runtime. This pass doesn't dedupe and
// just sticks to packing for that reason.
// Build a list of resources and spans (append to current or spill to new).
auto storageBuffers =
bucketValuesIntoStorageResources(slices, resourceConfig);
// Pack each storage resource bucket into a single data blob.
for (auto &storageBuffer : storageBuffers) {
packStorageResourceData(storageBuffer, context);
}
return storageBuffers;
}
//===----------------------------------------------------------------------===//
// Upload materialization
//===----------------------------------------------------------------------===//
struct AllocatedStorage {
// Resource storing all packed constants.
Value resource;
// Total size, in bytes, of the storage resource.
Value resourceSize;
};
struct UploadResult {
// Timepoint when the storage is initialized with the constant values.
Value timepoint;
// Each (resource, resourceSize) allocated.
SmallVector<AllocatedStorage> allocations;
};
// Maps constants as staging buffers and then issues copy commands.
// Per storage resource we map the source rodata, allocate the result, and then
// issue an async copy from source to result. To avoid a bunch of overhead when
// there are multiple storage buffers we invert the logic so that all of the
// async copies land in a single execution region.
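// The emitted IR looks roughly like the following per storage buffer (a
// sketch only: assembly syntax and types are abbreviated and in practice a
// single execute region captures all of the buffers at once):
//   %staging = stream.resource.map %rodata[%c0] :
//       !util.byte_buffer -> !stream.resource<staging>{%size}
//   %result = stream.resource.alloc uninitialized : !stream.resource<*>{%size}
//   %t = stream.cmd.execute with(%staging, %result) {
//     stream.cmd.copy %staging[%c0], %result[%c0], %size
//   } => !stream.timepoint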
static UploadResult buildStagingUpload(
IREE::Stream::ResourceConstantsOp constantsOp,
IREE::Stream::ResourceType resourceType,
ArrayRef<StorageResource> storageResources, ArrayRef<Value> storageBuffers,
IndexSet &indexSet, OpBuilder &builder) {
UploadResult uploadResult;
auto stagingType = builder.getType<IREE::Stream::ResourceType>(
IREE::Stream::Lifetime::Staging);
// Map all of the storage data and allocate the result buffers.
// This will produce a list of copies we should perform from staging->final.
struct Copy {
Location loc;
Value source;
Value sourceSize;
Value sourceOffset;
Value target;
Value targetSize;
Value targetOffset;
Value length;
};
SmallVector<Copy> copies;
SmallVector<Value> capturedResources;
SmallVector<Value> capturedResourceSizes;
for (auto it : llvm::zip(storageResources, storageBuffers)) {
auto &storageResource = std::get<0>(it);
auto storageBuffer = std::get<1>(it);
// Today we assume 1:1 lengths of storage data and uploaded data, but this
// need not be the case if we want to pad the buffer for runtime.
auto totalLength = indexSet.get(storageResource.totalSize);
// Map the source staging resource rodata.
auto mapOp = builder.create<IREE::Stream::ResourceMapOp>(
storageResource.loc, stagingType, storageBuffer, indexSet.get(0),
totalLength, constantsOp.affinityAttr());
// Allocate the resulting storage resource of the final resource type.
auto allocOp = builder.create<IREE::Stream::ResourceAllocOp>(
storageResource.loc, resourceType, mapOp.result_size(),
/*uninitialized=*/builder.getUnitAttr(), constantsOp.affinityAttr());
uploadResult.allocations.push_back({
allocOp.results().front(),
allocOp.storage_sizes().front(),
});
// Queue copy for processing below.
Copy copy{
storageResource.loc, mapOp.result(),
mapOp.result_size(), indexSet.get(0),
allocOp.results().front(), allocOp.storage_sizes().front(),
indexSet.get(0), totalLength,
};
capturedResources.push_back(copy.source);
capturedResourceSizes.push_back(copy.sourceSize);
capturedResources.push_back(copy.target);
capturedResourceSizes.push_back(copy.targetSize);
copies.push_back(std::move(copy));
}
// Create the execution op capturing the resources.
auto executeOp = builder.create<IREE::Stream::CmdExecuteOp>(
constantsOp.getLoc(), /*awaitTimepoint=*/Value{}, capturedResources,
capturedResourceSizes);
if (constantsOp.affinity().hasValue()) {
executeOp.affinityAttr(constantsOp.affinityAttr());
}
uploadResult.timepoint = executeOp.result_timepoint();
// Map captured resources into the execution region.
BlockAndValueMapping mapping;
auto *entryBlock = new Block();
executeOp.body().push_back(entryBlock);
for (auto outerValue : capturedResources) {
auto arg = entryBlock->addArgument(outerValue.getType());
mapping.map(outerValue, arg);
}
// Issue the copies. Note that we use the captured block arguments rather
// than the outer resource values.
auto executionBuilder = OpBuilder::atBlockBegin(entryBlock);
for (auto &copy : copies) {
executionBuilder.create<IREE::Stream::CmdCopyOp>(
copy.loc, mapping.lookup(copy.source), copy.sourceSize,
copy.sourceOffset, mapping.lookup(copy.target), copy.targetSize,
copy.targetOffset, copy.length);
}
executionBuilder.create<IREE::Stream::YieldOp>(executeOp.getLoc());
return uploadResult;
}
// Emits IR to first try mapping the storage resources directly into usable
// constant resources. If the mapping fails (the target can't use the memory)
// then fall back to staging uploads.
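// Rough shape of the emitted IR (a sketch only; assembly syntax is
// abbreviated):
//   %ok, %mapped = stream.resource.try_map %rodata[%c0] :
//       !util.byte_buffer -> i1, !stream.resource<constant>{%size}
//   %res, %t = scf.if %ok {
//     scf.yield %mapped, (stream.timepoint.immediate)
//   } else {
//     // staging upload as in buildStagingUpload above
//   }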
static UploadResult buildTryMapConstantResources(
IREE::Stream::ResourceConstantsOp constantsOp,
IREE::Stream::ResourceType resourceType,
ArrayRef<StorageResource> storageResources, ArrayRef<Value> storageBuffers,
IndexSet &indexSet, OpBuilder &builder) {
// Try mapping each resource. We do this as an all-or-nothing across the
// storage: if any fails we fall back to the allocation path. This is mostly
// just to get more predictable behavior in the face of weird platform
// requirements: we want something like misaligned mappings to be easily
// visible in tracing.
SmallVector<Value> mappedResources;
SmallVector<Type> resultTypes;
Value ok;
auto zero = indexSet.get(0);
for (auto it : llvm::zip(storageResources, storageBuffers)) {
auto &storageResource = std::get<0>(it);
auto storageBuffer = std::get<1>(it);
auto tryMapOp = builder.create<IREE::Stream::ResourceTryMapOp>(
storageResource.loc, builder.getI1Type(), resourceType, storageBuffer,
zero, indexSet.get(storageResource.totalSize),
constantsOp.affinityAttr());
if (!ok) {
ok = tryMapOp.did_map();
} else {
ok = builder.createOrFold<arith::AndIOp>(tryMapOp.getLoc(), ok,
tryMapOp.did_map());
}
mappedResources.push_back(tryMapOp.result());
resultTypes.push_back(tryMapOp.result().getType());
}
// If we are able to directly map the resources then we don't need to wait.
auto timepointType = builder.getType<IREE::Stream::TimepointType>();
resultTypes.push_back(timepointType);
// if ok: return mapped resources
// else: allocate and upload
auto ifOp = builder.create<scf::IfOp>(
constantsOp.getLoc(), resultTypes, ok,
[&](OpBuilder &thenBuilder, Location loc) {
// Just return the resources + an immediate timepoint.
SmallVector<Value> ifResults = mappedResources;
ifResults.push_back(
thenBuilder.create<IREE::Stream::TimepointImmediateOp>(loc));
thenBuilder.create<scf::YieldOp>(loc, ifResults);
},
[&](OpBuilder &elseBuilder, Location loc) {
// Fall back to staging the constants and uploading them.
auto stagingResult =
buildStagingUpload(constantsOp, resourceType, storageResources,
storageBuffers, indexSet, builder);
SmallVector<Value> ifResults;
for (auto &allocation : stagingResult.allocations) {
ifResults.push_back(allocation.resource);
}
ifResults.push_back(stagingResult.timepoint);
elseBuilder.create<scf::YieldOp>(loc, ifResults);
});
auto ifTimepoint = ifOp.results().back();
auto ifResources = ifOp.results().slice(0, ifOp.results().size() - 1);
// Use the result of either the direct mapping or the staging upload.
UploadResult uploadResult;
uploadResult.timepoint = ifTimepoint;
for (auto it : llvm::zip(storageResources, ifResources)) {
auto &storageResource = std::get<0>(it);
auto ifResource = std::get<1>(it);
uploadResult.allocations.push_back({
ifResource,
indexSet.get(storageResource.totalSize),
});
}
return uploadResult;
}
//===----------------------------------------------------------------------===//
// -iree-stream-pack-constants
//===----------------------------------------------------------------------===//
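// Packs constant values produced by stream.resource.constants ops into one
// or more read-only storage buffers and emits IR that turns that storage
// into usable resources: directly mapping it when the target supports that
// (constants only) or staging and uploading it otherwise.
//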
// NOTE: this pass currently produces suboptimal packing when multiple constant
// pools exist within the module. Each pool is packed independently even if
// there is overlap in subelement spans - as is often the case in multi-device
// partitioning where a constant may be used on multiple devices. By being
// purely local here we end up packing those constants into the final binary
// once per use. A global pass that gathers constants from the whole program
// and splits them into dedicated target-agnostic staging buffers we bake once
// would be much nicer. For now, though, we don't do multi-device so by
// construction there's never a case where this matters; which is a feature :P
class PackConstantsPass : public PackConstantsBase<PackConstantsPass> {
public:
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<mlir::StandardOpsDialect>();
registry.insert<mlir::arith::ArithmeticDialect>();
registry.insert<mlir::scf::SCFDialect>();
registry.insert<IREE::Stream::StreamDialect>();
registry.insert<IREE::Util::UtilDialect>();
}
void runOnOperation() override {
auto parentOp = dyn_cast<CallableOpInterface>(getOperation());
if (!parentOp || !parentOp.getCallableRegion() ||
parentOp.getCallableRegion()->empty()) {
return;
}
parentOp.walk([&](IREE::Stream::ResourceConstantsOp constantsOp) {
// Derive resource constraints based on pack affinity.
auto resourceConfig =
IREE::Stream::ResourceConfigAttr::lookup(constantsOp);
// Gather the slices produced by this constant pooling op.
SmallVector<ConstantSlice> slices;
slices.reserve(constantsOp.results().size());
for (auto it :
llvm::zip(constantsOp.results(), constantsOp.result_sizes(),
constantsOp.values())) {
auto result = std::get<0>(it);
auto resultSize = std::get<1>(it);
auto value = std::get<2>(it);
slices.push_back(ConstantSlice{
result,
resultSize,
value,
});
}
// Perform the packing of dense values to compute the storage resources we
// will need and where each value will be placed.
auto storageResources =
computePackingMap(slices, resourceConfig, constantsOp.getContext());
if (storageResources.empty()) return;
OpBuilder builder(constantsOp);
IndexSet indexSet(constantsOp.getLoc(), builder);
indexSet.populate(constantsOp.result_sizes());
// Emit rodata storage for the constant values.
// As our upload paths may vary this ensures that we are only emitting
// them once regardless of how many strategies we emit IR for.
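// Each emitted op looks roughly like (a sketch; syntax abbreviated and the
// composite contents depend on the packed spans):
//   %rodata = util.byte_buffer.constant : !util.byte_buffer =
//       #util.composite<368xi8, [...]> {alignment = 64 : i64}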
SmallVector<Value> storageBuffers;
for (auto &storageResource : storageResources) {
auto rodataOp = builder.create<IREE::Util::ByteBufferConstantOp>(
storageResource.loc, builder.getType<IREE::Util::ByteBufferType>(),
storageResource.data,
builder.getI64IntegerAttr(
resourceConfig.getMinBufferOffsetAlignment()));
storageBuffers.push_back(rodataOp);
}
// If this op is producing constants (vs variables) we can try to take a
// fast path where we directly map the constant memory. If producing
// variables then we always need to stage and clone.
auto anyResult = constantsOp.results().front();
auto resourceType =
anyResult.getType().cast<IREE::Stream::ResourceType>();
UploadResult uploadResult;
if (resourceType.getLifetime() == IREE::Stream::Lifetime::Constant) {
uploadResult = buildTryMapConstantResources(
constantsOp, resourceType, storageResources, storageBuffers,
indexSet, builder);
} else {
uploadResult =
buildStagingUpload(constantsOp, resourceType, storageResources,
storageBuffers, indexSet, builder);
}
// Build subviews for all packed spans back into storage buffers.
for (auto it : llvm::zip(storageResources, uploadResult.allocations)) {
auto &storageResource = std::get<0>(it);
auto &allocatedStorage = std::get<1>(it);
for (auto &span : storageResource.spans) {
auto loc = span.slice.result.getLoc();
auto subviewOp = builder.create<IREE::Stream::ResourceSubviewOp>(
loc, allocatedStorage.resource, allocatedStorage.resourceSize,
indexSet.get(span.offset), span.slice.resultSize);
span.slice.result.replaceAllUsesWith(subviewOp.result());
}
}
// Join on storage timepoints for our transitive dependencies to await.
constantsOp.result_timepoint().replaceAllUsesWith(uploadResult.timepoint);
constantsOp.erase();
});
}
};
} // namespace
std::unique_ptr<OperationPass<>> createPackConstantsPass() {
return std::make_unique<PackConstantsPass>();
}
} // namespace Stream
} // namespace IREE
} // namespace iree_compiler
} // namespace mlir