// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <list>
#include "iree/compiler/Dialect/Stream/IR/StreamDialect.h"
#include "iree/compiler/Dialect/Stream/IR/StreamOps.h"
#include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
#include "iree/compiler/Dialect/Stream/Transforms/PassDetail.h"
#include "iree/compiler/Dialect/Stream/Transforms/Passes.h"
#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
#include "iree/compiler/Utils/IndexSet.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#define DEBUG_TYPE "iree-stream-layout-slices"
namespace mlir {
namespace iree_compiler {
namespace IREE {
namespace Stream {
namespace {
using Slice = IREE::Stream::ResourcePackOp::Slice;
// Packs slices back-to-back with no aliasing. Useful when debugging to remove
// the aliasing that makes data breakpoints useless.
//
// Slice packed offset SSA values will be updated and start at the given
// |baseOffset|. Returns |baseOffset| + the total size of the allocation
// aligned to the requirements of |resourceConfig|.
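//
// For example (illustrative only; assuming |baseOffset| = 0 and
// offsetAlignment = rangeAlignment = 16): packing slices of sizes [100, 40]
// yields packed offsets [0, 112] and a returned total size of 160.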
static Value packSlicesWithNoAliasing(
IREE::Stream::ResourcePackOp packOp, Value baseOffset,
ArrayRef<Slice> slices, IREE::Stream::ResourceConfigAttr resourceConfig,
IndexSet &indexSet, OpBuilder &builder) {
auto loc = packOp.getLoc();
int64_t offsetAlignment = resourceConfig.getMinBufferOffsetAlignment();
int64_t rangeAlignment = resourceConfig.getMinBufferRangeAlignment();
Value offset = baseOffset;
for (auto &slice : slices) {
auto sliceSize = builder.createOrFold<IREE::Util::AlignOp>(
loc, slice.dynamicSize, rangeAlignment);
slice.packedOffset.replaceAllUsesWith(offset);
auto valueToAlign =
builder.createOrFold<arith::AddIOp>(loc, offset, sliceSize);
offset = builder.createOrFold<IREE::Util::AlignOp>(loc, valueToAlign,
offsetAlignment);
}
return builder.createOrFold<IREE::Util::AlignOp>(loc, offset, rangeAlignment);
}
// Packs a set of statically-sized slices by greedy strip packing.
//
// This is the same algorithm used in tflite here:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/simple_memory_arena.cc
// It's not fantastic and can end up with a significant amount of wastage.
// tflite does its packing at runtime and as such cares more about packing
// performance than we do here, where the packing happens offline at compile
// time. For this initial version I wanted to ensure we matched their
// implementation apples-to-apples.
//
// Longer term we should run a set of heuristics and pick whichever produces
// the smallest layout. There are also some really great papers with better
// approximations (all of these are approximations, as 2D strip packing is
// NP-hard), such as
// https://www.sciencedirect.com/science/article/pii/S0925772113001016, that
// someone comfortable with mathy papers could try implementing.
//
// Slice packed offset SSA values will be updated and start at the given
// |baseOffset|. Returns |baseOffset| + the total size of the allocation
// aligned to the requirements of |resourceConfig|.
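//
// Worked example (illustrative only; alignments of 1 for readability):
//   slice A: lifetime [0,2], size 64 -> placed at offset 0
//   slice B: lifetime [1,3], size 32 -> overlaps A in lifetime -> offset 64
//   slice C: lifetime [4,6], size 64 -> overlaps neither -> reuses offset 0
// The highwater mark (and thus the returned size) is |baseOffset| + 96.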
static Value packStaticSlicesGreedily(
IREE::Stream::ResourcePackOp packOp, Value baseOffset,
ArrayRef<Slice> slices, IREE::Stream::ResourceConfigAttr resourceConfig,
IndexSet &indexSet, OpBuilder &builder) {
int64_t offsetAlignment = resourceConfig.getMinBufferOffsetAlignment();
int64_t rangeAlignment = resourceConfig.getMinBufferRangeAlignment();
struct Reservation {
const Slice *slice = nullptr;
int64_t staticOffset = 0;
int64_t staticSize = 0;
};
static constexpr int64_t UNASSIGNED = INT64_MAX;
std::list<Reservation> reservations;
int64_t highwaterMark = 0;
for (auto &slice : slices) {
int64_t bestOffset = UNASSIGNED;
int64_t bestOffsetFit = UNASSIGNED;
int64_t staticSize =
cast<arith::ConstantIndexOp>(slice.dynamicSize.getDefiningOp()).value();
int64_t alignedSize = IREE::Util::align(staticSize, rangeAlignment);
// Iterate through reservations (sorted by ascending offset) and identify
// gaps in which the slice will fit. To reduce wastage we want to find the
// smallest gap.
int64_t currentOffset = 0;
for (auto &reservation : reservations) {
if (!reservation.slice->intersects(slice)) {
// Non-overlapping - we can reuse the currentOffset (assuming we find
// no better place).
continue;
}
// If we found a gap that is >= the required size and smaller than the
// previous best fit, take it.
int64_t alignedOffset = IREE::Util::align(currentOffset, offsetAlignment);
if (alignedOffset + alignedSize <= reservation.staticOffset &&
reservation.staticOffset - alignedOffset < bestOffsetFit) {
bestOffset = alignedOffset;
bestOffsetFit = reservation.staticOffset - currentOffset;
}
currentOffset = std::max(
currentOffset, reservation.staticOffset + reservation.staticSize);
}
if (bestOffset == UNASSIGNED) {
bestOffset = IREE::Util::align(currentOffset, offsetAlignment);
}
// Reserve the memory.
Reservation reservation;
reservation.slice = &slice;
reservation.staticOffset = bestOffset;
reservation.staticSize = alignedSize;
auto insertionIt = reservations.begin();
while (insertionIt != reservations.end() &&
insertionIt->staticOffset < reservation.staticOffset) {
++insertionIt;
}
reservations.insert(insertionIt, reservation);
slice.packedOffset.replaceAllUsesWith(builder.createOrFold<arith::AddIOp>(
packOp.getLoc(), baseOffset, indexSet.get(bestOffset)));
// Update highwater mark indicating how much memory needs to be allocated
// for the entire slab.
highwaterMark = std::max(highwaterMark, bestOffset + alignedSize);
}
highwaterMark = IREE::Util::align(highwaterMark, rangeAlignment);
return builder.createOrFold<arith::AddIOp>(packOp.getLoc(), baseOffset,
indexSet.get(highwaterMark));
}
// Packs a set of dynamically-sized slices based on the structural information
// in the IR. Only slices that have the exact same size will be allowed to
// alias.
//
// We could improve this if we knew the relative ordering of sizes: we could
// then reuse an allocation bucket with non-overlapping lifetimes whenever
// the slice being packed is known to be <= the bucket size. If we knew that
// certain sizes were bounded by an absolute value we could also alias with
// static allocations; e.g. if %sz is known to be < 1000B it could reuse any
// static allocation >= 1000B.
//
// We could also emit code for efficient runtime bucketing by providing the
// sorted, compacted, delta-coded lifetime intervals and runtime-computed
// sizes if we wanted to produce the smallest buffers.
//
// Slice packed offset SSA values will be updated and start at the given
// |baseOffset|. Returns |baseOffset| + the total size of the allocation
// aligned to the requirements of |resourceConfig|.
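//
// Worked example (illustrative only; alignments of 1 for readability), with
// three slices all sized by the same SSA value %sz:
//   slice X: lifetime [0,1] -> new bin at |baseOffset|
//   slice Y: lifetime [4,5] -> no lifetime overlap with X -> same offset as X
//   slice Z: lifetime [0,5] -> overlaps both -> new bin at |baseOffset| + %sz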
static Value packDynamicSlicesConservatively(
IREE::Stream::ResourcePackOp packOp, Value baseOffset,
ArrayRef<Slice> slices, IREE::Stream::ResourceConfigAttr resourceConfig,
IndexSet &indexSet, OpBuilder &builder) {
auto loc = packOp.getLoc();
int64_t offsetAlignment = resourceConfig.getMinBufferOffsetAlignment();
int64_t rangeAlignment = resourceConfig.getMinBufferRangeAlignment();
// Bucket all slices by their size SSA value. We rely on shapes being
// lowered to computed byte sizes and CSE to then dedupe the values for us.
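// For example (illustrative), two slices whose byte sizes are both computed
// by the same `arith.muli %dim, %c4` value will share a single bucket here.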
llvm::MapVector<Value, SmallVector<const Slice *>> slicesBySize;
for (auto &slice : slices) {
slicesBySize[slice.dynamicSize].push_back(&slice);
}
// Allocate each bucket while observing the lifetime of the slices within.
// Two or more slices may overlap in lifetime and need their own unique
// reservation.
Value offset = baseOffset;
for (auto &sizeBucket : slicesBySize) {
auto sliceSize = builder.createOrFold<IREE::Util::AlignOp>(
loc, sizeBucket.first, rangeAlignment);
auto &slices = sizeBucket.second;
std::stable_sort(slices.begin(), slices.end());
// Bin the slices such that no two slices within a bin overlap in lifetime.
// All of the allocations in each bin can then alias.
// NOTE: O(n^2) in the worst case, but there's usually only a small number
// of bins (<10) as we have already bucketed by size class. We could sort
// and use an interval tree to bring this down to O(n log n).
struct Bin {
Value offset;
SmallVector<const Slice *> slices;
bool intersects(const Slice &slice) const {
for (auto *binSlice : slices) {
if (binSlice->intersects(slice)) return true;
}
return false;
}
};
SmallVector<Bin> bins;
for (auto *slice : slices) {
// Try to find a bin we can reuse (non-intersecting lifetime).
Bin *targetBin = nullptr;
for (auto &bin : bins) {
if (!bin.intersects(*slice)) {
targetBin = &bin;
break;
}
}
if (!targetBin) {
// Allocate a new bin for this slice.
bins.push_back({offset, {}});
targetBin = &bins.back();
auto binSize =
builder.createOrFold<arith::AddIOp>(loc, offset, sliceSize);
offset = builder.createOrFold<IREE::Util::AlignOp>(loc, binSize,
offsetAlignment);
}
targetBin->slices.push_back(slice);
slice->packedOffset.replaceAllUsesWith(targetBin->offset);
}
}
return builder.createOrFold<IREE::Util::AlignOp>(loc, offset, rangeAlignment);
}
//===----------------------------------------------------------------------===//
// -iree-stream-layout-slices
//===----------------------------------------------------------------------===//
class LayoutSlicesPass : public LayoutSlicesBase<LayoutSlicesPass> {
public:
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<mlir::StandardOpsDialect>();
registry.insert<mlir::arith::ArithmeticDialect>();
registry.insert<IREE::Stream::StreamDialect>();
registry.insert<IREE::Util::UtilDialect>();
}
void runOnOperation() override {
auto parentOp = dyn_cast<CallableOpInterface>(getOperation());
if (!parentOp || !parentOp.getCallableRegion() ||
parentOp.getCallableRegion()->empty()) {
return;
}
// NOTE: we could try several algorithms and compute which packs best. For
// now we just pack greedily as it's fast and what most existing ML
// frameworks do.
parentOp.walk([&](IREE::Stream::ResourcePackOp packOp) {
// Derive resource constraints based on pack affinity.
auto resourceConfig = IREE::Stream::ResourceConfigAttr::lookup(packOp);
// Bucket into static and dynamic sizes. Static packing is a much more
// constrained problem.
auto allSlices = packOp.getSlices();
SmallVector<Slice> staticSlices;
SmallVector<Slice> dynamicSlices;
staticSlices.reserve(allSlices.size());
dynamicSlices.reserve(allSlices.size());
for (auto &slice : allSlices) {
if (isa_and_nonnull<arith::ConstantOp>(
slice.dynamicSize.getDefiningOp())) {
staticSlices.push_back(slice);
} else {
dynamicSlices.push_back(slice);
}
}
OpBuilder builder(packOp);
IndexSet indexSet(packOp.getLoc(), builder);
// First pack all static slices as these are entirely knowable here at
// compile time.
auto offset = packOp.offset() ? packOp.offset() : indexSet.get(0);
if (!staticSlices.empty()) {
offset = packStaticSlicesGreedily(packOp, offset, staticSlices,
resourceConfig, indexSet, builder);
// TODO(benvanik): make this an option; it can be useful for debugging
// this code.
// offset = packSlicesWithNoAliasing(packOp, offset, staticSlices,
// resourceConfig, indexSet, builder);
}
// Next pack all dynamic slices. Depending on the analysis information
// available we could reuse static slices with non-overlapping lifetimes
// in some cases.
if (!dynamicSlices.empty()) {
offset = packDynamicSlicesConservatively(
packOp, offset, dynamicSlices, resourceConfig, indexSet, builder);
}
// Total packed length is the current offset after all slices are
// allocated. This should be aligned to the range constraints.
packOp.total_length().replaceAllUsesWith(offset);
packOp.erase();
});
}
};
} // namespace
std::unique_ptr<OperationPass<>> createLayoutSlicesPass() {
return std::make_unique<LayoutSlicesPass>();
}
} // namespace Stream
} // namespace IREE
} // namespace iree_compiler
} // namespace mlir