blob: 14e5871b3a0243ce5792e62fe4ff949a8fbc35fb [file] [log] [blame]
// Copyright 2022 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/utils/resource_set.h"
#include "iree/base/tracing.h"
// Inlines the first chunk into the block using all of the remaining space.
// This is a special case chunk that is released back to the pool with the
// resource set and lets us avoid an additional allocation.
static void iree_hal_resource_set_setup_inline_chunk(
iree_hal_resource_set_t* set) {
uint8_t* block_ptr = (uint8_t*)set + sizeof(*set);
iree_hal_resource_set_chunk_t* inlined_chunk =
(iree_hal_resource_set_chunk_t*)block_ptr;
inlined_chunk->flags = IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE;
inlined_chunk->capacity = (set->block_pool->total_block_size - sizeof(*set) -
sizeof(*inlined_chunk)) /
sizeof(iree_hal_resource_t*);
inlined_chunk->capacity = iree_min(inlined_chunk->capacity,
IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
inlined_chunk->count = 0;
set->chunk_head = inlined_chunk;
}
IREE_API_EXPORT iree_status_t iree_hal_resource_set_allocate(
iree_arena_block_pool_t* block_pool, iree_hal_resource_set_t** out_set) {
IREE_TRACE_ZONE_BEGIN(z0);
// We could allow larger sizes (would require widening the capacity/count
// fields in the chunk) but in real usage having even 64k is a bit too much.
IREE_ASSERT_LE(block_pool->total_block_size, 64 * 1024,
"keep block sizes small for resource sets");
// Acquire block and place the set struct at the head.
iree_arena_block_t* block = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_arena_block_pool_acquire(block_pool, &block));
uint8_t* block_ptr = (uint8_t*)block - block_pool->usable_block_size;
iree_hal_resource_set_t* set = (iree_hal_resource_set_t*)block_ptr;
memset(set, 0, sizeof(*set));
set->block_pool = block_pool;
iree_hal_resource_set_setup_inline_chunk(set);
*out_set = set;
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
static void iree_hal_resource_set_release_blocks(iree_hal_resource_set_t* set,
bool preserve_set) {
// Release all resources in all chunks and stitch together the blocks in a
// linked list. We do this first so that we can release all of the chunks back
// to the block pool in one operation. Ideally we'd maintain the linked list
// in our chunks but there's some weirdness with prefix/suffix header/footers
// that isn't worth the complexity.
iree_arena_block_t* block_head = NULL;
iree_arena_block_t* block_tail = NULL;
iree_hal_resource_set_chunk_t* chunk = set->chunk_head;
while (chunk) {
// Release all resources in the chunk.
for (iree_host_size_t i = 0; i < chunk->count; ++i) {
iree_hal_resource_release(chunk->resources[i]);
}
// Consume the chunk and add it to the block pool release linked list.
iree_hal_resource_set_chunk_t* next_chunk = chunk->next_chunk;
iree_arena_block_t* block = NULL;
if (iree_hal_resource_set_chunk_is_stored_inline(chunk)) {
// This is the inlined first chunk that also stores the set header.
// If we are not freeing the set then we don't release the block back to
// the pool.
if (preserve_set) {
// Don't release the block.
break;
} else {
block = (iree_arena_block_t*)((uint8_t*)set +
set->block_pool->usable_block_size);
next_chunk = NULL;
}
} else {
// A chunk acquired after the set was acquired.
block = (iree_arena_block_t*)((uint8_t*)chunk +
set->block_pool->usable_block_size);
}
block->next = block_head;
block_head = block;
if (!block_tail) block_tail = block;
chunk = next_chunk;
}
// Release all blocks back to the block pool in one operation.
// NOTE: this invalidates the |set| memory.
iree_arena_block_pool_t* block_pool = set->block_pool;
iree_arena_block_pool_release(block_pool, block_head, block_tail);
}
IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) {
IREE_TRACE_ZONE_BEGIN(z0);
// Release all resources and the arena block used by the set.
// The set pointer is invalid after this call returns.
iree_hal_resource_set_release_blocks(set, /*preserve_set=*/false);
IREE_TRACE_ZONE_END(z0);
}
IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set) {
IREE_TRACE_ZONE_BEGIN(z0);
// Release all resources and the blocks besides the base set.
iree_hal_resource_set_release_blocks(set, /*preserve_set=*/true);
// Reset the set state.
memset(set->mru, 0, sizeof(set->mru));
iree_hal_resource_set_setup_inline_chunk(set);
IREE_TRACE_ZONE_END(z0);
}
// Retains |resource| and adds it to the main |set| list.
static iree_status_t iree_hal_resource_set_insert_retain(
iree_hal_resource_set_t* set, iree_hal_resource_t* resource) {
iree_hal_resource_set_chunk_t* chunk = set->chunk_head;
if (IREE_UNLIKELY(chunk->count + 1 > chunk->capacity)) {
// Ran out of room in the current chunk - acquire a new one and link it into
// the list of chunks.
iree_arena_block_t* block = NULL;
IREE_RETURN_IF_ERROR(
iree_arena_block_pool_acquire(set->block_pool, &block));
chunk =
(iree_hal_resource_set_chunk_t*)((uint8_t*)block -
set->block_pool->usable_block_size);
chunk->next_chunk = set->chunk_head;
set->chunk_head = chunk;
chunk->capacity = (set->block_pool->total_block_size - sizeof(*chunk)) /
sizeof(iree_hal_resource_t*);
chunk->capacity =
iree_min(chunk->capacity, IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
chunk->count = 0;
}
// Retain and insert into the chunk.
chunk->resources[chunk->count++] = resource;
iree_hal_resource_retain(resource);
return iree_ok_status();
}
// Scans the lookaside for the resource pointer and updates the order if found.
// If the resource was not found then it will be inserted into the main list as
// well as the MRU.
//
// This performs a full scan over the MRU and if the resource is found will
// move the resource to the front of the list before returning. Otherwise the
// resource will be retained in the main source-of-truth list.
//
// Example (hit):
// +----+----+----+----+
// | AA | BB | CC | DD | resource: CC
// +----+----+----+----+
// scan mru to find CC:
// found at mru[2]
// shift prefix down 1:
// +----+----+----+----+
// | AA | AA | BB | DD |
// +----+----+----+----+
// insert resource at front:
// +----+----+----+----+
// | CC | AA | BB | DD |
// +----+----+----+----+
//
// Example (miss):
// +----+----+----+----+
// | AA | BB | CC | DD | resource: EE
// +----+----+----+----+
// scan mru to find EE: not found
// shift set down 1:
// +----+----+----+----+
// | AA | AA | BB | CC |
// +----+----+----+----+
// insert resource at front:
// +----+----+----+----+
// | EE | AA | BB | CC |
// +----+----+----+----+
// insert resource into main list
//
// The intent here is that we can model this behavior with SIMD ops to perform
// both the scan and update using comparison, extraction, and permutation. The
// best and worst case flows will load the entire MRU into registers from a
// single cache line, do all the scanning and shifting in registers, and then
// store back to the single cache line.
//
// Today, though, we leave this as an exercise to whoever comes across this :)
// Notes:
// As the MRU is a fixed size we can unroll it entirely and avoid any looping.
// On a 32-bit system with uint32x4_t we only need 4 registers.
// On a 64-bit system with uint64x2_t we also only need 4 registers - though
// the MRU has half as many entries and we may want to go >1 cache line.
//
// If we wanted to process more than one resource at a time we can specialize
// the code paths to handle 1/2/4/etc resources and process in batches with
// an optional remainder. This would increase the ratio of work performed on
// the loaded MRU registers before we do the shift/store.
//
// The tree sequence we likely want is something like:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u32
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u32
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u32
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32
// or
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u64
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u64
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u64
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u32
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32
// This would yield whether the pointer was found, but instead of maxing at
// the end we can use the produced mask to extract out a single register with
// which positions are hits and use that to then permute the registers into
// the proper order. At the end we could use a table instruction to remap and
// extract out a byte/bitmap of the indices that we need to insert into the
// main set.
//
// The shifting can be performed with
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u32
// https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u64
// This takes n low elements of LHS and rest from RHS and we can cascade them
// to shift down the whole MRU.
//
// We can use SIMDE as a rosetta stone for getting neon/avx/wasm/etc:
// https://github.com/simd-everywhere/simde/blob/master/simde/arm/neon/ceq.h#L591
static iree_status_t iree_hal_resource_set_insert_1(
iree_hal_resource_set_t* set, iree_hal_resource_t* resource) {
// Scan and hope for a hit.
for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(set->mru); ++i) {
if (set->mru[i] != resource) continue;
// Hit - keep the list sorted by most->least recently used.
// We shift the MRU down to make room at index 0 and store the
// resource there.
if (i > 0) {
memmove(&set->mru[1], &set->mru[0], sizeof(set->mru[0]) * i);
set->mru[0] = resource;
}
return iree_ok_status();
}
// Miss - insert into the main list (slow path).
// Note that we do this before updating the MRU in case allocation fails - we
// don't want to keep the pointer around unless we've really retained it.
IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_retain(set, resource));
// Shift the MRU down and insert the new item at the head.
memmove(&set->mru[1], &set->mru[0],
sizeof(set->mru[0]) * (IREE_ARRAYSIZE(set->mru) - 1));
set->mru[0] = resource;
return iree_ok_status();
}
IREE_API_EXPORT iree_status_t
iree_hal_resource_set_insert(iree_hal_resource_set_t* set,
iree_host_size_t count, const void* resources) {
// For now we process one at a time. We should have a stride that lets us
// amortize the cost of doing the MRU update and insertion allocation by
// say slicing off 4/8/16/32 resources at a time etc. Today each miss that
// requires a full insertion goes down the whole path of checking chunk
// capacity and such.
iree_hal_resource_t* const* typed_resources =
(iree_hal_resource_t* const*)resources;
for (iree_host_size_t i = 0; i < count; ++i) {
IREE_RETURN_IF_ERROR(
iree_hal_resource_set_insert_1(set, typed_resources[i]));
}
return iree_ok_status();
}