| // Copyright 2022 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/hal/utils/resource_set.h" |
| |
| #include "iree/base/tracing.h" |
| |
// Inlines the first chunk into the set's block using all of the remaining
// space. This special-case chunk is released back to the pool together with
// the resource set itself, avoiding an additional allocation.
| static void iree_hal_resource_set_setup_inline_chunk( |
| iree_hal_resource_set_t* set) { |
| uint8_t* block_ptr = (uint8_t*)set + sizeof(*set); |
| iree_hal_resource_set_chunk_t* inlined_chunk = |
| (iree_hal_resource_set_chunk_t*)block_ptr; |
  inlined_chunk->next_chunk = NULL;
  inlined_chunk->flags = IREE_HAL_RESOURCE_SET_CHUNK_FLAG_INLINE;
  // NOTE: measured against usable_block_size so that the resource list never
  // overlaps the iree_arena_block_t header stored at the end of the block.
  inlined_chunk->capacity =
      (set->block_pool->usable_block_size - sizeof(*set) -
       sizeof(*inlined_chunk)) /
      sizeof(iree_hal_resource_t*);
  inlined_chunk->capacity = iree_min(inlined_chunk->capacity,
                                     IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
  inlined_chunk->count = 0;
| set->chunk_head = inlined_chunk; |
| } |
| |
| IREE_API_EXPORT iree_status_t iree_hal_resource_set_allocate( |
| iree_arena_block_pool_t* block_pool, iree_hal_resource_set_t** out_set) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
  // We could allow larger sizes (it would require widening the capacity/count
  // fields in the chunk) but in real usage even 64KB blocks are more than
  // enough.
| IREE_ASSERT_LE(block_pool->total_block_size, 64 * 1024, |
| "keep block sizes small for resource sets"); |
| |
| // Acquire block and place the set struct at the head. |
| iree_arena_block_t* block = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_arena_block_pool_acquire(block_pool, &block)); |
| uint8_t* block_ptr = (uint8_t*)block - block_pool->usable_block_size; |
| iree_hal_resource_set_t* set = (iree_hal_resource_set_t*)block_ptr; |
| memset(set, 0, sizeof(*set)); |
| set->block_pool = block_pool; |
| iree_hal_resource_set_setup_inline_chunk(set); |
| |
| *out_set = set; |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| static void iree_hal_resource_set_release_blocks(iree_hal_resource_set_t* set, |
| bool preserve_set) { |
  // Release all resources in all chunks and stitch the backing blocks into a
  // linked list so they can be returned to the block pool in one operation.
  // Ideally we'd maintain the linked list in our chunks but there's some
  // weirdness with prefix/suffix headers/footers that isn't worth the
  // complexity.
| iree_arena_block_t* block_head = NULL; |
| iree_arena_block_t* block_tail = NULL; |
| iree_hal_resource_set_chunk_t* chunk = set->chunk_head; |
| while (chunk) { |
| // Release all resources in the chunk. |
| for (iree_host_size_t i = 0; i < chunk->count; ++i) { |
| iree_hal_resource_release(chunk->resources[i]); |
| } |
| // Consume the chunk and add it to the block pool release linked list. |
| iree_hal_resource_set_chunk_t* next_chunk = chunk->next_chunk; |
| iree_arena_block_t* block = NULL; |
| if (iree_hal_resource_set_chunk_is_stored_inline(chunk)) { |
| // This is the inlined first chunk that also stores the set header. |
| // If we are not freeing the set then we don't release the block back to |
| // the pool. |
| if (preserve_set) { |
| // Don't release the block. |
| break; |
| } else { |
| block = (iree_arena_block_t*)((uint8_t*)set + |
| set->block_pool->usable_block_size); |
| next_chunk = NULL; |
| } |
| } else { |
| // A chunk acquired after the set was acquired. |
| block = (iree_arena_block_t*)((uint8_t*)chunk + |
| set->block_pool->usable_block_size); |
| } |
| block->next = block_head; |
| block_head = block; |
| if (!block_tail) block_tail = block; |
| chunk = next_chunk; |
| } |
| |
  // Release all blocks back to the block pool in one operation.
  // NOTE: when |preserve_set| is false this invalidates the |set| memory.
  iree_arena_block_pool_t* block_pool = set->block_pool;
  if (block_head != NULL) {
    iree_arena_block_pool_release(block_pool, block_head, block_tail);
  }
| } |
| |
| IREE_API_EXPORT void iree_hal_resource_set_free(iree_hal_resource_set_t* set) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // Release all resources and the arena block used by the set. |
| // The set pointer is invalid after this call returns. |
| iree_hal_resource_set_release_blocks(set, /*preserve_set=*/false); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| IREE_API_EXPORT void iree_hal_resource_set_reset(iree_hal_resource_set_t* set) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // Release all resources and the blocks besides the base set. |
| iree_hal_resource_set_release_blocks(set, /*preserve_set=*/true); |
| |
| // Reset the set state. |
| memset(set->mru, 0, sizeof(set->mru)); |
| iree_hal_resource_set_setup_inline_chunk(set); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| // Retains |resource| and adds it to the main |set| list. |
| static iree_status_t iree_hal_resource_set_insert_retain( |
| iree_hal_resource_set_t* set, iree_hal_resource_t* resource) { |
| iree_hal_resource_set_chunk_t* chunk = set->chunk_head; |
  if (IREE_UNLIKELY(chunk->count + 1 > chunk->capacity)) {
    // Ran out of room in the current chunk - acquire a new one and link it
    // into the list of chunks.
    iree_arena_block_t* block = NULL;
    IREE_RETURN_IF_ERROR(
        iree_arena_block_pool_acquire(set->block_pool, &block));
    chunk =
        (iree_hal_resource_set_chunk_t*)((uint8_t*)block -
                                         set->block_pool->usable_block_size);
    chunk->next_chunk = set->chunk_head;
    set->chunk_head = chunk;
    chunk->flags = 0;  // not inline; released directly back to the pool
    chunk->capacity = (set->block_pool->usable_block_size - sizeof(*chunk)) /
                      sizeof(iree_hal_resource_t*);
    chunk->capacity =
        iree_min(chunk->capacity, IREE_HAL_RESOURCE_SET_CHUNK_MAX_CAPACITY);
    chunk->count = 0;
  }
| } |
| |
| // Retain and insert into the chunk. |
| chunk->resources[chunk->count++] = resource; |
| iree_hal_resource_retain(resource); |
| return iree_ok_status(); |
| } |
| |
// Scans the MRU lookaside for the resource pointer and updates the recency
// order if found: the matching entry is moved to the front of the list.
// If the resource is not found it is retained and inserted into the main
// source-of-truth list as well as the MRU.
| // |
| // Example (hit): |
| // +----+----+----+----+ |
| // | AA | BB | CC | DD | resource: CC |
| // +----+----+----+----+ |
| // scan mru to find CC: |
| // found at mru[2] |
| // shift prefix down 1: |
| // +----+----+----+----+ |
| // | AA | AA | BB | DD | |
| // +----+----+----+----+ |
| // insert resource at front: |
| // +----+----+----+----+ |
| // | CC | AA | BB | DD | |
| // +----+----+----+----+ |
| // |
| // Example (miss): |
| // +----+----+----+----+ |
| // | AA | BB | CC | DD | resource: EE |
| // +----+----+----+----+ |
| // scan mru to find EE: not found |
// shift whole mru down 1:
| // +----+----+----+----+ |
| // | AA | AA | BB | CC | |
| // +----+----+----+----+ |
| // insert resource at front: |
| // +----+----+----+----+ |
| // | EE | AA | BB | CC | |
| // +----+----+----+----+ |
| // insert resource into main list |
| // |
| // The intent here is that we can model this behavior with SIMD ops to perform |
| // both the scan and update using comparison, extraction, and permutation. The |
| // best and worst case flows will load the entire MRU into registers from a |
| // single cache line, do all the scanning and shifting in registers, and then |
| // store back to the single cache line. |
| // |
| // Today, though, we leave this as an exercise to whoever comes across this :) |
| // Notes: |
| // As the MRU is a fixed size we can unroll it entirely and avoid any looping. |
| // On a 32-bit system with uint32x4_t we only need 4 registers. |
| // On a 64-bit system with uint64x2_t we also only need 4 registers - though |
| // the MRU has half as many entries and we may want to go >1 cache line. |
| // |
| // If we wanted to process more than one resource at a time we can specialize |
| // the code paths to handle 1/2/4/etc resources and process in batches with |
| // an optional remainder. This would increase the ratio of work performed on |
| // the loaded MRU registers before we do the shift/store. |
| // |
// The instruction sequence we likely want is something like:
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u32 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u32 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u32 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32 |
| // or |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vdupq_n_u64 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vceqq_u64 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vorrq_u64 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vreinterpretq_u64_u32 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vmaxvq_u32 |
// This would yield whether the pointer was found, but instead of maxing at
// the end we can use the produced masks to extract a single register
// indicating which positions are hits and use that to permute the registers
// into the proper order. At the end we could use a table instruction to
// remap and extract a byte/bitmap of the indices that need to be inserted
// into the main set.
| // |
| // The shifting can be performed with |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u32 |
| // https://developer.arm.com/architectures/instruction-sets/intrinsics/vextq_u64 |
// This forms its result from the top elements of the LHS followed by the
// bottom elements of the RHS and can be cascaded to shift the whole MRU
// down.
| // |
| // We can use SIMDE as a rosetta stone for getting neon/avx/wasm/etc: |
| // https://github.com/simd-everywhere/simde/blob/master/simde/arm/neon/ceq.h#L591 |
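//
// A minimal NEON sketch of just the hit-test half (illustrative only, not
// wired in; assumes 32-bit pointers, a 16-entry MRU in one 64-byte line, and
// <arm_neon.h>; note vmaxvq_u32 is A64-only, so AArch32 would need a
// pairwise max instead):
//
//   const uint32_t* mru_words = (const uint32_t*)set->mru;
//   uint32x4_t needle = vdupq_n_u32((uint32_t)(uintptr_t)resource);
//   uint32x4_t eq0 = vceqq_u32(vld1q_u32(mru_words + 0), needle);
//   uint32x4_t eq1 = vceqq_u32(vld1q_u32(mru_words + 4), needle);
//   uint32x4_t eq2 = vceqq_u32(vld1q_u32(mru_words + 8), needle);
//   uint32x4_t eq3 = vceqq_u32(vld1q_u32(mru_words + 12), needle);
//   uint32x4_t any = vorrq_u32(vorrq_u32(eq0, eq1), vorrq_u32(eq2, eq3));
//   bool hit = vmaxvq_u32(any) != 0;
//
// The eq0..eq3 masks would then feed the permutation/shift step via the
// vextq cascade described above.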
| static iree_status_t iree_hal_resource_set_insert_1( |
| iree_hal_resource_set_t* set, iree_hal_resource_t* resource) { |
| // Scan and hope for a hit. |
| for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(set->mru); ++i) { |
| if (set->mru[i] != resource) continue; |
    // Hit - keep the list sorted by most->least recently used: shift the
    // preceding i entries down one slot and store the resource at index 0.
| if (i > 0) { |
| memmove(&set->mru[1], &set->mru[0], sizeof(set->mru[0]) * i); |
| set->mru[0] = resource; |
| } |
| return iree_ok_status(); |
| } |
| |
| // Miss - insert into the main list (slow path). |
| // Note that we do this before updating the MRU in case allocation fails - we |
| // don't want to keep the pointer around unless we've really retained it. |
| IREE_RETURN_IF_ERROR(iree_hal_resource_set_insert_retain(set, resource)); |
| |
| // Shift the MRU down and insert the new item at the head. |
| memmove(&set->mru[1], &set->mru[0], |
| sizeof(set->mru[0]) * (IREE_ARRAYSIZE(set->mru) - 1)); |
| set->mru[0] = resource; |
| |
| return iree_ok_status(); |
| } |
| |
| IREE_API_EXPORT iree_status_t |
| iree_hal_resource_set_insert(iree_hal_resource_set_t* set, |
| iree_host_size_t count, const void* resources) { |
  // For now we process one resource at a time. We could instead slice off
  // batches of 4/8/16/32 resources to amortize the cost of the MRU update and
  // the insertion slow path; today every miss that requires an insertion
  // re-checks chunk capacity and so on.
| iree_hal_resource_t* const* typed_resources = |
| (iree_hal_resource_t* const*)resources; |
| for (iree_host_size_t i = 0; i < count; ++i) { |
| IREE_RETURN_IF_ERROR( |
| iree_hal_resource_set_insert_1(set, typed_resources[i])); |
| } |
| return iree_ok_status(); |
| } |