// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "experimental/cuda2/memory_pools.h"

#include "experimental/cuda2/cuda_buffer.h"
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_status_util.h"
#include "iree/base/tracing.h"

// NOTE: these are currently global for all devices; we could make
// device-specific ones by malloc()ing and leaking (with an LSAN suppression
// note) unique string values instead.
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID =
"CUDA pool: device-local reserved";
static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID =
"CUDA pool: other reserved";
#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
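
// Creates a CUDA memory pool on |cu_device| using the given |params| and
// applies the configured release threshold. If setting the attribute fails
// the partially created pool is destroyed before returning.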
static iree_status_t iree_hal_cuda2_create_memory_pool(
const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
iree_hal_cuda2_memory_pool_params_t params,
CUmemoryPool* IREE_RESTRICT out_pool) {
*out_pool = NULL;
CUmemPoolProps pool_props = {
.allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
// TODO: allow sharing of certain pool memory types by fd/HANDLE.
.handleTypes = CU_MEM_HANDLE_TYPE_NONE,
.location =
{
.type = CU_MEM_LOCATION_TYPE_DEVICE,
.id = cu_device,
},
.win32SecurityAttributes = NULL,
.reserved = {0},
};
CUmemoryPool pool = NULL;
IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props),
"cuMemPoolCreate");
iree_status_t status = IREE_CURESULT_TO_STATUS(
cuda_symbols,
cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
&params.release_threshold),
"cuMemPoolSetAttribute");
if (iree_status_is_ok(status)) {
*out_pool = pool;
} else {
IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool));
}
return status;
}
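
// Creates the device-local and "other" CUDA memory pools from
// |pooling_params| and stores them in |out_pools|.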
iree_status_t iree_hal_cuda2_memory_pools_initialize(
const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
iree_allocator_t host_allocator,
iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) {
IREE_ASSERT_ARGUMENT(cuda_symbols);
IREE_ASSERT_ARGUMENT(pooling_params);
IREE_ASSERT_ARGUMENT(out_pools);
IREE_TRACE_ZONE_BEGIN(z0);
memset(out_pools, 0, sizeof(*out_pools));
out_pools->cuda_symbols = cuda_symbols;
out_pools->host_allocator = host_allocator;
iree_status_t status = iree_ok_status();
if (iree_status_is_ok(status)) {
status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device,
pooling_params->device_local,
&out_pools->device_local);
}
if (iree_status_is_ok(status)) {
status = iree_hal_cuda2_create_memory_pool(
cuda_symbols, cu_device, pooling_params->other, &out_pools->other);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
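
// Destroys any CUDA memory pools previously created during initialization.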
void iree_hal_cuda2_memory_pools_deinitialize(
iree_hal_cuda2_memory_pools_t* pools) {
IREE_TRACE_ZONE_BEGIN(z0);
if (pools->device_local) {
IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
cuMemPoolDestroy(pools->device_local));
pools->device_local = NULL;
}
if (pools->other) {
IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemPoolDestroy(pools->other));
pools->other = NULL;
}
IREE_TRACE_ZONE_END(z0);
}
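
// Records a tracing allocation event for |buffer| and adds its size to the
// device or host allocation statistics based on the buffer memory type.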
static void iree_hal_cuda2_memory_pool_track_alloc(
iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
(void)is_device_local;
iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer);
(void)allocation_size;
IREE_TRACE_ALLOC_NAMED(
is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
: IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
(void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size);
IREE_STATISTICS({
iree_atomic_int64_t* bytes_allocated =
is_device_local ? &pools->statistics.device_bytes_allocated
: &pools->statistics.host_bytes_allocated;
iree_atomic_fetch_add_int64(bytes_allocated, allocation_size,
iree_memory_order_relaxed);
});
}
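
// Records a tracing free event for |buffer| and adds its size to the device
// or host free statistics based on the buffer memory type.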
static void iree_hal_cuda2_memory_pool_track_free(
iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
(void)is_device_local;
IREE_TRACE_FREE_NAMED(is_device_local
? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
: IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
(void*)iree_hal_cuda2_buffer_device_pointer(buffer));
IREE_STATISTICS({
iree_atomic_int64_t* bytes_freed =
is_device_local ? &pools->statistics.device_bytes_freed
: &pools->statistics.host_bytes_freed;
iree_device_size_t allocation_size =
iree_hal_buffer_allocation_size(buffer);
iree_atomic_fetch_add_int64(bytes_freed, allocation_size,
iree_memory_order_relaxed);
});
}
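
// Merges the accumulated pool statistics into |statistics|, querying CUDA for
// the peak used bytes of each pool.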
void iree_hal_cuda2_memory_pools_merge_statistics(
iree_hal_cuda2_memory_pools_t* pools,
iree_hal_allocator_statistics_t* statistics) {
IREE_STATISTICS({
statistics->device_bytes_allocated = iree_atomic_load_int64(
&pools->statistics.device_bytes_allocated, iree_memory_order_relaxed);
statistics->host_bytes_allocated = iree_atomic_load_int64(
&pools->statistics.host_bytes_allocated, iree_memory_order_relaxed);
statistics->device_bytes_freed = iree_atomic_load_int64(
&pools->statistics.device_bytes_freed, iree_memory_order_relaxed);
statistics->host_bytes_freed = iree_atomic_load_int64(
&pools->statistics.host_bytes_freed, iree_memory_order_relaxed);
if (pools->device_local) {
cuuint64_t pool_peak = 0;
IREE_CUDA_IGNORE_ERROR(
pools->cuda_symbols,
cuMemPoolGetAttribute(pools->device_local,
CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak));
statistics->device_bytes_peak += (iree_device_size_t)pool_peak;
}
if (pools->other) {
cuuint64_t pool_peak = 0;
IREE_CUDA_IGNORE_ERROR(
pools->cuda_symbols,
cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
&pool_peak));
statistics->host_bytes_peak += (iree_device_size_t)pool_peak;
}
});
}
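
// Releases unused memory from each pool back to the system while keeping at
// least the configured minimum capacity reserved.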
iree_status_t iree_hal_cuda2_memory_pools_trim(
iree_hal_cuda2_memory_pools_t* pools,
const iree_hal_cuda2_memory_pooling_params_t* pooling_params) {
IREE_CUDA_RETURN_IF_ERROR(
pools->cuda_symbols,
cuMemPoolTrimTo(pools->device_local,
pooling_params->device_local.minimum_capacity),
"cuMemPoolTrimTo");
IREE_CUDA_RETURN_IF_ERROR(
pools->cuda_symbols,
cuMemPoolTrimTo(pools->other, pooling_params->other.minimum_capacity),
"cuMemPoolTrimTo");
return iree_ok_status();
}

// NOTE: this is only issued if the buffer is destroyed without having been
// scheduled for asynchronous deallocation. When a buffer is scheduled for
// deallocation we drop the release callback so that this isn't called and we
// don't double-free.
static void iree_hal_cuda2_async_buffer_release_callback(
void* user_data, iree_hal_buffer_t* buffer) {
iree_hal_cuda2_memory_pools_t* pools =
(iree_hal_cuda2_memory_pools_t*)user_data;
IREE_TRACE_ZONE_BEGIN(z0);
CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr));
iree_hal_cuda2_memory_pool_track_free(pools, buffer);
IREE_TRACE_ZONE_END(z0);
}
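
// Asynchronously allocates |allocation_size| bytes from the device-local or
// "other" pool on |stream| and wraps the result in a HAL buffer.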
iree_status_t iree_hal_cuda2_memory_pools_alloca(
iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size);
iree_hal_buffer_params_canonicalize(&params);
// TODO: more pools and better selection; this is coarsely deciding between
// only device local (variables, constants, transients) and other (staging,
// external) but could use more buffer properties (including usage/export
// flags) to better isolate the different usage patterns and keep the pools
// operating with reasonable limits. We should be using the |pool| arg.
CUmemoryPool memory_pool =
iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)
? pools->device_local
: pools->other;
CUdeviceptr device_ptr = 0;
iree_status_t status = IREE_CURESULT_TO_STATUS(
pools->cuda_symbols,
cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool,
stream),
"cuMemAllocFromPoolAsync");
  // Wrap the allocated CUDA buffer in a HAL buffer.
  // NOTE: we don't provide a device allocator because we didn't allocate from
  // one; instead we rely on a release callback to perform the free if the
  // user never schedules an asynchronous dealloca of the buffer.
iree_hal_buffer_t* buffer = NULL;
if (iree_status_is_ok(status)) {
iree_hal_buffer_release_callback_t release_callback = {
.fn = iree_hal_cuda2_async_buffer_release_callback,
.user_data = pools,
};
status = iree_hal_cuda2_buffer_wrap(
/*device_allocator=*/NULL, params.type, params.access, params.usage,
allocation_size, /*byte_offset=*/0,
/*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator,
&buffer);
}
if (iree_status_is_ok(status)) {
    // Update statistics (note that the allocation is asynchronous so the
    // values may not yet be accurate).
iree_hal_cuda2_memory_pool_track_alloc(pools, buffer);
*out_buffer = buffer;
} else if (buffer) {
iree_hal_buffer_release(buffer);
} else {
IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
cuMemFreeAsync(device_ptr, stream));
}
IREE_TRACE_ZONE_END(z0);
return status;
}
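
// Asynchronously frees |buffer| on |stream| if it originated from one of the
// pools; buffers from synchronous allocations are ignored.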
iree_status_t iree_hal_cuda2_memory_pools_dealloca(
iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
iree_hal_buffer_t* buffer) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE(
z0, (int64_t)iree_hal_buffer_allocation_size(buffer));
// Only process the request if the buffer came from an async pool.
// We may get requests for deallocations on ones that didn't if one part of
// the application allocated the buffer synchronously and another deallocated
// it asynchronously.
iree_status_t status = iree_ok_status();
if (iree_hal_cuda2_buffer_type(buffer) == IREE_HAL_CUDA_BUFFER_TYPE_ASYNC) {
// Try to schedule the buffer for freeing.
CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
status = IREE_CURESULT_TO_STATUS(pools->cuda_symbols,
cuMemFreeAsync(device_ptr, stream),
"cuMemFreeAsync");
if (iree_status_is_ok(status)) {
// Drop the release callback so that we don't try to double-free the
// buffer. Note that we only do this if the CUDA free succeeded as
// otherwise we still need to synchronously deallocate the buffer when it
// is destroyed.
iree_hal_cuda2_buffer_drop_release_callback(buffer);
      // Update statistics (note that the free is asynchronous so the values
      // may not yet be accurate).
iree_hal_cuda2_memory_pool_track_free(pools, buffer);
}
} else {
// Not allocated via alloca, ignore.
IREE_TRACE_ZONE_APPEND_TEXT(z0, "ignored sync allocation");
}
IREE_TRACE_ZONE_END(z0);
return status;
}