// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "experimental/cuda2/memory_pools.h"

#include "experimental/cuda2/cuda_buffer.h"
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_status_util.h"
#include "iree/base/tracing.h"

// NOTE: these IDs are currently global for all devices; we could make
// device-specific ones by malloc()ing and intentionally leaking (with an LSAN
// suppression note) unique string values instead (see the sketch below).
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID =
    "CUDA pool: device-local reserved";
static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID =
    "CUDA pool: other reserved";
#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
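
// A per-device variant would look something like this (hypothetical sketch,
// not wired up; |host_allocator|, |cu_device|, and the fixed 64-byte cap are
// assumptions here):
//   char* id = NULL;
//   if (iree_status_is_ok(
//           iree_allocator_malloc(host_allocator, 64, (void**)&id))) {
//     snprintf(id, 64, "CUDA pool: device-local reserved (device %d)",
//              (int)cu_device);
//     // Intentionally leaked: tracing requires the string to remain live.
//   }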

// Creates a CUDA memory pool on |cu_device| with |params| applied. Currently
// only the release threshold is configurable: the number of reserved bytes
// the pool holds on to before it starts releasing memory back to the OS.
static iree_status_t iree_hal_cuda2_create_memory_pool(
    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
    iree_hal_cuda2_memory_pool_params_t params,
    CUmemoryPool* IREE_RESTRICT out_pool) {
  *out_pool = NULL;

  CUmemPoolProps pool_props = {
      .allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
      // TODO: allow sharing of certain pool memory types by fd/HANDLE.
      .handleTypes = CU_MEM_HANDLE_TYPE_NONE,
      .location =
          {
              .type = CU_MEM_LOCATION_TYPE_DEVICE,
              .id = cu_device,
          },
      .win32SecurityAttributes = NULL,
      .reserved = {0},
  };
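
  // For the fd/HANDLE sharing TODO above, a hypothetical export-capable pool
  // would instead set (on Linux):
  //   .handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR,
  // after which allocations could be shared across processes via
  // cuMemPoolExportToShareableHandle/cuMemPoolImportFromShareableHandle.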

  CUmemoryPool pool = NULL;
  IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props),
                            "cuMemPoolCreate");

  iree_status_t status = IREE_CURESULT_TO_STATUS(
      cuda_symbols,
      cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                            &params.release_threshold),
      "cuMemPoolSetAttribute");

  if (iree_status_is_ok(status)) {
    *out_pool = pool;
  } else {
    IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool));
  }
  return status;
}

iree_status_t iree_hal_cuda2_memory_pools_initialize(
    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
    const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
    iree_allocator_t host_allocator,
    iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) {
  IREE_ASSERT_ARGUMENT(cuda_symbols);
  IREE_ASSERT_ARGUMENT(pooling_params);
  IREE_ASSERT_ARGUMENT(out_pools);
  IREE_TRACE_ZONE_BEGIN(z0);

  memset(out_pools, 0, sizeof(*out_pools));
  out_pools->cuda_symbols = cuda_symbols;
  out_pools->host_allocator = host_allocator;

  iree_status_t status = iree_ok_status();

  if (iree_status_is_ok(status)) {
    status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device,
                                               pooling_params->device_local,
                                               &out_pools->device_local);
  }

  if (iree_status_is_ok(status)) {
    status = iree_hal_cuda2_create_memory_pool(
        cuda_symbols, cu_device, pooling_params->other, &out_pools->other);
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}
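
// Example initialization (hypothetical values; |symbols|, |cu_device|, and
// |host_allocator| are assumed to come from device creation):
//   iree_hal_cuda2_memory_pooling_params_t pooling_params = {
//       .device_local = {.release_threshold = 64 * 1024 * 1024},
//       .other = {.release_threshold = 0},
//   };
//   iree_hal_cuda2_memory_pools_t pools;
//   IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_initialize(
//       symbols, cu_device, &pooling_params, host_allocator, &pools));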

void iree_hal_cuda2_memory_pools_deinitialize(
    iree_hal_cuda2_memory_pools_t* pools) {
  IREE_TRACE_ZONE_BEGIN(z0);

  if (pools->device_local) {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemPoolDestroy(pools->device_local));
    pools->device_local = NULL;
  }

  if (pools->other) {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemPoolDestroy(pools->other));
    pools->other = NULL;
  }

  IREE_TRACE_ZONE_END(z0);
}

// Records an asynchronous pool allocation in tracing and statistics.
static void iree_hal_cuda2_memory_pool_track_alloc(
    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
  (void)is_device_local;  // unused if tracing/statistics are compiled out
  iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer);
  (void)allocation_size;  // unused if tracing/statistics are compiled out
  IREE_TRACE_ALLOC_NAMED(
      is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
                      : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
      (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size);
  IREE_STATISTICS({
    iree_atomic_int64_t* bytes_allocated =
        is_device_local ? &pools->statistics.device_bytes_allocated
                        : &pools->statistics.host_bytes_allocated;
    iree_atomic_fetch_add_int64(bytes_allocated, allocation_size,
                                iree_memory_order_relaxed);
  });
}

// Records the release of an asynchronous pool allocation in tracing and
// statistics; the mirror of iree_hal_cuda2_memory_pool_track_alloc.
static void iree_hal_cuda2_memory_pool_track_free(
    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
  (void)is_device_local;  // unused if tracing/statistics are compiled out
  IREE_TRACE_FREE_NAMED(is_device_local
                            ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
                            : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
                        (void*)iree_hal_cuda2_buffer_device_pointer(buffer));
  IREE_STATISTICS({
    iree_atomic_int64_t* bytes_freed =
        is_device_local ? &pools->statistics.device_bytes_freed
                        : &pools->statistics.host_bytes_freed;
    iree_device_size_t allocation_size =
        iree_hal_buffer_allocation_size(buffer);
    iree_atomic_fetch_add_int64(bytes_freed, allocation_size,
                                iree_memory_order_relaxed);
  });
}

void iree_hal_cuda2_memory_pools_merge_statistics(
    iree_hal_cuda2_memory_pools_t* pools,
    iree_hal_allocator_statistics_t* statistics) {
  IREE_STATISTICS({
    statistics->device_bytes_allocated = iree_atomic_load_int64(
        &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed);
    statistics->host_bytes_allocated = iree_atomic_load_int64(
        &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed);
    statistics->device_bytes_freed = iree_atomic_load_int64(
        &pools->statistics.device_bytes_freed, iree_memory_order_relaxed);
    statistics->host_bytes_freed = iree_atomic_load_int64(
        &pools->statistics.host_bytes_freed, iree_memory_order_relaxed);
    // CU_MEMPOOL_ATTR_USED_MEM_HIGH is the high watermark of pool memory in
    // use by the application since the counter was last reset.
    if (pools->device_local) {
      cuuint64_t pool_peak = 0;
      IREE_CUDA_IGNORE_ERROR(
          pools->cuda_symbols,
          cuMemPoolGetAttribute(pools->device_local,
                                CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak));
      statistics->device_bytes_peak += (iree_device_size_t)pool_peak;
    }
    if (pools->other) {
      cuuint64_t pool_peak = 0;
      IREE_CUDA_IGNORE_ERROR(
          pools->cuda_symbols,
          cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
                                &pool_peak));
      statistics->host_bytes_peak += (iree_device_size_t)pool_peak;
    }
  });
}

iree_status_t iree_hal_cuda2_memory_pools_trim(
    iree_hal_cuda2_memory_pools_t* pools,
    const iree_hal_cuda2_memory_pooling_params_t* pooling_params) {
  IREE_CUDA_RETURN_IF_ERROR(
      pools->cuda_symbols,
      cuMemPoolTrimTo(pools->device_local,
                      pooling_params->device_local.minimum_capacity),
      "cuMemPoolTrimTo");
  IREE_CUDA_RETURN_IF_ERROR(
      pools->cuda_symbols,
      cuMemPoolTrimTo(pools->other, pooling_params->other.minimum_capacity),
      "cuMemPoolTrimTo");
  return iree_ok_status();
}
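
// Example (hypothetical): release all unused reserved memory back to the
// driver by trimming with zero minimum capacities:
//   iree_hal_cuda2_memory_pooling_params_t zero_params = {0};
//   IREE_RETURN_IF_ERROR(
//       iree_hal_cuda2_memory_pools_trim(&pools, &zero_params));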

// NOTE: this is only issued if the buffer is destroyed without having been
// scheduled for asynchronous deallocation. When a buffer is scheduled we drop
// the release callback so that this isn't called and we don't double-free.
static void iree_hal_cuda2_async_buffer_release_callback(
    void* user_data, iree_hal_buffer_t* buffer) {
  iree_hal_cuda2_memory_pools_t* pools =
      (iree_hal_cuda2_memory_pools_t*)user_data;
  IREE_TRACE_ZONE_BEGIN(z0);

  CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
  IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr));
  iree_hal_cuda2_memory_pool_track_free(pools, buffer);

  IREE_TRACE_ZONE_END(z0);
}
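
// Typical stream-ordered lifecycle (hypothetical sketch; |pools|, |stream|,
// |params|, and |allocation_size| are assumed to come from the surrounding
// device code):
//   iree_hal_buffer_t* buffer = NULL;
//   IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_alloca(
//       &pools, stream, IREE_HAL_ALLOCATOR_POOL_DEFAULT, params,
//       allocation_size, &buffer));
//   // ... enqueue work that uses |buffer| on |stream| ...
//   IREE_RETURN_IF_ERROR(
//       iree_hal_cuda2_memory_pools_dealloca(&pools, stream, buffer));
//   iree_hal_buffer_release(buffer);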

iree_status_t iree_hal_cuda2_memory_pools_alloca(
    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
    iree_device_size_t allocation_size,
    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size);

  iree_hal_buffer_params_canonicalize(&params);

  // TODO: more pools and better selection; this coarsely decides between only
  // device-local (variables, constants, transients) and other (staging,
  // external) but could use more buffer properties (including usage/export
  // flags) to better isolate the different usage patterns and keep the pools
  // operating within reasonable limits. We should be using the |pool| arg.
  CUmemoryPool memory_pool =
      iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)
          ? pools->device_local
          : pools->other;

  CUdeviceptr device_ptr = 0;
  iree_status_t status = IREE_CURESULT_TO_STATUS(
      pools->cuda_symbols,
      cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size,
                              memory_pool, stream),
      "cuMemAllocFromPoolAsync");

  // Wrap the allocated CUDA buffer in a HAL buffer.
  // NOTE: we don't provide a device allocator because we didn't allocate from
  // one; instead we rely on a release callback to perform the free if the
  // user never explicitly deallocas the buffer.
  iree_hal_buffer_t* buffer = NULL;
  if (iree_status_is_ok(status)) {
    iree_hal_buffer_release_callback_t release_callback = {
        .fn = iree_hal_cuda2_async_buffer_release_callback,
        .user_data = pools,
    };
    status = iree_hal_cuda2_buffer_wrap(
        /*device_allocator=*/NULL, params.type, params.access, params.usage,
        allocation_size, /*byte_offset=*/0,
        /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
        device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator,
        &buffer);
  }

  if (iree_status_is_ok(status)) {
    // Update statistics (note that the allocation is stream-ordered and may
    // not have actually occurred yet).
    iree_hal_cuda2_memory_pool_track_alloc(pools, buffer);
    *out_buffer = buffer;
  } else if (buffer) {
    iree_hal_buffer_release(buffer);
  } else {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemFreeAsync(device_ptr, stream));
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}

iree_status_t iree_hal_cuda2_memory_pools_dealloca(
    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
    iree_hal_buffer_t* buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE(
      z0, (int64_t)iree_hal_buffer_allocation_size(buffer));

  // Only process the request if the buffer came from an async pool. We may
  // get deallocation requests for buffers that didn't when one part of the
  // application allocates a buffer synchronously and another deallocates it
  // asynchronously.
  iree_status_t status = iree_ok_status();
  if (iree_hal_cuda2_buffer_type(buffer) == IREE_HAL_CUDA_BUFFER_TYPE_ASYNC) {
    // Try to schedule the buffer for freeing.
    CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
    status = IREE_CURESULT_TO_STATUS(pools->cuda_symbols,
                                     cuMemFreeAsync(device_ptr, stream),
                                     "cuMemFreeAsync");
    if (iree_status_is_ok(status)) {
      // Drop the release callback so that we don't try to double-free the
      // buffer. Note that we only do this if the CUDA free succeeded as
      // otherwise we still need to synchronously deallocate the buffer when
      // it is destroyed.
      iree_hal_cuda2_buffer_drop_release_callback(buffer);

      // Update statistics (note that the free is stream-ordered and may not
      // have actually occurred yet).
      iree_hal_cuda2_memory_pool_track_free(pools, buffer);
    }
  } else {
    // Not allocated via alloca, ignore.
    IREE_TRACE_ZONE_APPEND_TEXT(z0, "ignored sync allocation");
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}