// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "experimental/cuda2/memory_pools.h"

#include "experimental/cuda2/cuda_buffer.h"
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_status_util.h"

// NOTE: these are currently global for all devices; we could make
// device-specific ones by malloc() and leaking (with LSAN note) unique string
// values instead.
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID =
    "CUDA pool: device-local reserved";
static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID =
    "CUDA pool: other reserved";
#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING

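// Creates a CUmemoryPool on |cu_device| and applies the release threshold
// from |params|: CUDA holds up to that many bytes of freed memory in the
// pool before releasing it back to the OS at the next synchronization point
// (see CU_MEMPOOL_ATTR_RELEASE_THRESHOLD).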
static iree_status_t iree_hal_cuda2_create_memory_pool(
    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
    iree_hal_cuda2_memory_pool_params_t params,
    CUmemoryPool* IREE_RESTRICT out_pool) {
  *out_pool = NULL;

  CUmemPoolProps pool_props = {
      .allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
      // TODO: allow sharing of certain pool memory types by fd/HANDLE.
      .handleTypes = CU_MEM_HANDLE_TYPE_NONE,
      .location =
          {
              .type = CU_MEM_LOCATION_TYPE_DEVICE,
              .id = cu_device,
          },
      .win32SecurityAttributes = NULL,
      .reserved = {0},
  };

  CUmemoryPool pool = NULL;
  IREE_CUDA_RETURN_IF_ERROR(cuda_symbols, cuMemPoolCreate(&pool, &pool_props),
                            "cuMemPoolCreate");

  iree_status_t status = IREE_CURESULT_TO_STATUS(
      cuda_symbols,
      cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                            &params.release_threshold),
      "cuMemPoolSetAttribute");

  if (iree_status_is_ok(status)) {
    *out_pool = pool;
  } else {
    IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemPoolDestroy(pool));
  }
  return status;
}

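// Creates the device-local and other pools used for stream-ordered
// allocations. A minimal caller sketch (symbol/device acquisition elided;
// names are illustrative):
//   iree_hal_cuda2_memory_pools_t pools;
//   IREE_RETURN_IF_ERROR(iree_hal_cuda2_memory_pools_initialize(
//       cuda_symbols, cu_device, &pooling_params, host_allocator, &pools));
//   // ... iree_hal_cuda2_memory_pools_alloca/dealloca against the pools ...
//   iree_hal_cuda2_memory_pools_deinitialize(&pools);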
iree_status_t iree_hal_cuda2_memory_pools_initialize(
    const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice cu_device,
    const iree_hal_cuda2_memory_pooling_params_t* pooling_params,
    iree_allocator_t host_allocator,
    iree_hal_cuda2_memory_pools_t* IREE_RESTRICT out_pools) {
  IREE_ASSERT_ARGUMENT(cuda_symbols);
  IREE_ASSERT_ARGUMENT(pooling_params);
  IREE_ASSERT_ARGUMENT(out_pools);
  IREE_TRACE_ZONE_BEGIN(z0);

  memset(out_pools, 0, sizeof(*out_pools));
  out_pools->cuda_symbols = cuda_symbols;
  out_pools->host_allocator = host_allocator;

  iree_status_t status = iree_ok_status();

  if (iree_status_is_ok(status)) {
    status = iree_hal_cuda2_create_memory_pool(cuda_symbols, cu_device,
                                               pooling_params->device_local,
                                               &out_pools->device_local);
  }

  if (iree_status_is_ok(status)) {
    status = iree_hal_cuda2_create_memory_pool(
        cuda_symbols, cu_device, pooling_params->other, &out_pools->other);
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}

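// Destroys any pools created during initialization. Safe to call on a
// partially-initialized struct: initialization zeroes the struct first and
// each pool is NULL-checked before destruction.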
void iree_hal_cuda2_memory_pools_deinitialize(
    iree_hal_cuda2_memory_pools_t* pools) {
  IREE_TRACE_ZONE_BEGIN(z0);

  if (pools->device_local) {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemPoolDestroy(pools->device_local));
    pools->device_local = NULL;
  }

  if (pools->other) {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemPoolDestroy(pools->other));
    pools->other = NULL;
  }

  IREE_TRACE_ZONE_END(z0);
}

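// Reports a pool allocation to the tracing allocation tracker and bumps the
// statistics counters. The (void) casts keep the locals referenced when
// allocation tracking and statistics are compiled out and the macros below
// expand to nothing.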
static void iree_hal_cuda2_memory_pool_track_alloc(
    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
  (void)is_device_local;
  iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer);
  (void)allocation_size;
  IREE_TRACE_ALLOC_NAMED(
      is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
                      : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
      (void*)iree_hal_cuda2_buffer_device_pointer(buffer), allocation_size);
  IREE_STATISTICS({
    iree_atomic_int64_t* bytes_allocated =
        is_device_local ? &pools->statistics.device_bytes_allocated
                        : &pools->statistics.host_bytes_allocated;
    iree_atomic_fetch_add_int64(bytes_allocated, allocation_size,
                                iree_memory_order_relaxed);
  });
}

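// Mirror of iree_hal_cuda2_memory_pool_track_alloc for deallocations.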
static void iree_hal_cuda2_memory_pool_track_free(
    iree_hal_cuda2_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
  bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
                                           IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
  (void)is_device_local;
  IREE_TRACE_FREE_NAMED(is_device_local
                            ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
                            : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
                        (void*)iree_hal_cuda2_buffer_device_pointer(buffer));
  IREE_STATISTICS({
    iree_atomic_int64_t* bytes_freed =
        is_device_local ? &pools->statistics.device_bytes_freed
                        : &pools->statistics.host_bytes_freed;
    iree_device_size_t allocation_size =
        iree_hal_buffer_allocation_size(buffer);
    iree_atomic_fetch_add_int64(bytes_freed, allocation_size,
                                iree_memory_order_relaxed);
  });
}

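// Merges accumulated pool statistics into |statistics|. Allocated/freed byte
// counts come from our own atomic counters while peak usage is queried from
// CUDA via CU_MEMPOOL_ATTR_USED_MEM_HIGH. Note that the "other" pool is
// folded into the host byte statistics even though it is device memory.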
void iree_hal_cuda2_memory_pools_merge_statistics(
    iree_hal_cuda2_memory_pools_t* pools,
    iree_hal_allocator_statistics_t* statistics) {
  IREE_STATISTICS({
    statistics->device_bytes_allocated = iree_atomic_load_int64(
        &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed);
    statistics->host_bytes_allocated = iree_atomic_load_int64(
        &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed);
    statistics->device_bytes_freed = iree_atomic_load_int64(
        &pools->statistics.device_bytes_freed, iree_memory_order_relaxed);
    statistics->host_bytes_freed = iree_atomic_load_int64(
        &pools->statistics.host_bytes_freed, iree_memory_order_relaxed);
    if (pools->device_local) {
      cuuint64_t pool_peak = 0;
      IREE_CUDA_IGNORE_ERROR(
          pools->cuda_symbols,
          cuMemPoolGetAttribute(pools->device_local,
                                CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak));
      statistics->device_bytes_peak += (iree_device_size_t)pool_peak;
    }
    if (pools->other) {
      cuuint64_t pool_peak = 0;
      IREE_CUDA_IGNORE_ERROR(
          pools->cuda_symbols,
          cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
                                &pool_peak));
      statistics->host_bytes_peak += (iree_device_size_t)pool_peak;
    }
  });
}

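// Trims each pool down to its configured minimum capacity; cuMemPoolTrimTo
// keeps at least that many bytes reserved and returns the rest to the OS.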
iree_status_t iree_hal_cuda2_memory_pools_trim(
    iree_hal_cuda2_memory_pools_t* pools,
    const iree_hal_cuda2_memory_pooling_params_t* pooling_params) {
  IREE_CUDA_RETURN_IF_ERROR(
      pools->cuda_symbols,
      cuMemPoolTrimTo(pools->device_local,
                      pooling_params->device_local.minimum_capacity),
      "cuMemPoolTrimTo");
  IREE_CUDA_RETURN_IF_ERROR(
      pools->cuda_symbols,
      cuMemPoolTrimTo(pools->other, pooling_params->other.minimum_capacity),
      "cuMemPoolTrimTo");
  return iree_ok_status();
}

// NOTE: this is only issued if the buffer is destroyed without having been
// scheduled for deallocation asynchronously. When a buffer is scheduled we
// drop the release callback so that this isn't called and we don't
// double-free.
static void iree_hal_cuda2_async_buffer_release_callback(
    void* user_data, iree_hal_buffer_t* buffer) {
  iree_hal_cuda2_memory_pools_t* pools =
      (iree_hal_cuda2_memory_pools_t*)user_data;
  IREE_TRACE_ZONE_BEGIN(z0);

  CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
  IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols, cuMemFree(device_ptr));
  iree_hal_cuda2_memory_pool_track_free(pools, buffer);

  IREE_TRACE_ZONE_END(z0);
}

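// Allocates a buffer in stream order on |stream| from the pool selected by
// |params| (the |pool| argument is currently unused; see the TODO below).
// Per CUDA stream-ordered allocation semantics the memory is only safe to
// use on other streams after synchronizing against |stream|.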
iree_status_t iree_hal_cuda2_memory_pools_alloca(
    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
    iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
    iree_device_size_t allocation_size,
    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)allocation_size);

  iree_hal_buffer_params_canonicalize(&params);

  // TODO: more pools and better selection; this is coarsely deciding between
  // only device local (variables, constants, transients) and other (staging,
  // external) but could use more buffer properties (including usage/export
  // flags) to better isolate the different usage patterns and keep the pools
  // operating with reasonable limits. We should be using the |pool| arg.
  CUmemoryPool memory_pool =
      iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)
          ? pools->device_local
          : pools->other;

  CUdeviceptr device_ptr = 0;
  iree_status_t status = IREE_CURESULT_TO_STATUS(
      pools->cuda_symbols,
      cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size,
                              memory_pool, stream),
      "cuMemAllocFromPoolAsync");

  // Wrap the allocated CUDA buffer in a HAL buffer.
  // NOTE: we don't provide a device allocator because we didn't allocate from
  // one and instead we use a release callback to perform the free if the user
  // doesn't dealloca the buffer.
  iree_hal_buffer_t* buffer = NULL;
  if (iree_status_is_ok(status)) {
    iree_hal_buffer_release_callback_t release_callback = {
        .fn = iree_hal_cuda2_async_buffer_release_callback,
        .user_data = pools,
    };
    status = iree_hal_cuda2_buffer_wrap(
        /*device_allocator=*/NULL, params.type, params.access, params.usage,
        allocation_size, /*byte_offset=*/0,
        /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
        device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator,
        &buffer);
  }

  if (iree_status_is_ok(status)) {
    // Update statistics (note that it may not yet be accurate).
    iree_hal_cuda2_memory_pool_track_alloc(pools, buffer);
    *out_buffer = buffer;
  } else if (buffer) {
    iree_hal_buffer_release(buffer);
  } else {
    IREE_CUDA_IGNORE_ERROR(pools->cuda_symbols,
                           cuMemFreeAsync(device_ptr, stream));
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}

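// Schedules an async free of |buffer| in stream order on |stream|. Buffers
// that did not come from the async pools are ignored so that buffers
// allocated synchronously elsewhere can still be passed here safely.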
iree_status_t iree_hal_cuda2_memory_pools_dealloca(
    iree_hal_cuda2_memory_pools_t* pools, CUstream stream,
    iree_hal_buffer_t* buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);
  IREE_TRACE_ZONE_APPEND_VALUE_I64(
      z0, (int64_t)iree_hal_buffer_allocation_size(buffer));

  // Only process the request if the buffer came from an async pool.
  // We may get requests for deallocations on ones that didn't if one part of
  // the application allocated the buffer synchronously and another deallocated
  // it asynchronously.
  iree_status_t status = iree_ok_status();
  if (iree_hal_cuda2_buffer_type(buffer) == IREE_HAL_CUDA_BUFFER_TYPE_ASYNC) {
    // Try to schedule the buffer for freeing.
    CUdeviceptr device_ptr = iree_hal_cuda2_buffer_device_pointer(buffer);
    status = IREE_CURESULT_TO_STATUS(pools->cuda_symbols,
                                     cuMemFreeAsync(device_ptr, stream),
                                     "cuMemFreeAsync");
    if (iree_status_is_ok(status)) {
      // Drop the release callback so that we don't try to double-free the
      // buffer. Note that we only do this if the CUDA free succeeded as
      // otherwise we still need to synchronously deallocate the buffer when it
      // is destroyed.
      iree_hal_cuda2_buffer_drop_release_callback(buffer);

      // Update statistics (note that it may not yet be accurate).
      iree_hal_cuda2_memory_pool_track_free(pools, buffer);
    }
  } else {
    // Not allocated via alloca, ignore.
    IREE_TRACE_ZONE_APPEND_TEXT(z0, "ignored sync allocation");
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}