Adding support for async memory pool allocations in the CUDA HAL. (#13440)
These aren't actually async, as the CUDA HAL is synchronous, but they use
CUDA's memory pooling features to reduce alloc/free cost in a way that's
friendlier to CUDA's memory management than the normal allocator.
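
For context, the new code builds on CUDA's stream-ordered allocation APIs. A minimal driver-API sketch (error handling elided; assumes a current context and a valid `CUstream stream`; the device index and sizes are illustrative):

```c
#include <cuda.h>

// Sketch: create a pool, keep some memory cached across frees, and do
// stream-ordered alloc/free. Error handling elided for brevity.
CUmemPoolProps props = {
    .allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
    .handleTypes = CU_MEM_HANDLE_TYPE_NONE,
    .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = /*device=*/0},
};
CUmemoryPool pool = NULL;
cuMemPoolCreate(&pool, &props);

// Let the pool retain up to 64MB across synchronizations instead of
// returning freed memory to the driver immediately.
cuuint64_t release_threshold = 64 * 1024 * 1024;
cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                      &release_threshold);

// Both the allocation and the free are ordered on the stream: work enqueued
// between them may use the pointer and the memory becomes reusable once the
// free has executed on the stream.
CUdeviceptr ptr = 0;
cuMemAllocFromPoolAsync(&ptr, /*bytesize=*/1024 * 1024, pool, stream);
// ... launch kernels using ptr on stream ...
cuMemFreeAsync(ptr, stream);
```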
With this, our queue-ordered allocations in CUDA now average a few
microseconds (for me), and the caching allocator (or any other) is only
needed for caching non-queue-ordered allocations. A few compiler tweaks to
switch all allocations to queue-ordered will mean that only explicitly
allocated buffers (constants/variables, buffers the user allocates manually,
etc.) will not route to the pools.
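
At the HAL level the queue-ordered path looks roughly like this (a sketch against the device queue API; the semaphore lists, buffer params, and size here are illustrative):

```c
// Sketch: queue-ordered allocate/use/deallocate via the HAL device API.
// In the synchronous CUDA HAL the waits block and the signals fire inline.
iree_hal_buffer_params_t buffer_params = {
    .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
    .usage = IREE_HAL_BUFFER_USAGE_TRANSFER |
             IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE,
};
iree_hal_buffer_t* buffer = NULL;
IREE_RETURN_IF_ERROR(iree_hal_device_queue_alloca(
    device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphores, alloca_signals,
    IREE_HAL_ALLOCATOR_POOL_DEFAULT, buffer_params, allocation_size,
    &buffer));
// ... enqueue work consuming |buffer|, waiting on |alloca_signals| ...
IREE_RETURN_IF_ERROR(iree_hal_device_queue_dealloca(
    device, IREE_HAL_QUEUE_AFFINITY_ANY, work_done_semaphores,
    dealloca_signals, buffer));
iree_hal_buffer_release(buffer);  // pool free is already scheduled
```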
It'd also be possible to explore using the same pools the queue-ordered
allocations use for explicit synchronous allocations (at least the
device-local pool), but it'd be nicer to get those out of the critical
path and keep the pools separate so that the transient pool isn't filled
with persistent allocations.
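
Pool retention and trimming behavior is configurable through the new pooling params added below; a sketch (the byte values are just examples, and defaults come from `iree_hal_cuda_device_params_initialize`):

```c
// Sketch: configuring the new pooling parameters before device creation.
iree_hal_cuda_device_params_t params;
iree_hal_cuda_device_params_initialize(&params);
// Keep at least 32MB resident in the device-local pool when trimming via
// iree_hal_device_trim and allow up to 128MB to stay cached between
// device synchronizations.
params.memory_pools.device_local.minimum_capacity = 32 * 1024 * 1024;
params.memory_pools.device_local.release_threshold = 128 * 1024 * 1024;
// The host-visible "other" pool can stay small.
params.memory_pools.other.minimum_capacity = 0;
params.memory_pools.other.release_threshold = 16 * 1024 * 1024;
```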
Due to #13984 this relies on the `--iree-stream-emulate-memset` flag
added in #13994 being set when graphs are enabled. Since this is not the
default path today and only two test suites use it, we just flip the flag
for them.
diff --git a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
index e07f7ce..7b736c0 100644
--- a/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/cuda/BUILD.bazel
@@ -30,6 +30,8 @@
"event_semaphore.h",
"graph_command_buffer.c",
"graph_command_buffer.h",
+ "memory_pools.c",
+ "memory_pools.h",
"native_executable.c",
"native_executable.h",
"nccl_channel.c",
diff --git a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
index 9fff57c..18b3d38 100644
--- a/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/cuda/CMakeLists.txt
@@ -31,6 +31,8 @@
"event_semaphore.h"
"graph_command_buffer.c"
"graph_command_buffer.h"
+ "memory_pools.c"
+ "memory_pools.h"
"native_executable.c"
"native_executable.h"
"nccl_channel.c"
diff --git a/runtime/src/iree/hal/drivers/cuda/api.h b/runtime/src/iree/hal/drivers/cuda/api.h
index 498d750..f33a56b 100644
--- a/runtime/src/iree/hal/drivers/cuda/api.h
+++ b/runtime/src/iree/hal/drivers/cuda/api.h
@@ -33,6 +33,26 @@
char data[128];
} iree_hal_cuda_nccl_id_t;
+// Parameters defining a CUmemoryPool.
+typedef struct iree_hal_cuda_memory_pool_params_t {
+ // Minimum number of bytes to keep in the pool when trimming with
+ // iree_hal_device_trim.
+ uint64_t minimum_capacity;
+ // Soft maximum number of bytes to keep in the pool.
+ // When more than this is allocated, the excess will be freed at the next
+ // device synchronization in order to remain under the threshold.
+ uint64_t release_threshold;
+ // TODO: per-device access permissions array.
+} iree_hal_cuda_memory_pool_params_t;
+
+// Parameters for each CUmemoryPool used for queue-ordered allocations.
+typedef struct iree_hal_cuda_memory_pooling_params_t {
+ // Used exclusively for DEVICE_LOCAL allocations.
+ iree_hal_cuda_memory_pool_params_t device_local;
+ // Used for any host-visible/host-local memory types.
+ iree_hal_cuda_memory_pool_params_t other;
+} iree_hal_cuda_memory_pooling_params_t;
+
// Parameters configuring an iree_hal_cuda_device_t.
// Must be initialized with iree_hal_cuda_device_params_initialize prior to use.
typedef struct iree_hal_cuda_device_params_t {
@@ -63,6 +83,9 @@
// identify slow dispatches and refine from there; be wary of whole-program
// tracing with this enabled.
bool stream_tracing;
+
+ // Parameters for each CUmemoryPool used for queue-ordered allocations.
+ iree_hal_cuda_memory_pooling_params_t memory_pools;
} iree_hal_cuda_device_params_t;
// Initializes |out_params| to default values.
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c
index 4d22a01..eba6409 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c
@@ -15,7 +15,7 @@
#include "iree/hal/drivers/cuda/status_util.h"
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
-static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA";
+static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA unpooled";
#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
typedef struct iree_hal_cuda_allocator_t {
@@ -24,6 +24,7 @@
iree_hal_cuda_context_wrapper_t* context;
CUdevice device;
CUstream stream;
+ iree_hal_cuda_memory_pools_t* pools;
bool supports_concurrent_managed_access;
IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
@@ -39,9 +40,11 @@
iree_status_t iree_hal_cuda_allocator_create(
iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
- CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator) {
+ CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools,
+ iree_hal_allocator_t** out_allocator) {
IREE_ASSERT_ARGUMENT(base_device);
IREE_ASSERT_ARGUMENT(context);
+ IREE_ASSERT_ARGUMENT(pools);
IREE_TRACE_ZONE_BEGIN(z0);
// To support device-local + host-visible memory we need concurrent managed
@@ -75,6 +78,7 @@
allocator->context = context;
allocator->device = device;
allocator->stream = stream;
+ allocator->pools = pools;
allocator->supports_concurrent_managed_access =
supports_concurrent_managed_access != 0;
*out_allocator = (iree_hal_allocator_t*)allocator;
@@ -115,6 +119,8 @@
iree_hal_cuda_allocator_t* allocator =
iree_hal_cuda_allocator_cast(base_allocator);
memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+ iree_hal_cuda_memory_pools_merge_statistics(allocator->pools,
+ out_statistics);
});
}
@@ -274,6 +280,10 @@
CUDA_IGNORE_ERROR(context->syms, cuMemHostUnregister(host_ptr));
break;
}
+ case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: {
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)");
+ break;
+ }
}
IREE_TRACE_ZONE_END(z0);
}
@@ -370,7 +380,8 @@
compat_params.usage, allocation_size,
/*byte_offset=*/0,
/*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr,
- iree_hal_buffer_release_callback_null(), &buffer);
+ iree_hal_buffer_release_callback_null(),
+ iree_hal_allocator_host_allocator(base_allocator), &buffer);
}
// Copy the initial contents into the buffer. This may require staging.
@@ -541,7 +552,8 @@
compat_params.usage, external_buffer->size,
/*byte_offset=*/0,
/*byte_length=*/external_buffer->size, buffer_type, device_ptr,
- host_ptr, release_callback, &buffer);
+ host_ptr, release_callback,
+ iree_hal_allocator_host_allocator(base_allocator), &buffer);
}
if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h
index b2f2728..0df31ee 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h
@@ -10,16 +10,21 @@
#include "iree/base/api.h"
#include "iree/hal/api.h"
#include "iree/hal/drivers/cuda/context_wrapper.h"
+#include "iree/hal/drivers/cuda/memory_pools.h"
#include "iree/hal/drivers/cuda/status_util.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
-// Create a cuda allocator.
+// Creates a CUDA memory allocator.
+// |device| and |stream| will be used for management operations.
+// |pools| provides memory pools that may be shared across multiple allocators;
+// the pointer must remain valid for the lifetime of the allocator.
iree_status_t iree_hal_cuda_allocator_create(
iree_hal_device_t* base_device, iree_hal_cuda_context_wrapper_t* context,
- CUdevice device, CUstream stream, iree_hal_allocator_t** out_allocator);
+ CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools,
+ iree_hal_allocator_t** out_allocator);
#ifdef __cplusplus
} // extern "C"
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c
index 3b9c9e1..87c4147 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c
@@ -42,13 +42,10 @@
iree_device_size_t byte_offset, iree_device_size_t byte_length,
iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
- iree_hal_buffer_t** out_buffer) {
- IREE_ASSERT_ARGUMENT(allocator);
+ iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) {
IREE_ASSERT_ARGUMENT(out_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
- iree_allocator_t host_allocator =
- iree_hal_allocator_host_allocator(allocator);
iree_hal_cuda_buffer_t* buffer = NULL;
iree_status_t status =
iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer);
@@ -151,6 +148,12 @@
return buffer->host_ptr;
}
+void iree_hal_cuda_buffer_drop_release_callback(
+ iree_hal_buffer_t* base_buffer) {
+ iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer);
+ buffer->release_callback = iree_hal_buffer_release_callback_null();
+}
+
static const iree_hal_buffer_vtable_t iree_hal_cuda_buffer_vtable = {
.recycle = iree_hal_buffer_recycle,
.destroy = iree_hal_cuda_buffer_destroy,
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h
index 0b07e8d..640d6f5 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h
@@ -17,11 +17,13 @@
typedef enum iree_hal_cuda_buffer_type_e {
// cuMemAlloc/cuMemAllocManaged + cuMemFree
- IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 1u << 0,
+ IREE_HAL_CUDA_BUFFER_TYPE_DEVICE = 0,
// cuMemHostAlloc + cuMemFreeHost
- IREE_HAL_CUDA_BUFFER_TYPE_HOST = 1u << 1,
+ IREE_HAL_CUDA_BUFFER_TYPE_HOST,
// cuMemHostRegister + cuMemHostUnregister
- IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED = 1u << 2,
+ IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED,
+ // cuMemAllocFromPoolAsync + cuMemFree/cuMemFreeAsync
+ IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
} iree_hal_cuda_buffer_type_t;
// Wraps a CUDA allocation in an iree_hal_buffer_t.
@@ -32,7 +34,7 @@
iree_device_size_t byte_offset, iree_device_size_t byte_length,
iree_hal_cuda_buffer_type_t buffer_type, CUdeviceptr device_ptr,
void* host_ptr, iree_hal_buffer_release_callback_t release_callback,
- iree_hal_buffer_t** out_buffer);
+ iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
// Returns the underlying CUDA buffer type.
iree_hal_cuda_buffer_type_t iree_hal_cuda_buffer_type(
@@ -47,6 +49,12 @@
// Returns the CUDA host pointer for the given |buffer|, if available.
void* iree_hal_cuda_buffer_host_pointer(const iree_hal_buffer_t* buffer);
+// Drops the release callback so that no callback is made when the buffer is
+// destroyed. This is not thread-safe, but all callers are expected to be
+// holding a reference: the earliest the buffer could be destroyed is after
+// this call returns and the caller has released that reference.
+void iree_hal_cuda_buffer_drop_release_callback(iree_hal_buffer_t* buffer);
+
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
index a47a039..03ef410 100644
--- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c
+++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c
@@ -15,10 +15,12 @@
#include "iree/base/tracing.h"
#include "iree/hal/drivers/cuda/context_wrapper.h"
#include "iree/hal/drivers/cuda/cuda_allocator.h"
+#include "iree/hal/drivers/cuda/cuda_buffer.h"
#include "iree/hal/drivers/cuda/cuda_event.h"
#include "iree/hal/drivers/cuda/dynamic_symbols.h"
#include "iree/hal/drivers/cuda/event_semaphore.h"
#include "iree/hal/drivers/cuda/graph_command_buffer.h"
+#include "iree/hal/drivers/cuda/memory_pools.h"
#include "iree/hal/drivers/cuda/nccl_channel.h"
#include "iree/hal/drivers/cuda/nop_executable_cache.h"
#include "iree/hal/drivers/cuda/pipeline_layout.h"
@@ -54,6 +56,7 @@
iree_hal_cuda_context_wrapper_t context_wrapper;
iree_hal_cuda_tracing_context_t* tracing_context;
+ iree_hal_cuda_memory_pools_t memory_pools;
iree_hal_allocator_t* device_allocator;
// Optional provider used for creating/configuring collective channels.
@@ -134,10 +137,16 @@
&device->block_pool, host_allocator, &device->tracing_context);
}
+ // Create memory pools first so that we can share them with the allocator.
if (iree_status_is_ok(status)) {
- status = iree_hal_cuda_allocator_create((iree_hal_device_t*)device,
- &device->context_wrapper, cu_device,
- stream, &device->device_allocator);
+ status = iree_hal_cuda_memory_pools_initialize(
+ &device->context_wrapper, &params->memory_pools, &device->memory_pools);
+ }
+
+ if (iree_status_is_ok(status)) {
+ status = iree_hal_cuda_allocator_create(
+ (iree_hal_device_t*)device, &device->context_wrapper, cu_device, stream,
+ &device->memory_pools, &device->device_allocator);
}
if (iree_status_is_ok(status) &&
@@ -219,6 +228,9 @@
// Buffers may have been retaining collective resources.
iree_hal_channel_provider_release(device->channel_provider);
+ // Destroy memory pools that hold on to reserved memory.
+ iree_hal_cuda_memory_pools_deinitialize(&device->memory_pools);
+
// TODO: support multiple streams.
iree_hal_cuda_tracing_context_free(device->tracing_context);
CUDA_IGNORE_ERROR(device->context_wrapper.syms,
@@ -274,7 +286,20 @@
static iree_status_t iree_hal_cuda_device_trim(iree_hal_device_t* base_device) {
iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
iree_arena_block_pool_trim(&device->block_pool);
- return iree_hal_allocator_trim(device->device_allocator);
+ IREE_RETURN_IF_ERROR(iree_hal_allocator_trim(device->device_allocator));
+ // TODO: move to memory pool manager.
+ CUDA_RETURN_IF_ERROR(
+ device->context_wrapper.syms,
+ cuMemPoolTrimTo(
+ device->memory_pools.device_local,
+ device->params.memory_pools.device_local.minimum_capacity),
+ "cuMemPoolTrimTo");
+ CUDA_RETURN_IF_ERROR(
+ device->context_wrapper.syms,
+ cuMemPoolTrimTo(device->memory_pools.other,
+ device->params.memory_pools.other.minimum_capacity),
+ "cuMemPoolTrimTo");
+ return iree_ok_status();
}
static iree_status_t iree_hal_cuda_device_query_i64(
@@ -470,6 +495,10 @@
return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
}
+// TODO: implement multiple streams; today we only have one and queue_affinity
+// is ignored.
+// TODO: implement proper semaphores in CUDA to ensure ordering and avoid
+// the barrier here.
static iree_status_t iree_hal_cuda_device_queue_alloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
@@ -477,27 +506,57 @@
iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
- // TODO(benvanik): queue-ordered allocations.
- // TODO(benvanik): tracing of the allocations (just for sequencing).
+ iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+
+ // NOTE: block on the semaphores here; we could avoid this by properly
+ // sequencing device work with semaphores. The CUDA HAL is not currently
+ // asynchronous.
IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
iree_infinite_timeout()));
- IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer(
- iree_hal_device_allocator(base_device), params, allocation_size,
- iree_const_byte_span_empty(), out_buffer));
- IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
- return iree_ok_status();
+
+ // Allocate from the pool; likely to fail in cases of virtual memory
+ // exhaustion but the error may be deferred until a later synchronization.
+ iree_status_t status = iree_hal_cuda_memory_pools_alloca(
+ &device->memory_pools, device->stream, pool, params, allocation_size,
+ out_buffer);
+
+ // Only signal if not returning a synchronous error - synchronous failure
+ // indicates that the stream is unchanged (it's not really since we waited
+ // above, but we at least won't deadlock like this).
+ if (iree_status_is_ok(status)) {
+ IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
+ }
+ return status;
}
+// TODO: implement multiple streams; today we only have one and queue_affinity
+// is ignored.
+// TODO: implement proper semaphores in CUDA to ensure ordering and avoid
+// the barrier here.
static iree_status_t iree_hal_cuda_device_queue_dealloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* buffer) {
- // TODO(benvanik): queue-ordered allocations.
- // TODO(benvanik): tracing of the allocations (just for sequencing).
- IREE_RETURN_IF_ERROR(iree_hal_device_queue_barrier(
- base_device, queue_affinity, wait_semaphore_list, signal_semaphore_list));
- return iree_ok_status();
+ iree_hal_cuda_device_t* device = iree_hal_cuda_device_cast(base_device);
+
+ // NOTE: block on the semaphores here; we could avoid this by properly
+ // sequencing device work with semaphores. The CUDA HAL is not currently
+ // asynchronous.
+ IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_wait(wait_semaphore_list,
+ iree_infinite_timeout()));
+
+ // Schedule the buffer deallocation.
+ iree_status_t status = iree_hal_cuda_memory_pools_dealloca(
+ &device->memory_pools, device->stream, buffer);
+
+ // Only signal if not returning a synchronous error - synchronous failure
+ // indicates that the stream is unchanged (it's not really since we waited
+ // above, but we at least won't deadlock like this).
+ if (iree_status_is_ok(status)) {
+ IREE_RETURN_IF_ERROR(iree_hal_semaphore_list_signal(signal_semaphore_list));
+ }
+ return status;
}
static iree_status_t iree_hal_cuda_device_queue_execute(
@@ -556,7 +615,7 @@
}
static iree_status_t iree_hal_cuda_device_profiling_begin(
- iree_hal_device_t* device,
+ iree_hal_device_t* base_device,
const iree_hal_device_profiling_options_t* options) {
// Unimplemented (and that's ok).
// We could hook in to CUPTI here or use the much simpler cuProfilerStart API.
@@ -564,7 +623,7 @@
}
static iree_status_t iree_hal_cuda_device_profiling_end(
- iree_hal_device_t* device) {
+ iree_hal_device_t* base_device) {
// Unimplemented (and that's ok).
return iree_ok_status();
}
diff --git a/runtime/src/iree/hal/drivers/cuda/dynamic_symbol_tables.h b/runtime/src/iree/hal/drivers/cuda/dynamic_symbol_tables.h
index 857f492..e2bb69a 100644
--- a/runtime/src/iree/hal/drivers/cuda/dynamic_symbol_tables.h
+++ b/runtime/src/iree/hal/drivers/cuda/dynamic_symbol_tables.h
@@ -47,6 +47,15 @@
CU_PFN_DECL(cuMemHostRegister, void*, size_t, unsigned int)
CU_PFN_DECL(cuMemHostUnregister, void*)
CU_PFN_DECL(cuMemHostGetDevicePointer, CUdeviceptr*, void*, unsigned int)
+CU_PFN_DECL(cuMemPoolCreate, CUmemoryPool*, const CUmemPoolProps*)
+CU_PFN_DECL(cuMemPoolDestroy, CUmemoryPool)
+CU_PFN_DECL(cuMemPoolSetAccess, CUmemoryPool, const CUmemAccessDesc*, size_t)
+CU_PFN_DECL(cuMemPoolGetAttribute, CUmemoryPool, CUmemPool_attribute, void*)
+CU_PFN_DECL(cuMemPoolSetAttribute, CUmemoryPool, CUmemPool_attribute, void*)
+CU_PFN_DECL(cuMemPoolTrimTo, CUmemoryPool, size_t)
+CU_PFN_DECL(cuMemAllocFromPoolAsync, CUdeviceptr*, size_t, CUmemoryPool,
+ CUstream)
+CU_PFN_DECL(cuMemFreeAsync, CUdeviceptr, CUstream)
CU_PFN_DECL(cuModuleGetFunction, CUfunction*, CUmodule, const char*)
CU_PFN_DECL(cuModuleLoadDataEx, CUmodule*, const void*, unsigned int,
CUjit_option*, void**)
diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
index 83bc618..bc587a2 100644
--- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
@@ -392,8 +392,8 @@
CUDA_MEMSET_NODE_PARAMS params = {
.dst = target_device_buffer + target_offset,
.elementSize = pattern_length,
- // width in number of elements despite what driver documentation says.
- .width = length / pattern_length,
+ .pitch = 0, // unused if height == 1
+ .width = length / pattern_length, // element count
.height = 1,
.value = dword_pattern,
};
diff --git a/runtime/src/iree/hal/drivers/cuda/memory_pools.c b/runtime/src/iree/hal/drivers/cuda/memory_pools.c
new file mode 100644
index 0000000..56b50c8
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/cuda/memory_pools.c
@@ -0,0 +1,273 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/drivers/cuda/memory_pools.h"
+
+#include "iree/base/tracing.h"
+#include "iree/hal/drivers/cuda/cuda_buffer.h"
+#include "iree/hal/drivers/cuda/status_util.h"
+
+// NOTE: these are currently global for all devices; we could make
+// device-specific ones by malloc() and leaking (with LSAN note) unique string
+// values instead.
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID =
+ "CUDA pool: device-local reserved";
+static const char* IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID =
+ "CUDA pool: other reserved";
+#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+static iree_status_t iree_hal_cuda_create_memory_pool(
+ iree_hal_cuda_context_wrapper_t* context,
+ iree_hal_cuda_memory_pool_params_t params,
+ CUmemoryPool* IREE_RESTRICT out_pool) {
+ *out_pool = NULL;
+
+ CUmemPoolProps pool_props = {
+ .allocType = CU_MEM_ALLOCATION_TYPE_PINNED,
+ // TODO: allow sharing of certain pool memory types by fd/HANDLE.
+ .handleTypes = CU_MEM_HANDLE_TYPE_NONE,
+ .location =
+ {
+ .type = CU_MEM_LOCATION_TYPE_DEVICE,
+ .id = context->cu_device,
+ },
+ .win32SecurityAttributes = NULL,
+ .reserved = {0},
+ };
+
+ CUmemoryPool pool = NULL;
+ CUDA_RETURN_IF_ERROR(context->syms, cuMemPoolCreate(&pool, &pool_props),
+ "cuMemPoolCreate");
+
+ iree_status_t status = CU_RESULT_TO_STATUS(
+ context->syms,
+ cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+ &params.release_threshold),
+ "cuMemPoolSetAttribute");
+
+ if (iree_status_is_ok(status)) {
+ *out_pool = pool;
+ } else {
+ CUDA_IGNORE_ERROR(context->syms, cuMemPoolDestroy(pool));
+ }
+ return status;
+}
+
+iree_status_t iree_hal_cuda_memory_pools_initialize(
+ iree_hal_cuda_context_wrapper_t* context,
+ const iree_hal_cuda_memory_pooling_params_t* pooling_params,
+ iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools) {
+ IREE_ASSERT_ARGUMENT(context);
+ IREE_ASSERT_ARGUMENT(pooling_params);
+ IREE_ASSERT_ARGUMENT(out_pools);
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ memset(out_pools, 0, sizeof(*out_pools));
+ out_pools->context = context;
+
+ iree_status_t status = iree_ok_status();
+
+ if (iree_status_is_ok(status)) {
+ status = iree_hal_cuda_create_memory_pool(
+ context, pooling_params->device_local, &out_pools->device_local);
+ }
+
+ if (iree_status_is_ok(status)) {
+ status = iree_hal_cuda_create_memory_pool(context, pooling_params->other,
+ &out_pools->other);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+void iree_hal_cuda_memory_pools_deinitialize(
+ iree_hal_cuda_memory_pools_t* pools) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ if (pools->device_local) {
+ CUDA_IGNORE_ERROR(pools->context->syms,
+ cuMemPoolDestroy(pools->device_local));
+ pools->device_local = NULL;
+ }
+
+ if (pools->other) {
+ CUDA_IGNORE_ERROR(pools->context->syms, cuMemPoolDestroy(pools->other));
+ pools->other = NULL;
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_hal_cuda_memory_pool_track_alloc(
+ iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
+ bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
+ IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
+ (void)is_device_local;
+ iree_device_size_t allocation_size = iree_hal_buffer_allocation_size(buffer);
+ (void)allocation_size;
+ IREE_TRACE_ALLOC_NAMED(
+ is_device_local ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
+ : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
+ (void*)iree_hal_cuda_buffer_device_pointer(buffer), allocation_size);
+ IREE_STATISTICS({
+ iree_atomic_int64_t* bytes_allocated =
+ is_device_local ? &pools->statistics.device_bytes_allocated
+ : &pools->statistics.host_bytes_allocated;
+ iree_atomic_fetch_add_int64(bytes_allocated, allocation_size,
+ iree_memory_order_relaxed);
+ });
+}
+
+static void iree_hal_cuda_memory_pool_track_free(
+ iree_hal_cuda_memory_pools_t* pools, iree_hal_buffer_t* buffer) {
+ bool is_device_local = iree_all_bits_set(iree_hal_buffer_memory_type(buffer),
+ IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL);
+ (void)is_device_local;
+ IREE_TRACE_FREE_NAMED(is_device_local
+ ? IREE_HAL_CUDA_DEVICE_LOCAL_POOL_RESERVED_ID
+ : IREE_HAL_CUDA_OTHER_POOL_RESERVED_ID,
+ (void*)iree_hal_cuda_buffer_device_pointer(buffer));
+ IREE_STATISTICS({
+ iree_atomic_int64_t* bytes_freed =
+ is_device_local ? &pools->statistics.device_bytes_freed
+ : &pools->statistics.host_bytes_freed;
+ iree_device_size_t allocation_size =
+ iree_hal_buffer_allocation_size(buffer);
+ iree_atomic_fetch_add_int64(bytes_freed, allocation_size,
+ iree_memory_order_relaxed);
+ });
+}
+
+void iree_hal_cuda_memory_pools_merge_statistics(
+ iree_hal_cuda_memory_pools_t* pools,
+ iree_hal_allocator_statistics_t* statistics) {
+ IREE_STATISTICS({
+ statistics->device_bytes_allocated = iree_atomic_load_int64(
+ &pools->statistics.device_bytes_allocated, iree_memory_order_relaxed);
+ statistics->host_bytes_allocated = iree_atomic_load_int64(
+ &pools->statistics.host_bytes_allocated, iree_memory_order_relaxed);
+ statistics->device_bytes_freed = iree_atomic_load_int64(
+ &pools->statistics.device_bytes_freed, iree_memory_order_relaxed);
+ statistics->host_bytes_freed = iree_atomic_load_int64(
+ &pools->statistics.host_bytes_freed, iree_memory_order_relaxed);
+ if (pools->device_local) {
+ cuuint64_t pool_peak = 0;
+ CUDA_IGNORE_ERROR(
+ pools->context->syms,
+ cuMemPoolGetAttribute(pools->device_local,
+ CU_MEMPOOL_ATTR_USED_MEM_HIGH, &pool_peak));
+ statistics->device_bytes_peak += (iree_device_size_t)pool_peak;
+ }
+ if (pools->other) {
+ cuuint64_t pool_peak = 0;
+ CUDA_IGNORE_ERROR(
+ pools->context->syms,
+ cuMemPoolGetAttribute(pools->other, CU_MEMPOOL_ATTR_USED_MEM_HIGH,
+ &pool_peak));
+ statistics->host_bytes_peak += (iree_device_size_t)pool_peak;
+ }
+ });
+}
+
+// NOTE: this is only issued if the buffer is destroyed without having been
+// scheduled for deallocation asynchronously. When a buffer is scheduled we drop
+// the release callback so that this isn't called and we don't double-free.
+static void iree_hal_cuda_async_buffer_release_callback(
+ void* user_data, iree_hal_buffer_t* buffer) {
+ iree_hal_cuda_memory_pools_t* pools =
+ (iree_hal_cuda_memory_pools_t*)user_data;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer);
+ CUDA_IGNORE_ERROR(pools->context->syms, cuMemFree(device_ptr));
+ iree_hal_cuda_memory_pool_track_free(pools, buffer);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_hal_cuda_memory_pools_alloca(
+ iree_hal_cuda_memory_pools_t* pools, CUstream stream,
+ iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+ iree_device_size_t allocation_size,
+ iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, (int64_t)allocation_size);
+
+ iree_hal_buffer_params_canonicalize(¶ms);
+
+ // TODO: more pools and better selection; this is coarsely deciding between
+ // only device local (variables, constants, transients) and other (staging,
+ // external) but could use more buffer properties (including usage/export
+ // flags) to better isolate the different usage patterns and keep the pools
+ // operating with reasonable limits. We should be using the |pool| arg.
+ CUmemoryPool memory_pool =
+ iree_all_bits_set(params.type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)
+ ? pools->device_local
+ : pools->other;
+
+ CUdeviceptr device_ptr = 0;
+ iree_status_t status = CU_RESULT_TO_STATUS(
+ pools->context->syms,
+ cuMemAllocFromPoolAsync(&device_ptr, (size_t)allocation_size, memory_pool,
+ stream),
+ "cuMemAllocFromPoolAsync");
+
+ // Wrap the allocated CUDA buffer in a HAL buffer.
+ // NOTE: we don't provide a device allocator because we didn't allocate from
+ // one and instead we use a release callback to perform the free if the user
+ // doesn't dealloca the buffer.
+ iree_hal_buffer_t* buffer = NULL;
+ if (iree_status_is_ok(status)) {
+ iree_hal_buffer_release_callback_t release_callback = {
+ .fn = iree_hal_cuda_async_buffer_release_callback,
+ .user_data = pools,
+ };
+ status = iree_hal_cuda_buffer_wrap(
+ /*device_allocator=*/NULL, params.type, params.access, params.usage,
+ allocation_size, /*byte_offset=*/0,
+ /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC,
+ device_ptr, /*host_ptr=*/NULL, release_callback,
+ pools->context->host_allocator, &buffer);
+ }
+
+ if (iree_status_is_ok(status)) {
+ // Update statistics (note that it may not yet be accurate).
+ iree_hal_cuda_memory_pool_track_alloc(pools, buffer);
+ *out_buffer = buffer;
+ } else if (buffer) {
+ iree_hal_buffer_release(buffer);
+ } else {
+ CUDA_IGNORE_ERROR(pools->context->syms, cuMemFreeAsync(device_ptr, stream));
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+iree_status_t iree_hal_cuda_memory_pools_dealloca(
+ iree_hal_cuda_memory_pools_t* pools, CUstream stream,
+ iree_hal_buffer_t* buffer) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(
+ z0, (int64_t)iree_hal_buffer_allocation_size(buffer));
+
+ // Try to schedule the buffer for freeing.
+ CUdeviceptr device_ptr = iree_hal_cuda_buffer_device_pointer(buffer);
+ iree_status_t status =
+ CU_RESULT_TO_STATUS(pools->context->syms,
+ cuMemFreeAsync(device_ptr, stream), "cuMemFreeAsync");
+
+ // Drop the release callback so that we don't try to double-free the buffer.
+ iree_hal_cuda_buffer_drop_release_callback(buffer);
+
+ // Update statistics (note that it may not yet be accurate).
+ iree_hal_cuda_memory_pool_track_free(pools, buffer);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
diff --git a/runtime/src/iree/hal/drivers/cuda/memory_pools.h b/runtime/src/iree/hal/drivers/cuda/memory_pools.h
new file mode 100644
index 0000000..328a7b9
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/cuda/memory_pools.h
@@ -0,0 +1,71 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
+#define IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/atomics.h"
+#include "iree/hal/api.h"
+#include "iree/hal/drivers/cuda/api.h"
+#include "iree/hal/drivers/cuda/context_wrapper.h"
+#include "iree/hal/drivers/cuda/cuda_headers.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Retained CUDA memory pools for various allocation types.
+typedef struct iree_hal_cuda_memory_pools_t {
+ // CUDA context the pools are attached to.
+ iree_hal_cuda_context_wrapper_t* context;
+ // Used exclusively for DEVICE_LOCAL allocations.
+ CUmemoryPool device_local;
+ // Used for any host-visible/host-local memory types.
+ CUmemoryPool other;
+
+ IREE_STATISTICS(struct {
+ iree_atomic_int64_t device_bytes_allocated;
+ iree_atomic_int64_t device_bytes_freed;
+ iree_atomic_int64_t host_bytes_allocated;
+ iree_atomic_int64_t host_bytes_freed;
+ } statistics;)
+} iree_hal_cuda_memory_pools_t;
+
+// Initializes |out_pools| by configuring new CUDA memory pools.
+iree_status_t iree_hal_cuda_memory_pools_initialize(
+ iree_hal_cuda_context_wrapper_t* context,
+ const iree_hal_cuda_memory_pooling_params_t* pooling_params,
+ iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools);
+
+// Deinitializes the |pools| and releases the underlying CUDA resources.
+void iree_hal_cuda_memory_pools_deinitialize(
+ iree_hal_cuda_memory_pools_t* pools);
+
+// Merges statistics information from |pools| into |statistics|.
+void iree_hal_cuda_memory_pools_merge_statistics(
+ iree_hal_cuda_memory_pools_t* pools,
+ iree_hal_allocator_statistics_t* statistics);
+
+// Asynchronously allocates a buffer from an appropriate pool.
+// The allocation will be stream-ordered on |stream|.
+iree_status_t iree_hal_cuda_memory_pools_alloca(
+ iree_hal_cuda_memory_pools_t* pools, CUstream stream,
+ iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params,
+ iree_device_size_t allocation_size,
+ iree_hal_buffer_t** IREE_RESTRICT out_buffer);
+
+// Asynchronously deallocates a buffer from its pool.
+// The deallocation will be stream-ordered on |stream|.
+iree_status_t iree_hal_cuda_memory_pools_dealloca(
+ iree_hal_cuda_memory_pools_t* pools, CUstream stream,
+ iree_hal_buffer_t* buffer);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_DRIVERS_CUDA_MEMORY_POOLS_H_