[cuda] Fix segfault caused by CUevent outliving CUdevice (#14875) We use `iree_hal_cuda2_event_t` to wrap a `CUevent` so that we can have multiple places referencing the `CUevent` with automatic resource releasing when refcount becomes zero. The `iree_hal_cuda2_event_t` objects are further placed in a pool to amortize the cost of creation via recycling. But how it is set up right now, the pool's lifetime is associated with the HAL device. So it will be destroyed together with the device, including the CUDA context and so. All events that are in the pool will be destroyed; but there is no proper handling of outstanding `iree_hal_cuda2_event_t` in this flow. If a `iree_hal_cuda2_event_t`'s refcount decreases to zero after device destruction, it will cause segfaults in CUDA driver given the CUDA context is already gone. So this commit fixes the issue by retaining the pool in the `iree_hal_cuda2_event_t` object and retaining the HAL device in the pool.

commit: 6e0481682151f9843ba4a79e42b77eae9bd28224 [log] [tgz]
author: Lei Zhang <antiagainst@google.com> Tue Aug 29 21:54:32 2023 -0400
committer: GitHub <noreply@github.com> Tue Aug 29 18:54:32 2023 -0700
tree: 295c572c39ab24213ba96e97ab50d8cfdd68f37e
parent: 8105859455d5111247078aae7903a424a82fae16 [diff]
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index 034feae..370b693 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c

@@ -132,9 +132,6 @@
     CUstream dispatch_stream, CUstream callback_stream, CUcontext context,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
     const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
-    iree_event_pool_t* host_event_pool,
-    iree_hal_cuda2_event_pool_t* device_event_pool,
-    iree_hal_cuda2_timepoint_pool_t* timepoint_pool,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
   iree_hal_cuda2_device_t* device = NULL;
   iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
@@ -159,9 +156,6 @@
   device->dispatch_cu_stream = dispatch_stream;
   device->callback_cu_stream = callback_stream;
   device->host_allocator = host_allocator;
-  device->host_event_pool = host_event_pool;
-  device->device_event_pool = device_event_pool;
-  device->timepoint_pool = timepoint_pool;
 
   iree_status_t status = iree_hal_cuda2_pending_queue_actions_create(
       cuda_symbols, &device->block_pool, host_allocator,
@@ -245,6 +239,17 @@
         cuda_symbols, cuStreamCreate(&callback_stream, CU_STREAM_NON_BLOCKING));
   }
 
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_device_create_internal(
+        driver, identifier, params, device, dispatch_stream, callback_stream,
+        context, cuda_symbols, nccl_symbols, host_allocator, out_device);
+  } else {
+    // Release resources we have accquired thus far.
+    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
+    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
+    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+  }
+
   iree_event_pool_t* host_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_event_pool_allocate(params->event_pool_capacity,
@@ -254,7 +259,7 @@
   iree_hal_cuda2_event_pool_t* device_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_cuda2_event_pool_allocate(
-        cuda_symbols, params->event_pool_capacity, host_allocator,
+        *out_device, cuda_symbols, params->event_pool_capacity, host_allocator,
         &device_event_pool);
   }
 
@@ -266,19 +271,18 @@
   }
 
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda2_device_create_internal(
-        driver, identifier, params, device, dispatch_stream, callback_stream,
-        context, cuda_symbols, nccl_symbols, host_event_pool, device_event_pool,
-        timepoint_pool, host_allocator, out_device);
-  }
-
-  if (!iree_status_is_ok(status)) {
+    iree_hal_cuda2_device_t* cuda_device =
+        iree_hal_cuda2_device_cast(*out_device);
+    cuda_device->host_event_pool = host_event_pool;
+    cuda_device->device_event_pool = device_event_pool;
+    cuda_device->timepoint_pool = timepoint_pool;
+  } else {
+    // Release resources we have accquired after HAL device creation.
     if (timepoint_pool) iree_hal_cuda2_timepoint_pool_free(timepoint_pool);
-    if (device_event_pool) iree_hal_cuda2_event_pool_free(device_event_pool);
+    if (device_event_pool) iree_hal_cuda2_event_pool_release(device_event_pool);
     if (host_event_pool) iree_event_pool_free(host_event_pool);
-    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
-    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
-    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+    // Release other resources via the HAL device.
+    iree_hal_device_release(*out_device);
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -320,9 +324,13 @@
   iree_hal_cuda2_tracing_context_free(device->tracing_context);
 
   // Destroy various pools for synchronization.
-  iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
-  iree_hal_cuda2_event_pool_free(device->device_event_pool);
-  iree_event_pool_free(device->host_event_pool);
+  if (device->timepoint_pool) {
+    iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
+  }
+  if (device->device_event_pool) {
+    iree_hal_cuda2_event_pool_release(device->device_event_pool);
+  }
+  if (device->host_event_pool) iree_event_pool_free(device->host_event_pool);
 
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->dispatch_cu_stream));
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->callback_cu_stream));

diff --git a/experimental/cuda2/event_pool.c b/experimental/cuda2/event_pool.c
index d90bf79..11d7f78 100644
--- a/experimental/cuda2/event_pool.c
+++ b/experimental/cuda2/event_pool.c

@@ -22,21 +22,22 @@
 //===----------------------------------------------------------------------===//
 
 struct iree_hal_cuda2_event_t {
-  // The allocator used to create the event.
-  iree_allocator_t host_allocator;
-  // The symbols used to create and destroy CUevent objects.
-  const iree_hal_cuda2_dynamic_symbols_t* symbols;
-
-  // The event pool that owns this event. This cannot be NULL.
-  iree_hal_cuda2_event_pool_t* pool;
-  // The underlying CUevent object.
-  CUevent cu_event;
-
   // A reference count used to manage resource lifetime. Its value range:
   // * 1 - when inside the event pool and to be acquired;
   // * >= 1 - when acquired outside of the event pool;
   // * 0 - when before releasing back to the pool or destruction.
   iree_atomic_ref_count_t ref_count;
+
+  // The allocator used to create the event.
+  iree_allocator_t host_allocator;
+  // The symbols used to create and destroy CUevent objects.
+  const iree_hal_cuda2_dynamic_symbols_t* symbols;
+
+  // The event pool that owns this event. This cannot be NULL. We retain it to
+  // make sure the event outlive the pool.
+  iree_hal_cuda2_event_pool_t* pool;
+  // The underlying CUevent object.
+  CUevent cu_event;
 };
 
 CUevent iree_hal_cuda2_event_handle(const iree_hal_cuda2_event_t* event) {
@@ -69,17 +70,18 @@
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0,
       iree_allocator_malloc(host_allocator, sizeof(*event), (void**)&event));
+  iree_atomic_ref_count_init(&event->ref_count);  // -> 1
   event->host_allocator = host_allocator;
   event->symbols = symbols;
   event->pool = pool;
   event->cu_event = NULL;
-  iree_atomic_ref_count_init(&event->ref_count);  // -> 1
 
   iree_status_t status = IREE_CURESULT_TO_STATUS(
       symbols, cuEventCreate(&event->cu_event, CU_EVENT_DISABLE_TIMING),
       "cuEventCreate");
   if (iree_status_is_ok(status)) {
     *out_event = event;
+    iree_hal_cuda2_event_pool_retain(pool);  // +1
   } else {
     iree_atomic_ref_count_dec(&event->ref_count);  // -> 0
     iree_hal_cuda2_event_destroy(event);
@@ -93,14 +95,17 @@
   iree_atomic_ref_count_inc(&event->ref_count);
 }
 
-static void iree_hal_cuda2_event_pool_release(
+static void iree_hal_cuda2_event_pool_release_event(
     iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
     iree_hal_cuda2_event_t** events);
 
 void iree_hal_cuda2_event_release(iree_hal_cuda2_event_t* event) {
   if (iree_atomic_ref_count_dec(&event->ref_count) == 1) {
+    iree_hal_cuda2_event_pool_t* pool = event->pool;
     // Release back to the pool if the reference count becomes 0.
-    iree_hal_cuda2_event_pool_release(event->pool, 1, &event);
+    iree_hal_cuda2_event_pool_release_event(pool, 1, &event);
+    // Drop our reference to the pool itself.
+    iree_hal_cuda2_event_pool_release(pool);  // -1
   }
 }
 
@@ -109,11 +114,18 @@
 //===----------------------------------------------------------------------===//
 
 struct iree_hal_cuda2_event_pool_t {
+  // A reference count used to manage resource lifetime.
+  iree_atomic_ref_count_t ref_count;
+
   // The allocator used to create the event pool.
   iree_allocator_t host_allocator;
   // The symbols used to create and destroy CUevent objects.
   const iree_hal_cuda2_dynamic_symbols_t* symbols;
 
+  // The HAL device that owns this pool. This cannot be NULL. We retain it to
+  // make sure the pool outlive the device.
+  iree_hal_device_t* owning_device;
+
   // Guards event related fields in the pool. We don't expect a performant
   // program to frequently allocate events for synchronization purposes; the
   // traffic to this pool should be low. So it should be fine to use mutex to
@@ -131,7 +143,11 @@
 };
 // + Additional inline allocation for holding events up to the capacity.
 
+static void iree_hal_cuda2_event_pool_free(
+    iree_hal_cuda2_event_pool_t* event_pool);
+
 iree_status_t iree_hal_cuda2_event_pool_allocate(
+    iree_hal_device_t* owning_device,
     const iree_hal_cuda2_dynamic_symbols_t* symbols,
     iree_host_size_t available_capacity, iree_allocator_t host_allocator,
     iree_hal_cuda2_event_pool_t** out_event_pool) {
@@ -147,6 +163,7 @@
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0,
       iree_allocator_malloc(host_allocator, total_size, (void**)&event_pool));
+  iree_atomic_ref_count_init(&event_pool->ref_count);  // -> 1
   event_pool->host_allocator = host_allocator;
   event_pool->symbols = symbols;
   iree_slim_mutex_initialize(&event_pool->event_mutex);
@@ -163,6 +180,7 @@
 
   if (iree_status_is_ok(status)) {
     *out_event_pool = event_pool;
+    iree_hal_device_retain(owning_device);  // +1
   } else {
     iree_hal_cuda2_event_pool_free(event_pool);
   }
@@ -170,8 +188,10 @@
   return status;
 }
 
-void iree_hal_cuda2_event_pool_free(iree_hal_cuda2_event_pool_t* event_pool) {
+static void iree_hal_cuda2_event_pool_free(
+    iree_hal_cuda2_event_pool_t* event_pool) {
   iree_allocator_t host_allocator = event_pool->host_allocator;
+  iree_hal_device_t* device = event_pool->owning_device;
   IREE_TRACE_ZONE_BEGIN(z0);
 
   for (iree_host_size_t i = 0; i < event_pool->available_count; ++i) {
@@ -179,12 +199,28 @@
     iree_atomic_ref_count_dec(&event->ref_count);  // -> 0
     iree_hal_cuda2_event_destroy(event);
   }
+  IREE_ASSERT_REF_COUNT_ZERO(&event_pool->ref_count);
+
   iree_slim_mutex_deinitialize(&event_pool->event_mutex);
   iree_allocator_free(host_allocator, event_pool);
 
+  // Now release the owning device.
+  iree_hal_device_release(device);  // -1
+
   IREE_TRACE_ZONE_END(z0);
 }
 
+void iree_hal_cuda2_event_pool_retain(iree_hal_cuda2_event_pool_t* event_pool) {
+  iree_atomic_ref_count_inc(&event_pool->ref_count);
+}
+
+void iree_hal_cuda2_event_pool_release(
+    iree_hal_cuda2_event_pool_t* event_pool) {
+  if (iree_atomic_ref_count_dec(&event_pool->ref_count) == 1) {
+    iree_hal_cuda2_event_pool_free(event_pool);
+  }
+}
+
 iree_status_t iree_hal_cuda2_event_pool_acquire(
     iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
     iree_hal_cuda2_event_t** out_events) {
@@ -221,8 +257,8 @@
                                            &out_events[from_pool_count + i]);
       if (!iree_status_is_ok(status)) {
         // Must release all events we've acquired so far.
-        iree_hal_cuda2_event_pool_release(event_pool, from_pool_count + i,
-                                          out_events);
+        iree_hal_cuda2_event_pool_release_event(event_pool, from_pool_count + i,
+                                                out_events);
         IREE_TRACE_ZONE_END(z1);
         IREE_TRACE_ZONE_END(z0);
         return status;
@@ -235,7 +271,7 @@
   return iree_ok_status();
 }
 
-static void iree_hal_cuda2_event_pool_release(
+static void iree_hal_cuda2_event_pool_release_event(
     iree_hal_cuda2_event_pool_t* event_pool, iree_host_size_t event_count,
     iree_hal_cuda2_event_t** events) {
   IREE_ASSERT_ARGUMENT(event_pool);

diff --git a/experimental/cuda2/event_pool.h b/experimental/cuda2/event_pool.h
index 50f9624..f1e4974 100644
--- a/experimental/cuda2/event_pool.h
+++ b/experimental/cuda2/event_pool.h

@@ -9,6 +9,7 @@
 
 #include "experimental/cuda2/cuda_dynamic_symbols.h"
 #include "iree/base/api.h"
+#include "iree/hal/api.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -52,15 +53,18 @@
 // Extra events requested beyond the capability are directly created and
 // destroyed without pooling.
 iree_status_t iree_hal_cuda2_event_pool_allocate(
+    iree_hal_device_t* owning_device,
     const iree_hal_cuda2_dynamic_symbols_t* symbols,
     iree_host_size_t available_capacity, iree_allocator_t host_allocator,
     iree_hal_cuda2_event_pool_t** out_event_pool);
 
-// Deallocates an event pool and destroys all events.
+// Retains the given |event_pool| by increasing its reference count.
+void iree_hal_cuda2_event_pool_retain(iree_hal_cuda2_event_pool_t* event_pool);
+
+// Releases the given |event_pool| by decreasing its reference count.
 //
-// All events that were acquired from the pool must have already been released
-// back to it prior to deallocation.
-void iree_hal_cuda2_event_pool_free(iree_hal_cuda2_event_pool_t* event_pool);
+// Once the |event_pool|'s reference count becomes zero, it will be freed.
+void iree_hal_cuda2_event_pool_release(iree_hal_cuda2_event_pool_t* event_pool);
 
 // Acquires one or more events from the event pool.
 //
commit	6e0481682151f9843ba4a79e42b77eae9bd28224	[log] [tgz]
author	Lei Zhang <antiagainst@google.com>	Tue Aug 29 21:54:32 2023 -0400
committer	GitHub <noreply@github.com>	Tue Aug 29 18:54:32 2023 -0700
tree	295c572c39ab24213ba96e97ab50d8cfdd68f37e
parent	8105859455d5111247078aae7903a424a82fae16 [diff]