[cuda] Fix segfault caused by CUevent outliving CUdevice (#14875) We use `iree_hal_cuda2_event_t` to wrap a `CUevent` so that multiple places can reference the same `CUevent`, with the resource released automatically when the refcount drops to zero. The `iree_hal_cuda2_event_t` objects are further placed in a pool to amortize the cost of creation via recycling. However, as currently set up, the pool's lifetime is tied to the HAL device, so the pool is destroyed together with the device, along with the CUDA context and related state. All events that are in the pool at that point are destroyed, but there is no proper handling of outstanding `iree_hal_cuda2_event_t` objects in this flow. If an `iree_hal_cuda2_event_t`'s refcount drops to zero after device destruction, it causes a segfault in the CUDA driver because the CUDA context is already gone. This commit fixes the issue by retaining the pool in the `iree_hal_cuda2_event_t` object and retaining the HAL device in the pool.

commit: 6e0481682151f9843ba4a79e42b77eae9bd28224 [log] [tgz]
author: Lei Zhang <antiagainst@google.com> Tue Aug 29 21:54:32 2023 -0400
committer: GitHub <noreply@github.com> Tue Aug 29 18:54:32 2023 -0700
tree: 295c572c39ab24213ba96e97ab50d8cfdd68f37e
parent: 8105859455d5111247078aae7903a424a82fae16 [diff] [blame]
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index 034feae..370b693 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c

@@ -132,9 +132,6 @@
     CUstream dispatch_stream, CUstream callback_stream, CUcontext context,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
     const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
-    iree_event_pool_t* host_event_pool,
-    iree_hal_cuda2_event_pool_t* device_event_pool,
-    iree_hal_cuda2_timepoint_pool_t* timepoint_pool,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
   iree_hal_cuda2_device_t* device = NULL;
   iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
@@ -159,9 +156,6 @@
   device->dispatch_cu_stream = dispatch_stream;
   device->callback_cu_stream = callback_stream;
   device->host_allocator = host_allocator;
-  device->host_event_pool = host_event_pool;
-  device->device_event_pool = device_event_pool;
-  device->timepoint_pool = timepoint_pool;
 
   iree_status_t status = iree_hal_cuda2_pending_queue_actions_create(
       cuda_symbols, &device->block_pool, host_allocator,
@@ -245,6 +239,17 @@
         cuda_symbols, cuStreamCreate(&callback_stream, CU_STREAM_NON_BLOCKING));
   }
 
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_device_create_internal(
+        driver, identifier, params, device, dispatch_stream, callback_stream,
+        context, cuda_symbols, nccl_symbols, host_allocator, out_device);
+  } else {
+    // Release resources we have acquired thus far.
+    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
+    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
+    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+  }
+
   iree_event_pool_t* host_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_event_pool_allocate(params->event_pool_capacity,
@@ -254,7 +259,7 @@
   iree_hal_cuda2_event_pool_t* device_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_cuda2_event_pool_allocate(
-        cuda_symbols, params->event_pool_capacity, host_allocator,
+        *out_device, cuda_symbols, params->event_pool_capacity, host_allocator,
         &device_event_pool);
   }
 
@@ -266,19 +271,18 @@
   }
 
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda2_device_create_internal(
-        driver, identifier, params, device, dispatch_stream, callback_stream,
-        context, cuda_symbols, nccl_symbols, host_event_pool, device_event_pool,
-        timepoint_pool, host_allocator, out_device);
-  }
-
-  if (!iree_status_is_ok(status)) {
+    iree_hal_cuda2_device_t* cuda_device =
+        iree_hal_cuda2_device_cast(*out_device);
+    cuda_device->host_event_pool = host_event_pool;
+    cuda_device->device_event_pool = device_event_pool;
+    cuda_device->timepoint_pool = timepoint_pool;
+  } else {
+    // Release resources we have acquired after HAL device creation.
     if (timepoint_pool) iree_hal_cuda2_timepoint_pool_free(timepoint_pool);
-    if (device_event_pool) iree_hal_cuda2_event_pool_free(device_event_pool);
+    if (device_event_pool) iree_hal_cuda2_event_pool_release(device_event_pool);
     if (host_event_pool) iree_event_pool_free(host_event_pool);
-    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
-    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
-    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+    // Release other resources via the HAL device.
+    iree_hal_device_release(*out_device);
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -320,9 +324,13 @@
   iree_hal_cuda2_tracing_context_free(device->tracing_context);
 
   // Destroy various pools for synchronization.
-  iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
-  iree_hal_cuda2_event_pool_free(device->device_event_pool);
-  iree_event_pool_free(device->host_event_pool);
+  if (device->timepoint_pool) {
+    iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
+  }
+  if (device->device_event_pool) {
+    iree_hal_cuda2_event_pool_release(device->device_event_pool);
+  }
+  if (device->host_event_pool) iree_event_pool_free(device->host_event_pool);
 
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->dispatch_cu_stream));
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->callback_cu_stream));
commit	6e0481682151f9843ba4a79e42b77eae9bd28224	[log] [tgz]
author	Lei Zhang <antiagainst@google.com>	Tue Aug 29 21:54:32 2023 -0400
committer	GitHub <noreply@github.com>	Tue Aug 29 18:54:32 2023 -0700
tree	295c572c39ab24213ba96e97ab50d8cfdd68f37e
parent	8105859455d5111247078aae7903a424a82fae16 [diff] [blame]