[cuda] Fix segfault caused by CUevent outliving CUdevice (#14875)

We use `iree_hal_cuda2_event_t` to wrap a `CUevent` so that multiple
places can reference the same `CUevent`, with the underlying resource
released automatically when the refcount drops to zero.

The `iree_hal_cuda2_event_t` objects are further placed in a pool to
amortize the cost of creation via recycling. However, as currently set
up, the pool's lifetime is tied to the HAL device, so the pool is
destroyed together with the device (including the CUDA context). All
events in the pool are destroyed at that point, but there is no proper
handling of outstanding `iree_hal_cuda2_event_t` objects in this flow.

If an `iree_hal_cuda2_event_t`'s refcount decreases to zero after device
destruction, it will cause a segfault in the CUDA driver, given that the
CUDA context is already gone.

So this commit fixes the issue by retaining the pool in the
`iree_hal_cuda2_event_t` object and retaining the HAL device in the
pool.
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index 034feae..370b693 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c
@@ -132,9 +132,6 @@
     CUstream dispatch_stream, CUstream callback_stream, CUcontext context,
     const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols,
     const iree_hal_cuda2_nccl_dynamic_symbols_t* nccl_symbols,
-    iree_event_pool_t* host_event_pool,
-    iree_hal_cuda2_event_pool_t* device_event_pool,
-    iree_hal_cuda2_timepoint_pool_t* timepoint_pool,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
   iree_hal_cuda2_device_t* device = NULL;
   iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size;
@@ -159,9 +156,6 @@
   device->dispatch_cu_stream = dispatch_stream;
   device->callback_cu_stream = callback_stream;
   device->host_allocator = host_allocator;
-  device->host_event_pool = host_event_pool;
-  device->device_event_pool = device_event_pool;
-  device->timepoint_pool = timepoint_pool;
 
   iree_status_t status = iree_hal_cuda2_pending_queue_actions_create(
       cuda_symbols, &device->block_pool, host_allocator,
@@ -245,6 +239,17 @@
         cuda_symbols, cuStreamCreate(&callback_stream, CU_STREAM_NON_BLOCKING));
   }
 
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_cuda2_device_create_internal(
+        driver, identifier, params, device, dispatch_stream, callback_stream,
+        context, cuda_symbols, nccl_symbols, host_allocator, out_device);
+  } else {
+    // Release resources we have acquired thus far.
+    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
+    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
+    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+  }
+
   iree_event_pool_t* host_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_event_pool_allocate(params->event_pool_capacity,
@@ -254,7 +259,7 @@
   iree_hal_cuda2_event_pool_t* device_event_pool = NULL;
   if (iree_status_is_ok(status)) {
     status = iree_hal_cuda2_event_pool_allocate(
-        cuda_symbols, params->event_pool_capacity, host_allocator,
+        *out_device, cuda_symbols, params->event_pool_capacity, host_allocator,
         &device_event_pool);
   }
 
@@ -266,19 +271,18 @@
   }
 
   if (iree_status_is_ok(status)) {
-    status = iree_hal_cuda2_device_create_internal(
-        driver, identifier, params, device, dispatch_stream, callback_stream,
-        context, cuda_symbols, nccl_symbols, host_event_pool, device_event_pool,
-        timepoint_pool, host_allocator, out_device);
-  }
-
-  if (!iree_status_is_ok(status)) {
+    iree_hal_cuda2_device_t* cuda_device =
+        iree_hal_cuda2_device_cast(*out_device);
+    cuda_device->host_event_pool = host_event_pool;
+    cuda_device->device_event_pool = device_event_pool;
+    cuda_device->timepoint_pool = timepoint_pool;
+  } else {
+    // Release resources we have acquired after HAL device creation.
     if (timepoint_pool) iree_hal_cuda2_timepoint_pool_free(timepoint_pool);
-    if (device_event_pool) iree_hal_cuda2_event_pool_free(device_event_pool);
+    if (device_event_pool) iree_hal_cuda2_event_pool_release(device_event_pool);
     if (host_event_pool) iree_event_pool_free(host_event_pool);
-    if (callback_stream) cuda_symbols->cuStreamDestroy(callback_stream);
-    if (dispatch_stream) cuda_symbols->cuStreamDestroy(dispatch_stream);
-    if (context) cuda_symbols->cuDevicePrimaryCtxRelease(device);
+    // Release other resources via the HAL device.
+    iree_hal_device_release(*out_device);
   }
 
   IREE_TRACE_ZONE_END(z0);
@@ -320,9 +324,13 @@
   iree_hal_cuda2_tracing_context_free(device->tracing_context);
 
   // Destroy various pools for synchronization.
-  iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
-  iree_hal_cuda2_event_pool_free(device->device_event_pool);
-  iree_event_pool_free(device->host_event_pool);
+  if (device->timepoint_pool) {
+    iree_hal_cuda2_timepoint_pool_free(device->timepoint_pool);
+  }
+  if (device->device_event_pool) {
+    iree_hal_cuda2_event_pool_release(device->device_event_pool);
+  }
+  if (device->host_event_pool) iree_event_pool_free(device->host_event_pool);
 
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->dispatch_cu_stream));
   IREE_CUDA_IGNORE_ERROR(symbols, cuStreamDestroy(device->callback_cu_stream));