[cuda] Fix event_pool reference counting (#14900)

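There are still other reference counting issues that lead to memory
leaks (iree-run-module never destroys the CUDA device), but this one
leads to crashes caused by a heap-use-after-free.

The pool is now retained each time an event is handed out to a caller
rather than once when the event is created, so every pool release made
when an event is returned to the pool is matched by a prior retain.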
diff --git a/experimental/cuda2/event_pool.c b/experimental/cuda2/event_pool.c
index 11d7f78..5ca26ef 100644
--- a/experimental/cuda2/event_pool.c
+++ b/experimental/cuda2/event_pool.c
@@ -81,7 +81,6 @@
       "cuEventCreate");
   if (iree_status_is_ok(status)) {
     *out_event = event;
-    iree_hal_cuda2_event_pool_retain(pool);  // +1
   } else {
     iree_atomic_ref_count_dec(&event->ref_count);  // -> 0
     iree_hal_cuda2_event_destroy(event);
@@ -104,7 +103,7 @@
     iree_hal_cuda2_event_pool_t* pool = event->pool;
     // Release back to the pool if the reference count becomes 0.
     iree_hal_cuda2_event_pool_release_event(pool, 1, &event);
-    // Drop our reference to the pool itself.
+    // Drop our reference to the pool itself when we return event to it.
     iree_hal_cuda2_event_pool_release(pool);  // -1
   }
 }
@@ -267,6 +266,12 @@
     IREE_TRACE_ZONE_END(z1);
   }
 
+  // Retain a reference to a pool when we pass event to the caller. When the
+  // caller returns event back to the pool they'll release the reference.
+  for (iree_host_size_t i = 0; i < event_count; ++i) {
+    iree_hal_cuda2_event_pool_retain(out_events[i]->pool);  // +1
+  }
+
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }