[hip][cuda] Free resources on the completion thread. (#18247)

This cleans up everything on the completion thread rather than queuing
it back up on the worker thread. This reduces our cleanup latency, and
provides a unidirectional data flow instead of moving back to the worker
thread.


![1](https://github.com/user-attachments/assets/2385bfce-d45e-4067-8d22-94761819ebce)

![2](https://github.com/user-attachments/assets/3687335b-b051-4d9a-999a-0603aaf7cac7)

Signed-off-by: Andrew Woloszyn <andrew.woloszyn@gmail.com>
diff --git a/runtime/src/iree/hal/utils/deferred_work_queue.c b/runtime/src/iree/hal/utils/deferred_work_queue.c
index 99ba702..e8ee7bf 100644
--- a/runtime/src/iree/hal/utils/deferred_work_queue.c
+++ b/runtime/src/iree/hal/utils/deferred_work_queue.c
@@ -32,17 +32,8 @@
 typedef enum iree_hal_deferred_work_queue_action_state_e {
   // The current action is active as waiting for or under execution.
   IREE_HAL_QUEUE_ACTION_STATE_ALIVE,
-  // The current action is done execution and waiting for destruction.
-  IREE_HAL_QUEUE_ACTION_STATE_ZOMBIE,
 } iree_hal_deferred_work_queue_action_state_t;
 
-// How many work items must complete in order for an action to complete.
-// We keep track of the remaining work for an action so we don't exit worker
-// threads prematurely.
-// +1 for issuing an execution of an action.
-// +1 for cleaning up a zombie action.
-static const iree_host_size_t total_work_items_to_complete_an_action = 2;
-
 // A work queue action.
 // Note that this struct does not have internal synchronization; it's expected
 // to work together with the deferred work queue, which synchronizes accesses.
@@ -812,9 +803,7 @@
                                                          action);
       // One work item is the callback that makes it across from the
       // completion thread.
-      // The other is the cleanup of the action.
-      actions->pending_work_items_count +=
-          total_work_items_to_complete_an_action;
+      actions->pending_work_items_count += 1;
     }
     iree_slim_mutex_unlock(&actions->action_mutex);
   } else {
@@ -848,20 +837,8 @@
   iree_hal_semaphore_list_fail(action->signal_semaphore_list,
                                iree_status_clone(status));
 
-  iree_host_size_t work_items_remaining = 0;
-  switch (action->state) {
-    case IREE_HAL_QUEUE_ACTION_STATE_ALIVE:
-      work_items_remaining = total_work_items_to_complete_an_action;
-      break;
-    case IREE_HAL_QUEUE_ACTION_STATE_ZOMBIE:
-      work_items_remaining = 1;
-      break;
-    default:
-      // Someone forgot to handle all enum values?
-      iree_abort();
-  }
   iree_slim_mutex_lock(&actions->action_mutex);
-  action->owning_actions->pending_work_items_count -= work_items_remaining;
+  action->owning_actions->pending_work_items_count -= 1;
   iree_hal_deferred_work_queue_fail_status_locked(actions, status);
   iree_hal_deferred_work_queue_action_destroy(action);
 }
@@ -944,8 +921,6 @@
       (iree_hal_deferred_work_queue_action_t*)user_data;
   IREE_ASSERT_EQ(action->kind, IREE_HAL_QUEUE_ACTION_TYPE_EXECUTION);
   IREE_ASSERT_EQ(action->state, IREE_HAL_QUEUE_ACTION_STATE_ALIVE);
-  iree_hal_deferred_work_queue_t* actions = action->owning_actions;
-
   if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
     iree_hal_deferred_work_queue_action_fail(action, status);
     IREE_TRACE_ZONE_END(z0);
@@ -963,20 +938,7 @@
     return status;
   }
 
-  // Flip the action state to zombie and enqueue it again so that we can let
-  // the worker thread clean it up. Note that this is necessary because cleanup
-  // may involve GPU API calls like buffer releasing or unregistering, so we can
-  // not inline it here.
-  action->state = IREE_HAL_QUEUE_ACTION_STATE_ZOMBIE;
-  iree_slim_mutex_lock(&actions->action_mutex);
-  iree_hal_deferred_work_queue_action_list_push_back(&actions->action_list,
-                                                     action);
-  // The callback (work item) is complete.
-  --actions->pending_work_items_count;
-  iree_slim_mutex_unlock(&actions->action_mutex);
-
-  // We need to trigger execution of this action again, so it gets cleaned up.
-  status = iree_hal_deferred_work_queue_issue(actions);
+  iree_hal_deferred_work_queue_action_destroy(action);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
@@ -1115,21 +1077,6 @@
   return iree_ok_status();
 }
 
-// Performs the given cleanup |action| on the CPU.
-static void iree_hal_deferred_work_queue_issue_cleanup(
-    iree_hal_deferred_work_queue_action_t* action) {
-  iree_hal_deferred_work_queue_t* actions = action->owning_actions;
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  iree_hal_deferred_work_queue_action_destroy(action);
-
-  // Now we fully executed and cleaned up this action. Decrease the work items
-  // counter.
-  iree_hal_deferred_work_queue_decrement_work_items_count(actions);
-
-  IREE_TRACE_ZONE_END(z0);
-}
-
 iree_status_t iree_hal_deferred_work_queue_issue(
     iree_hal_deferred_work_queue_t* actions) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -1338,9 +1285,6 @@
         case IREE_HAL_QUEUE_ACTION_STATE_ALIVE:
           status = iree_hal_deferred_work_queue_issue_execution(action);
           break;
-        case IREE_HAL_QUEUE_ACTION_STATE_ZOMBIE:
-          iree_hal_deferred_work_queue_issue_cleanup(action);
-          break;
       }
 
       if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
@@ -1349,6 +1293,7 @@
             &actions->working_area.ready_worklist, entry);
         break;
       }
+      iree_hal_deferred_work_queue_decrement_work_items_count(actions);
     }
 
     if (IREE_UNLIKELY(!iree_status_is_ok(status))) {