[hip] Switched the completion list to use a list instead of a stack. (#18048)
This was sometimes causing completion events to come in out of
order.
This was previously merged into the shared/sdxl_quantized branch.
Working on PR to move all of the updates made in hip over to cuda.
---------
Signed-off-by: Andrew Woloszyn <andrew.woloszyn@gmail.com>
diff --git a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
index be52cb3..eec5acd 100644
--- a/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
+++ b/runtime/src/iree/hal/drivers/hip/pending_queue_actions.c
@@ -184,19 +184,68 @@
//===----------------------------------------------------------------------===//
// Ready action atomic slist entry struct.
-typedef struct iree_hal_hip_atomic_slist_entry_t {
+typedef struct iree_hal_hip_entry_list_node_t {
iree_hal_hip_queue_action_t* ready_list_head;
- iree_atomic_slist_intrusive_ptr_t slist_next;
-} iree_hal_hip_atomic_slist_entry_t;
+ struct iree_hal_hip_entry_list_node_t* next;
+} iree_hal_hip_entry_list_node_t;
-// Ready action atomic slist.
-IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_hal_hip_ready_action,
- iree_hal_hip_atomic_slist_entry_t,
- offsetof(iree_hal_hip_atomic_slist_entry_t,
- slist_next));
+typedef struct iree_hal_hip_entry_list_t {
+ iree_slim_mutex_t guard_mutex;
+
+ iree_hal_hip_entry_list_node_t* head IREE_GUARDED_BY(guard_mutex);
+ iree_hal_hip_entry_list_node_t* tail IREE_GUARDED_BY(guard_mutex);
+} iree_hal_hip_entry_list_t;
+
+static iree_hal_hip_entry_list_node_t* iree_hal_hip_entry_list_pop(
+ iree_hal_hip_entry_list_t* list) {
+ iree_hal_hip_entry_list_node_t* out = NULL;
+ iree_slim_mutex_lock(&list->guard_mutex);
+ if (list->head) {
+ out = list->head;
+ list->head = list->head->next;
+ if (out == list->tail) {
+ list->tail = NULL;
+ }
+ }
+ iree_slim_mutex_unlock(&list->guard_mutex);
+ return out;
+}
+
+void iree_hal_hip_entry_list_push(iree_hal_hip_entry_list_t* list,
+ iree_hal_hip_entry_list_node_t* next) {
+ iree_slim_mutex_lock(&list->guard_mutex);
+ next->next = NULL;
+ if (list->tail) {
+ list->tail->next = next;
+ list->tail = next;
+ } else {
+ list->head = next;
+ list->tail = next;
+ }
+ iree_slim_mutex_unlock(&list->guard_mutex);
+}
+
+static void iree_hal_hip_ready_action_list_deinitialize(
+ iree_hal_hip_entry_list_t* list, iree_allocator_t host_allocator) {
+ iree_hal_hip_entry_list_node_t* head = list->head;
+ while (head) {
+ if (!head) break;
+ iree_hal_hip_queue_action_list_destroy(head->ready_list_head);
+ list->head = head->next;
+ iree_allocator_free(host_allocator, head);
+ }
+ iree_slim_mutex_deinitialize(&list->guard_mutex);
+}
+
+static void iree_hal_hip_ready_action_list_initialize(
+ iree_hal_hip_entry_list_t* list) {
+ list->head = NULL;
+ list->tail = NULL;
+ iree_slim_mutex_initialize(&list->guard_mutex);
+}
// Ready action atomic slist entry struct.
-typedef struct iree_hal_hip_atomic_slist_completion_t {
+typedef struct iree_hal_hip_completion_list_node_t {
// The callback and user data for that callback. To be called
// when the associated event has completed.
iree_status_t (*callback)(void* user_data);
@@ -206,45 +255,69 @@
// If this event was created just for the completion thread, and therefore
// needs to be cleaned up.
bool created_event;
- iree_atomic_slist_intrusive_ptr_t slist_next;
-} iree_hal_hip_atomic_slist_completion_t;
+ struct iree_hal_hip_completion_list_node_t* next;
+} iree_hal_hip_completion_list_node_t;
-// Ready action atomic slist.
-IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_hal_hip_completion,
- iree_hal_hip_atomic_slist_completion_t,
- offsetof(iree_hal_hip_atomic_slist_completion_t,
- slist_next));
+typedef struct iree_hal_hip_completion_list_t {
+ iree_slim_mutex_t guard_mutex;
+ iree_hal_hip_completion_list_node_t* head IREE_GUARDED_BY(guard_mutex);
+ iree_hal_hip_completion_list_node_t* tail IREE_GUARDED_BY(guard_mutex);
+} iree_hal_hip_completion_list_t;
-static void iree_hal_hip_ready_action_slist_destroy(
- iree_hal_hip_ready_action_slist_t* list, iree_allocator_t host_allocator) {
- while (true) {
- iree_hal_hip_atomic_slist_entry_t* entry =
- iree_hal_hip_ready_action_slist_pop(list);
- if (!entry) break;
- iree_hal_hip_queue_action_list_destroy(entry->ready_list_head);
- iree_allocator_free(host_allocator, entry);
+static iree_hal_hip_completion_list_node_t* iree_hal_hip_completion_list_pop(
+ iree_hal_hip_completion_list_t* list) {
+ iree_hal_hip_completion_list_node_t* out = NULL;
+ iree_slim_mutex_lock(&list->guard_mutex);
+ if (list->head) {
+ out = list->head;
+ list->head = list->head->next;
+ if (out == list->tail) {
+ list->tail = NULL;
+ }
}
- iree_hal_hip_ready_action_slist_deinitialize(list);
+ iree_slim_mutex_unlock(&list->guard_mutex);
+ return out;
}
-static void iree_hal_hip_completion_slist_destroy(
- iree_hal_hip_completion_slist_t* list,
+void iree_hal_hip_completion_list_push(
+ iree_hal_hip_completion_list_t* list,
+ iree_hal_hip_completion_list_node_t* next) {
+ iree_slim_mutex_lock(&list->guard_mutex);
+ next->next = NULL;
+ if (list->tail) {
+ list->tail->next = next;
+ list->tail = next;
+ } else {
+ list->head = next;
+ list->tail = next;
+ }
+ iree_slim_mutex_unlock(&list->guard_mutex);
+}
+
+static void iree_hal_hip_completion_list_initialize(
+ iree_hal_hip_completion_list_t* list) {
+ list->head = NULL;
+ list->tail = NULL;
+ iree_slim_mutex_initialize(&list->guard_mutex);
+}
+
+static void iree_hal_hip_completion_list_deinitialize(
+ iree_hal_hip_completion_list_t* list,
const iree_hal_hip_dynamic_symbols_t* symbols,
iree_allocator_t host_allocator) {
- while (true) {
- iree_hal_hip_atomic_slist_completion_t* entry =
- iree_hal_hip_completion_slist_pop(list);
- if (!entry) break;
- if (entry->created_event) {
- IREE_HIP_IGNORE_ERROR(symbols, hipEventDestroy(entry->event));
+ iree_hal_hip_completion_list_node_t* head = list->head;
+ while (head) {
+ if (head->created_event) {
+ IREE_HIP_IGNORE_ERROR(symbols, hipEventDestroy(head->event));
}
- iree_allocator_free(host_allocator, entry);
+ list->head = list->head->next;
+ iree_allocator_free(host_allocator, head);
}
- iree_hal_hip_completion_slist_deinitialize(list);
+ iree_slim_mutex_deinitialize(&list->guard_mutex);
}
static iree_hal_hip_queue_action_t* iree_hal_hip_atomic_slist_entry_pop_front(
- iree_hal_hip_atomic_slist_entry_t* list) {
+ iree_hal_hip_entry_list_node_t* list) {
IREE_ASSERT(list->ready_list_head);
iree_hal_hip_queue_action_t* action = list->ready_list_head;
@@ -289,8 +362,8 @@
// Notification to the parent thread to indicate the worker committed exiting.
// TODO: maybe remove this. We can just wait on the worker thread to exit.
iree_notification_t exit_notification;
- iree_hal_hip_ready_action_slist_t ready_worklist; // atomic
- iree_atomic_int32_t worker_state; // atomic
+ iree_hal_hip_entry_list_t ready_worklist;
+ iree_atomic_int32_t worker_state; // atomic
// TODO: use status to provide more context for the error.
iree_atomic_intptr_t error_code; // atomic
@@ -323,8 +396,8 @@
iree_notification_t state_notification;
// Notification to the parent thread to indicate the worker committed exiting.
iree_notification_t exit_notification;
- iree_hal_hip_completion_slist_t completion_list; // atomic
- iree_atomic_int32_t worker_state; // atomic
+ iree_hal_hip_completion_list_t completion_list;
+ iree_atomic_int32_t worker_state; // atomic
iree_atomic_intptr_t error_code; // atomic
@@ -348,7 +421,7 @@
iree_hal_hip_working_area_t* working_area) {
iree_notification_initialize(&working_area->state_notification);
iree_notification_initialize(&working_area->exit_notification);
- iree_hal_hip_ready_action_slist_initialize(&working_area->ready_worklist);
+ iree_hal_hip_ready_action_list_initialize(&working_area->ready_worklist);
iree_atomic_store_int32(&working_area->worker_state,
IREE_HAL_HIP_WORKER_STATE_IDLE_WAITING,
iree_memory_order_release);
@@ -365,8 +438,8 @@
static void iree_hal_hip_working_area_deinitialize(
iree_hal_hip_working_area_t* working_area) {
- iree_hal_hip_ready_action_slist_destroy(&working_area->ready_worklist,
- working_area->host_allocator);
+ iree_hal_hip_ready_action_list_deinitialize(&working_area->ready_worklist,
+ working_area->host_allocator);
iree_notification_deinitialize(&working_area->exit_notification);
iree_notification_deinitialize(&working_area->state_notification);
iree_slim_mutex_deinitialize(&working_area->pending_work_items_count_mutex);
@@ -380,7 +453,7 @@
iree_hal_hip_completion_area_t* completion_area) {
iree_notification_initialize(&completion_area->state_notification);
iree_notification_initialize(&completion_area->exit_notification);
- iree_hal_hip_completion_slist_initialize(&completion_area->completion_list);
+ iree_hal_hip_completion_list_initialize(&completion_area->completion_list);
iree_atomic_store_int32(&completion_area->worker_state,
IREE_HAL_HIP_WORKER_STATE_IDLE_WAITING,
iree_memory_order_release);
@@ -397,9 +470,9 @@
static void iree_hal_hip_completion_area_deinitialize(
iree_hal_hip_completion_area_t* completion_area) {
- iree_hal_hip_completion_slist_destroy(&completion_area->completion_list,
- completion_area->symbols,
- completion_area->host_allocator);
+ iree_hal_hip_completion_list_deinitialize(&completion_area->completion_list,
+ completion_area->symbols,
+ completion_area->host_allocator);
iree_notification_deinitialize(&completion_area->exit_notification);
iree_notification_deinitialize(&completion_area->state_notification);
iree_slim_mutex_deinitialize(
@@ -525,6 +598,8 @@
static bool iree_hal_hip_worker_committed_exiting(
iree_hal_hip_working_area_t* working_area);
+static bool iree_hal_hip_completion_committed_exiting(
+ iree_hal_hip_completion_area_t* working_area);
void iree_hal_hip_pending_queue_actions_destroy(
iree_hal_resource_t* base_actions) {
@@ -567,7 +642,7 @@
// Wait until the worker acknowledged exiting.
iree_notification_await(
&completion_area->exit_notification,
- (iree_condition_fn_t)iree_hal_hip_worker_committed_exiting,
+ (iree_condition_fn_t)iree_hal_hip_completion_committed_exiting,
completion_area, iree_infinite_timeout());
}
@@ -997,7 +1072,7 @@
iree_slim_mutex_unlock(
&action->owning_actions->working_area.pending_work_items_count_mutex);
- iree_hal_hip_atomic_slist_completion_t* entry = NULL;
+ iree_hal_hip_completion_list_node_t* entry = NULL;
// TODO: avoid host allocator malloc; use some pool for the allocation.
iree_status_t status = iree_allocator_malloc(
action->owning_actions->host_allocator, sizeof(*entry), (void**)&entry);
@@ -1013,7 +1088,8 @@
entry->created_event = created_event;
entry->callback = iree_hal_hip_execution_device_signal_host_callback;
entry->user_data = action;
- iree_hal_hip_completion_slist_push(
+
+ iree_hal_hip_completion_list_push(
&action->owning_actions->completion_area.completion_list, entry);
iree_slim_mutex_lock(
@@ -1167,7 +1243,7 @@
return status;
}
- iree_hal_hip_atomic_slist_entry_t* entry = NULL;
+ iree_hal_hip_entry_list_node_t* entry = NULL;
// TODO: avoid host allocator malloc; use some pool for the allocation.
if (iree_status_is_ok(status)) {
status = iree_allocator_malloc(actions->host_allocator, sizeof(*entry),
@@ -1185,8 +1261,7 @@
// Now push the ready list to the worker and have it to issue the actions to
// the GPU.
entry->ready_list_head = ready_list.head;
- iree_hal_hip_ready_action_slist_push(&actions->working_area.ready_worklist,
- entry);
+ iree_hal_hip_entry_list_push(&actions->working_area.ready_worklist, entry);
// We can only overwrite the worker state if the previous state is idle
// waiting; we cannot overwrite exit related states. so we need to perform
@@ -1241,23 +1316,28 @@
IREE_HAL_HIP_WORKER_STATE_EXIT_COMMITTED;
}
+static bool iree_hal_hip_completion_committed_exiting(
+ iree_hal_hip_completion_area_t* working_area) {
+ return iree_atomic_load_int32(&working_area->worker_state,
+ iree_memory_order_acquire) ==
+ IREE_HAL_HIP_WORKER_STATE_EXIT_COMMITTED;
+}
+
// Processes all ready actions in the given |worklist|.
static iree_status_t iree_hal_hip_worker_process_ready_list(
- iree_allocator_t host_allocator,
- iree_hal_hip_ready_action_slist_t* worklist) {
+ iree_allocator_t host_allocator, iree_hal_hip_entry_list_t* worklist) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_status_t status = iree_ok_status();
while (true) {
- iree_hal_hip_atomic_slist_entry_t* entry =
- iree_hal_hip_ready_action_slist_pop(worklist);
+ iree_hal_hip_entry_list_node_t* entry =
+ iree_hal_hip_entry_list_pop(worklist);
if (!entry) break;
// Process the current batch of ready actions.
while (entry->ready_list_head) {
iree_hal_hip_queue_action_t* action =
iree_hal_hip_atomic_slist_entry_pop_front(entry);
-
switch (action->state) {
case IREE_HAL_HIP_QUEUE_ACTION_STATE_ALIVE:
status = iree_hal_hip_pending_queue_actions_issue_execution(action);
@@ -1276,7 +1356,7 @@
// Let common destruction path take care of destroying the worklist.
// When we know all host stream callbacks are done and not touching
// anything.
- iree_hal_hip_ready_action_slist_push(worklist, entry);
+ iree_hal_hip_entry_list_push(worklist, entry);
break;
}
@@ -1331,12 +1411,12 @@
}
static iree_status_t iree_hal_hip_worker_process_completion(
- iree_hal_hip_completion_slist_t* worklist,
+ iree_hal_hip_completion_list_t* worklist,
iree_hal_hip_completion_area_t* completion_area) {
iree_status_t status = iree_ok_status();
while (true) {
- iree_hal_hip_atomic_slist_completion_t* entry =
- iree_hal_hip_completion_slist_pop(worklist);
+ iree_hal_hip_completion_list_node_t* entry =
+ iree_hal_hip_completion_list_pop(worklist);
if (!entry) break;
IREE_TRACE_ZONE_BEGIN_NAMED(z1, "hipEventSynchronize");
@@ -1347,7 +1427,7 @@
// Let common destruction path take care of destroying the worklist.
// When we know all host stream callbacks are done and not touching
// anything.
- iree_hal_hip_completion_slist_push(worklist, entry);
+ iree_hal_hip_completion_list_push(worklist, entry);
status =
iree_make_status(IREE_STATUS_ABORTED, "could not wait on hip event");
break;
@@ -1373,7 +1453,7 @@
// The main function for the completion worker thread.
static int iree_hal_hip_completion_execute(
iree_hal_hip_completion_area_t* completion_area) {
- iree_hal_hip_completion_slist_t* worklist = &completion_area->completion_list;
+ iree_hal_hip_completion_list_t* worklist = &completion_area->completion_list;
iree_status_t status = IREE_HIP_RESULT_TO_STATUS(
completion_area->symbols, hipSetDevice(completion_area->device),
@@ -1459,7 +1539,7 @@
// The main function for the ready-list processing worker thread.
static int iree_hal_hip_worker_execute(
iree_hal_hip_working_area_t* working_area) {
- iree_hal_hip_ready_action_slist_t* worklist = &working_area->ready_worklist;
+ iree_hal_hip_entry_list_t* worklist = &working_area->ready_worklist;
// Hip stores thread-local data based on the device. Some hip commands pull
// the device from there, and it defaults to device 0 (e.g. hipEventCreate),