Adding task system utilization tracing. (#13941)

Adds one new plot per executor in a process showing a 0-100% utilization
value as tracked by the workers (vs the CPU usage, which is sampled at a
much lower rate).


![image](https://github.com/openxla/iree/assets/75337/7fc353d2-3fe8-4e33-8b60-056751231381)
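
The plotted value is derived from the executor's worker idle mask rather than
OS-level sampling: whenever a worker wakes to take work or runs out of work it
updates the mask and replots. A minimal sketch of that math, distilled from
the `worker.c` change below with simplified names (`__builtin_popcountll`
standing in for `iree_task_affinity_set_count_ones`, atomics elided):

```c
#include <stdint.h>

// Utilization implied by the idle mask just after this worker cleared its
// idle bit: it is about to run work, so one fewer worker is idle.
static float plot_value_after_wake(uint64_t old_idle_mask, int worker_count) {
  int idle_now = __builtin_popcountll(old_idle_mask) - 1;
  return 100.0f - 100.0f * idle_now / (float)worker_count;
}

// Utilization implied by the idle mask just after this worker set its idle
// bit: it ran out of scheduled work, so one more worker is idle.
static float plot_value_after_idle(uint64_t old_idle_mask, int worker_count) {
  int idle_now = __builtin_popcountll(old_idle_mask) + 1;
  return 100.0f - 100.0f * idle_now / (float)worker_count;
}
```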

Inspecting the area under/over the curve is an easy way to spot
dispatches whose work distribution could be improved:

![image](https://github.com/openxla/iree/assets/75337/1383ad5d-c802-43ce-9352-e584eb040921)
(here the dispatch covers 2x the wall time at only 1x the utilization,
meaning with perfect distribution we could go 2x faster by doubling the
utilization)
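
As a hypothetical worked example of that reading: 10ms of wall time at an
average 50% utilization is only ~5ms of actual work, so perfectly distributed
the same work would finish in ~5ms. In general the attainable speedup is
roughly the reciprocal of the average utilization over the dispatch span:

```c
// Rough attainable speedup from perfect distribution, assuming the average
// utilization over the dispatch span is representative of the whole dispatch.
static float attainable_speedup(float avg_utilization_pct) {
  return 100.0f / avg_utilization_pct;  // 50% -> 2.0x, 25% -> 4.0x
}
```
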
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index 18e8cab..e6389dd 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -284,11 +284,11 @@
   // A list of semaphores to signal upon retiring.
   iree_hal_semaphore_list_t signal_semaphores;
 
-  // Command buffers retained until all have retired.
+  // Resources retained until all have retired.
   // We could release them earlier but that would require tracking individual
-  // command buffer task completion.
-  iree_host_size_t command_buffer_count;
-  iree_hal_command_buffer_t* command_buffers[];
+  // resource-level completion.
+  iree_host_size_t resource_count;
+  iree_hal_resource_t* resources[];
 } iree_hal_task_queue_retire_cmd_t;
 
 // Retires a submission by signaling semaphores to their desired value and
@@ -303,9 +303,9 @@
-  // Release command buffers now that all are known to have retired.
+  // Release resources now that all are known to have retired.
   // We do this before signaling so that waiting threads can immediately reuse
   // resources that are released.
-  for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
-    iree_hal_command_buffer_release(cmd->command_buffers[i]);
-    cmd->command_buffers[i] = NULL;
+  for (iree_host_size_t i = 0; i < cmd->resource_count; ++i) {
+    iree_hal_resource_release(cmd->resources[i]);
+    cmd->resources[i] = NULL;
   }
 
   // Signal all semaphores to their new values.
@@ -331,12 +331,12 @@
   iree_hal_task_queue_retire_cmd_t* cmd =
       (iree_hal_task_queue_retire_cmd_t*)task;
 
-  // Release command buffers now that all are known to have retired.
+  // Release resources now that all are known to have retired.
   // In success cases we try to do this eagerly to allow for more potential
   // reuse but during full/partial failures they may still be live here.
-  for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
-    iree_hal_command_buffer_release(cmd->command_buffers[i]);
-    cmd->command_buffers[i] = NULL;
+  for (iree_host_size_t i = 0; i < cmd->resource_count; ++i) {
+    iree_hal_resource_release(cmd->resources[i]);
+    cmd->resources[i] = NULL;
   }
 
   // If the command failed then fail all semaphores to ensure future
@@ -361,8 +361,8 @@
 // The command will own an arena that can be used for other submission-related
 // allocations.
 static iree_status_t iree_hal_task_queue_retire_cmd_allocate(
-    iree_task_scope_t* scope, iree_host_size_t command_buffer_count,
-    iree_hal_command_buffer_t* const* command_buffers,
+    iree_task_scope_t* scope, iree_host_size_t resource_count,
+    iree_hal_resource_t* const* resources,
     const iree_hal_semaphore_list_t* signal_semaphores,
     iree_arena_block_pool_t* block_pool,
     iree_hal_task_queue_retire_cmd_t** out_cmd) {
@@ -373,7 +373,7 @@
   // Allocate the command from the arena.
   iree_hal_task_queue_retire_cmd_t* cmd = NULL;
   iree_host_size_t total_cmd_size =
-      sizeof(*cmd) + command_buffer_count * sizeof(*cmd->command_buffers);
+      sizeof(*cmd) + resource_count * sizeof(*cmd->resources);
   iree_status_t status =
       iree_arena_allocate(&arena, total_cmd_size, (void**)&cmd);
   if (iree_status_is_ok(status)) {
@@ -396,10 +396,10 @@
     memcpy(&cmd->arena, &arena, sizeof(cmd->arena));
 
-    // Retain command buffers.
+    // Retain resources.
-    cmd->command_buffer_count = command_buffer_count;
-    for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
-      cmd->command_buffers[i] = command_buffers[i];
-      iree_hal_command_buffer_retain(cmd->command_buffers[i]);
+    cmd->resource_count = resource_count;
+    for (iree_host_size_t i = 0; i < resource_count; ++i) {
+      cmd->resources[i] = resources[i];
+      iree_hal_resource_retain(cmd->resources[i]);
     }
 
     *out_cmd = cmd;
@@ -458,7 +458,8 @@
   // arena which we will use to allocate all other commands.
   iree_hal_task_queue_retire_cmd_t* retire_cmd = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_task_queue_retire_cmd_allocate(
-      &queue->scope, batch->command_buffer_count, batch->command_buffers,
+      &queue->scope, batch->command_buffer_count,
+      (iree_hal_resource_t* const*)batch->command_buffers,
       &batch->signal_semaphores, queue->block_pool, &retire_cmd));
 
   // NOTE: if we fail from here on we must drop the retire_cmd arena.
@@ -484,7 +485,7 @@
   // After this task completes the commands have been issued but have not yet
   // completed and the issued commands may complete in any order.
   iree_hal_task_queue_issue_cmd_t* issue_cmd = NULL;
-  if (iree_status_is_ok(status)) {
+  if (iree_status_is_ok(status) && batch->command_buffer_count > 0) {
     status = iree_hal_task_queue_issue_cmd_allocate(
         &queue->scope, queue, &retire_cmd->task.header,
         batch->command_buffer_count, batch->command_buffers, &retire_cmd->arena,
@@ -501,14 +502,15 @@
   iree_task_submission_initialize(&submission);
 
   // Sequencing: wait on semaphores or go directly into the executor queue.
+  iree_task_t* head_task =
+      issue_cmd ? &issue_cmd->task.header : &retire_cmd->task.header;
   if (wait_cmd != NULL) {
     // Ensure that we only issue command buffers after all waits have completed.
-    iree_task_set_completion_task(&wait_cmd->task.header,
-                                  &issue_cmd->task.header);
+    iree_task_set_completion_task(&wait_cmd->task.header, head_task);
     iree_task_submission_enqueue(&submission, &wait_cmd->task.header);
   } else {
     // No waits needed; directly enqueue.
-    iree_task_submission_enqueue(&submission, &issue_cmd->task.header);
+    iree_task_submission_enqueue(&submission, head_task);
   }
 
   // Submit the tasks immediately. The executor may queue them up until we
diff --git a/runtime/src/iree/task/affinity_set.h b/runtime/src/iree/task/affinity_set.h
index bf2bf6a..3dbf756 100644
--- a/runtime/src/iree/task/affinity_set.h
+++ b/runtime/src/iree/task/affinity_set.h
@@ -44,12 +44,14 @@
   return UINT64_MAX;
 }
 
-#define iree_task_affinity_set_count_leading_zeros \
-  iree_math_count_leading_zeros_u64
-#define iree_task_affinity_set_count_trailing_zeros \
-  iree_math_count_trailing_zeros_u64
-#define iree_task_affinity_set_count_ones iree_math_count_ones_u64
-#define iree_task_affinity_set_rotr iree_math_rotr_u64
+#define iree_task_affinity_set_ones(count) \
+  (0xFFFFFFFFFFFFFFFFull >> (64 - (count)))
+#define iree_task_affinity_set_count_leading_zeros(set) \
+  iree_math_count_leading_zeros_u64(set)
+#define iree_task_affinity_set_count_trailing_zeros(set) \
+  iree_math_count_trailing_zeros_u64(set)
+#define iree_task_affinity_set_count_ones(set) iree_math_count_ones_u64(set)
+#define iree_task_affinity_set_rotr(set, count) iree_math_rotr_u64(set, count)
 
 //===----------------------------------------------------------------------===//
 // iree_atomic_task_affinity_set_t
diff --git a/runtime/src/iree/task/executor.c b/runtime/src/iree/task/executor.c
index 4b25af5..498305a 100644
--- a/runtime/src/iree/task/executor.c
+++ b/runtime/src/iree/task/executor.c
@@ -10,6 +10,7 @@
 #include <stddef.h>
 #include <string.h>
 
+#include "iree/base/internal/debugging.h"
 #include "iree/base/internal/math.h"
 #include "iree/base/tracing.h"
 #include "iree/task/affinity_set.h"
@@ -81,6 +82,23 @@
   iree_atomic_task_slist_initialize(&executor->incoming_ready_slist);
   iree_slim_mutex_initialize(&executor->coordinator_mutex);
 
+  IREE_TRACE({
+    static iree_atomic_int32_t executor_id = IREE_ATOMIC_VAR_INIT(0);
+    char trace_name[32];
+    int trace_name_length =
+        snprintf(trace_name, sizeof(trace_name), "iree-executor-%d",
+                 iree_atomic_fetch_add_int32(&executor_id, 1,
+                                             iree_memory_order_seq_cst));
+    IREE_LEAK_CHECK_DISABLE_PUSH();
+    executor->trace_name = malloc(trace_name_length + 1);
+    memcpy((void*)executor->trace_name, trace_name, trace_name_length + 1);
+    IREE_LEAK_CHECK_DISABLE_POP();
+    IREE_TRACE_SET_PLOT_TYPE(executor->trace_name,
+                             IREE_TRACING_PLOT_TYPE_PERCENTAGE, /*step=*/true,
+                             /*fill=*/true, /*color=*/0xFF1F883Du);
+    IREE_TRACE_PLOT_VALUE_F32(executor->trace_name, 0.0f);
+  });
+
   // Simple PRNG used to generate seeds for the per-worker PRNGs used to
   // distribute work. This isn't strong (and doesn't need to be); it's just
   // enough to ensure each worker gets a sufficiently random seed for itself to
@@ -135,13 +153,10 @@
     uint8_t* worker_local_memory =
         (uint8_t*)executor->workers + worker_list_size;
 
-    iree_task_affinity_set_t worker_idle_mask = 0;
-    iree_task_affinity_set_t worker_live_mask = 0;
-    for (iree_host_size_t i = 0; i < worker_count; ++i) {
-      iree_task_affinity_set_t worker_bit = iree_task_affinity_for_worker(i);
-      worker_idle_mask |= worker_bit;
-      worker_live_mask |= worker_bit;
+    iree_task_affinity_set_t worker_mask =
+        iree_task_affinity_set_ones(worker_count);
 
+    for (iree_host_size_t i = 0; i < worker_count; ++i) {
       iree_task_worker_t* worker = &executor->workers[i];
       status = iree_task_worker_initialize(
           executor, i, iree_task_topology_get_group(topology, i),
@@ -152,13 +167,11 @@
       worker_local_memory += options.worker_local_memory_size;
       if (!iree_status_is_ok(status)) break;
     }
-    // The masks are accessed with 'relaxed' order because they are just hints.
+
     iree_atomic_task_affinity_set_store(&executor->worker_idle_mask,
-                                        worker_idle_mask,
-                                        iree_memory_order_relaxed);
+                                        worker_mask, iree_memory_order_release);
     iree_atomic_task_affinity_set_store(&executor->worker_live_mask,
-                                        worker_live_mask,
-                                        iree_memory_order_relaxed);
+                                        worker_mask, iree_memory_order_release);
   }
 
   if (!iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/task/executor_impl.h b/runtime/src/iree/task/executor_impl.h
index db00add..cfcf15c 100644
--- a/runtime/src/iree/task/executor_impl.h
+++ b/runtime/src/iree/task/executor_impl.h
@@ -30,6 +30,11 @@
   iree_atomic_ref_count_t ref_count;
   iree_allocator_t allocator;
 
+  // Leaked dynamically allocated name used for tracing calls.
+  // This pointer - once allocated - will be valid for the lifetime of the
+  // process and can be used for IREE_TRACE plotting/allocation calls.
+  IREE_TRACE(const char* trace_name;)
+
   // Defines how work is selected across queues.
   // TODO(benvanik): make mutable; currently always the same reserved value.
   iree_task_scheduling_mode_t scheduling_mode;
diff --git a/runtime/src/iree/task/worker.c b/runtime/src/iree/task/worker.c
index 55a8433..9627330 100644
--- a/runtime/src/iree/task/worker.c
+++ b/runtime/src/iree/task/worker.c
@@ -276,9 +276,16 @@
     iree_wait_token_t wait_token =
         iree_notification_prepare_wait(&worker->wake_notification);
     // The masks are accessed with 'relaxed' order because they are just hints.
-    iree_atomic_task_affinity_set_fetch_and(&worker->executor->worker_idle_mask,
-                                            ~worker->worker_bit,
-                                            iree_memory_order_relaxed);
+    iree_task_affinity_set_t old_idle_mask =
+        iree_atomic_task_affinity_set_fetch_and(
+            &worker->executor->worker_idle_mask, ~worker->worker_bit,
+            iree_memory_order_relaxed);
+    (void)old_idle_mask;
+    IREE_TRACE_PLOT_VALUE_F32(
+        worker->executor->trace_name,
+        100.0f - 100.0f *
+                     (iree_task_affinity_set_count_ones(old_idle_mask) - 1) /
+                     (float)worker->executor->worker_count);
 
     // Check state to see if we've been asked to exit.
     if (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) ==
@@ -309,9 +316,15 @@
     // We've finished all the work we have scheduled so set our idle flag.
     // This ensures that if any other thread comes in and wants to give us
     // work we will properly coordinate/wake below.
-    iree_atomic_task_affinity_set_fetch_or(&worker->executor->worker_idle_mask,
-                                           worker->worker_bit,
-                                           iree_memory_order_relaxed);
+    old_idle_mask = iree_atomic_task_affinity_set_fetch_or(
+        &worker->executor->worker_idle_mask, worker->worker_bit,
+        iree_memory_order_relaxed);
+    (void)old_idle_mask;
+    IREE_TRACE_PLOT_VALUE_F32(
+        worker->executor->trace_name,
+        100.0f - 100.0f *
+                     (iree_task_affinity_set_count_ones(old_idle_mask) + 1) /
+                     (float)worker->executor->worker_count);
 
     // When we encounter a complete lack of work we can self-nominate to check
     // the global work queue and distribute work to other threads. Only one