Adding task system utilization tracing. (#13941)
Adds one new plot per executor in a process showing a 0-100% utilization
value as tracked by the workers (vs the CPU usage, which is sampled at a
much lower rate).

Inspecting the area under/over the curve is an easy way to spot
dispatches whose work distribution could be improved:

(here the dispatch spans 2x the wall time at only 1x the utilization,
meaning with perfect distribution we could run 2x faster by doubling the
utilization)
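
As a worked example of reading the plot (illustrative numbers, not from a
real trace): if the average utilization across a dispatch is 50%, perfect
distribution of the same work would halve the wall time.

    // Illustrative only: how the average utilization bounds potential speedup.
    #include <stdio.h>

    int main(void) {
      const float wall_time_ms = 10.0f;    // hypothetical measured span
      const float avg_utilization = 0.5f;  // hypothetical average from the plot
      const float ideal_time_ms = wall_time_ms * avg_utilization;
      printf("potential speedup: %.1fx (%.1fms -> %.1fms)\n",
             wall_time_ms / ideal_time_ms, wall_time_ms, ideal_time_ms);
      return 0;
    }
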
diff --git a/runtime/src/iree/hal/drivers/local_task/task_queue.c b/runtime/src/iree/hal/drivers/local_task/task_queue.c
index 18e8cab..e6389dd 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_queue.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_queue.c
@@ -284,11 +284,11 @@
// A list of semaphores to signal upon retiring.
iree_hal_semaphore_list_t signal_semaphores;
- // Command buffers retained until all have retired.
+ // Resources retained until all have retired.
// We could release them earlier but that would require tracking individual
- // command buffer task completion.
- iree_host_size_t command_buffer_count;
- iree_hal_command_buffer_t* command_buffers[];
+ // resource-level completion.
+ iree_host_size_t resource_count;
+ iree_hal_resource_t* resources[];
} iree_hal_task_queue_retire_cmd_t;
// Retires a submission by signaling semaphores to their desired value and
@@ -303,9 +303,9 @@
// Release command buffers now that all are known to have retired.
// We do this before signaling so that waiting threads can immediately reuse
// resources that are released.
- for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
- iree_hal_command_buffer_release(cmd->command_buffers[i]);
- cmd->command_buffers[i] = NULL;
+ for (iree_host_size_t i = 0; i < cmd->resource_count; ++i) {
+ iree_hal_resource_release(cmd->resources[i]);
+ cmd->resources[i] = NULL;
}
// Signal all semaphores to their new values.
@@ -331,12 +331,12 @@
iree_hal_task_queue_retire_cmd_t* cmd =
(iree_hal_task_queue_retire_cmd_t*)task;
- // Release command buffers now that all are known to have retired.
+ // Release resources now that all are known to have retired.
// In success cases we try to do this eagerly to allow for more potential
// reuse but during full/partial failures they may still be live here.
- for (iree_host_size_t i = 0; i < cmd->command_buffer_count; ++i) {
- iree_hal_command_buffer_release(cmd->command_buffers[i]);
- cmd->command_buffers[i] = NULL;
+ for (iree_host_size_t i = 0; i < cmd->resource_count; ++i) {
+ iree_hal_resource_release(cmd->resources[i]);
+ cmd->resources[i] = NULL;
}
// If the command failed then fail all semaphores to ensure future
@@ -361,8 +361,8 @@
// The command will own an arena that can be used for other submission-related
// allocations.
static iree_status_t iree_hal_task_queue_retire_cmd_allocate(
- iree_task_scope_t* scope, iree_host_size_t command_buffer_count,
- iree_hal_command_buffer_t* const* command_buffers,
+ iree_task_scope_t* scope, iree_host_size_t resource_count,
+ iree_hal_resource_t* const* resources,
const iree_hal_semaphore_list_t* signal_semaphores,
iree_arena_block_pool_t* block_pool,
iree_hal_task_queue_retire_cmd_t** out_cmd) {
@@ -373,7 +373,7 @@
// Allocate the command from the arena.
iree_hal_task_queue_retire_cmd_t* cmd = NULL;
iree_host_size_t total_cmd_size =
- sizeof(*cmd) + command_buffer_count * sizeof(*cmd->command_buffers);
+ sizeof(*cmd) + resource_count * sizeof(*cmd->resources);
iree_status_t status =
iree_arena_allocate(&arena, total_cmd_size, (void**)&cmd);
if (iree_status_is_ok(status)) {
@@ -396,10 +396,10 @@
memcpy(&cmd->arena, &arena, sizeof(cmd->arena));
// Retain command buffers.
- cmd->command_buffer_count = command_buffer_count;
- for (iree_host_size_t i = 0; i < command_buffer_count; ++i) {
- cmd->command_buffers[i] = command_buffers[i];
- iree_hal_command_buffer_retain(cmd->command_buffers[i]);
+ cmd->resource_count = resource_count;
+ for (iree_host_size_t i = 0; i < resource_count; ++i) {
+ cmd->resources[i] = resources[i];
+ iree_hal_resource_retain(cmd->resources[i]);
}
*out_cmd = cmd;
@@ -458,7 +458,8 @@
// arena which we will use to allocate all other commands.
iree_hal_task_queue_retire_cmd_t* retire_cmd = NULL;
IREE_RETURN_IF_ERROR(iree_hal_task_queue_retire_cmd_allocate(
- &queue->scope, batch->command_buffer_count, batch->command_buffers,
+ &queue->scope, batch->command_buffer_count,
+ (iree_hal_resource_t* const*)batch->command_buffers,
&batch->signal_semaphores, queue->block_pool, &retire_cmd));
// NOTE: if we fail from here on we must drop the retire_cmd arena.
@@ -484,7 +485,7 @@
// After this task completes the commands have been issued but have not yet
// completed and the issued commands may complete in any order.
iree_hal_task_queue_issue_cmd_t* issue_cmd = NULL;
- if (iree_status_is_ok(status)) {
+ if (iree_status_is_ok(status) && batch->command_buffer_count > 0) {
status = iree_hal_task_queue_issue_cmd_allocate(
&queue->scope, queue, &retire_cmd->task.header,
batch->command_buffer_count, batch->command_buffers, &retire_cmd->arena,
@@ -501,14 +502,15 @@
iree_task_submission_initialize(&submission);
// Sequencing: wait on semaphores or go directly into the executor queue.
+ iree_task_t* head_task =
+ issue_cmd ? &issue_cmd->task.header : &retire_cmd->task.header;
if (wait_cmd != NULL) {
// Ensure that we only issue command buffers after all waits have completed.
- iree_task_set_completion_task(&wait_cmd->task.header,
- &issue_cmd->task.header);
+ iree_task_set_completion_task(&wait_cmd->task.header, head_task);
iree_task_submission_enqueue(&submission, &wait_cmd->task.header);
} else {
// No waits needed; directly enqueue.
- iree_task_submission_enqueue(&submission, &issue_cmd->task.header);
+ iree_task_submission_enqueue(&submission, head_task);
}
// Submit the tasks immediately. The executor may queue them up until we
diff --git a/runtime/src/iree/task/affinity_set.h b/runtime/src/iree/task/affinity_set.h
index bf2bf6a..3dbf756 100644
--- a/runtime/src/iree/task/affinity_set.h
+++ b/runtime/src/iree/task/affinity_set.h
@@ -44,12 +44,14 @@
return UINT64_MAX;
}
-#define iree_task_affinity_set_count_leading_zeros \
- iree_math_count_leading_zeros_u64
-#define iree_task_affinity_set_count_trailing_zeros \
- iree_math_count_trailing_zeros_u64
-#define iree_task_affinity_set_count_ones iree_math_count_ones_u64
-#define iree_task_affinity_set_rotr iree_math_rotr_u64
+#define iree_task_affinity_set_ones(count) \
+ (0xFFFFFFFFFFFFFFFFull >> (64 - (count)))
+#define iree_task_affinity_set_count_leading_zeros(set) \
+ iree_math_count_leading_zeros_u64(set)
+#define iree_task_affinity_set_count_trailing_zeros(set) \
+ iree_math_count_trailing_zeros_u64(set)
+#define iree_task_affinity_set_count_ones(set) iree_math_count_ones_u64(set)
+#define iree_task_affinity_set_rotr(set, count) iree_math_rotr_u64(set, count)
//===----------------------------------------------------------------------===//
// iree_atomic_task_affinity_set_t
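
For reference, a standalone sketch of what the new
iree_task_affinity_set_ones helper evaluates to (valid for
1 <= count <= 64; a count of 0 would shift by 64, which is undefined
behavior in C):

    // Illustrative only: the low `count` bits set, one bit per live worker.
    #include <assert.h>
    #include <stdint.h>

    #define iree_task_affinity_set_ones(count) \
      (0xFFFFFFFFFFFFFFFFull >> (64 - (count)))

    int main(void) {
      assert(iree_task_affinity_set_ones(1) == 0x1ull);
      assert(iree_task_affinity_set_ones(4) == 0xFull);
      assert(iree_task_affinity_set_ones(64) == UINT64_MAX);
      return 0;
    }
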
diff --git a/runtime/src/iree/task/executor.c b/runtime/src/iree/task/executor.c
index 4b25af5..498305a 100644
--- a/runtime/src/iree/task/executor.c
+++ b/runtime/src/iree/task/executor.c
@@ -10,6 +10,7 @@
#include <stddef.h>
#include <string.h>
+#include "iree/base/internal/debugging.h"
#include "iree/base/internal/math.h"
#include "iree/base/tracing.h"
#include "iree/task/affinity_set.h"
@@ -81,6 +82,23 @@
iree_atomic_task_slist_initialize(&executor->incoming_ready_slist);
iree_slim_mutex_initialize(&executor->coordinator_mutex);
+ IREE_TRACE({
+ static iree_atomic_int32_t executor_id = IREE_ATOMIC_VAR_INIT(0);
+ char trace_name[32];
+ int trace_name_length =
+ snprintf(trace_name, sizeof(trace_name), "iree-executor-%d",
+ iree_atomic_fetch_add_int32(&executor_id, 1,
+ iree_memory_order_seq_cst));
+ IREE_LEAK_CHECK_DISABLE_PUSH();
+ executor->trace_name = malloc(trace_name_length + 1);
+ memcpy((void*)executor->trace_name, trace_name, trace_name_length + 1);
+ IREE_LEAK_CHECK_DISABLE_POP();
+ IREE_TRACE_SET_PLOT_TYPE(executor->trace_name,
+ IREE_TRACING_PLOT_TYPE_PERCENTAGE, /*step=*/true,
+ /*fill=*/true, /*color=*/0xFF1F883Du);
+ IREE_TRACE_PLOT_VALUE_F32(executor->trace_name, 0.0f);
+ });
+
// Simple PRNG used to generate seeds for the per-worker PRNGs used to
// distribute work. This isn't strong (and doesn't need to be); it's just
// enough to ensure each worker gets a sufficiently random seed for itself to
@@ -135,13 +153,10 @@
uint8_t* worker_local_memory =
(uint8_t*)executor->workers + worker_list_size;
- iree_task_affinity_set_t worker_idle_mask = 0;
- iree_task_affinity_set_t worker_live_mask = 0;
- for (iree_host_size_t i = 0; i < worker_count; ++i) {
- iree_task_affinity_set_t worker_bit = iree_task_affinity_for_worker(i);
- worker_idle_mask |= worker_bit;
- worker_live_mask |= worker_bit;
+ iree_task_affinity_set_t worker_mask =
+ iree_task_affinity_set_ones(worker_count);
+ for (iree_host_size_t i = 0; i < worker_count; ++i) {
iree_task_worker_t* worker = &executor->workers[i];
status = iree_task_worker_initialize(
executor, i, iree_task_topology_get_group(topology, i),
@@ -152,13 +167,11 @@
worker_local_memory += options.worker_local_memory_size;
if (!iree_status_is_ok(status)) break;
}
- // The masks are accessed with 'relaxed' order because they are just hints.
+
iree_atomic_task_affinity_set_store(&executor->worker_idle_mask,
- worker_idle_mask,
- iree_memory_order_relaxed);
+ worker_mask, iree_memory_order_release);
iree_atomic_task_affinity_set_store(&executor->worker_live_mask,
- worker_live_mask,
- iree_memory_order_relaxed);
+ worker_mask, iree_memory_order_release);
}
if (!iree_status_is_ok(status)) {
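
The malloc above (wrapped in IREE_LEAK_CHECK_DISABLE_PUSH/POP) is never
freed on purpose: the tracing backend may keep referencing the plot name
for the remainder of the process. A minimal sketch of the same interning
pattern, using a hypothetical standalone helper:

    // Illustrative only: build a unique, process-lifetime name per instance.
    // The allocation is deliberately leaked so the pointer stays valid after
    // the owning object is destroyed.
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static const char* make_trace_name(const char* prefix, int id) {
      char buffer[32];
      int length = snprintf(buffer, sizeof(buffer), "%s-%d", prefix, id);
      char* name = (char*)malloc(length + 1);  // intentionally never freed
      memcpy(name, buffer, length + 1);
      return name;
    }

    int main(void) {
      printf("%s\n", make_trace_name("iree-executor", 0));
      return 0;
    }
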
diff --git a/runtime/src/iree/task/executor_impl.h b/runtime/src/iree/task/executor_impl.h
index db00add..cfcf15c 100644
--- a/runtime/src/iree/task/executor_impl.h
+++ b/runtime/src/iree/task/executor_impl.h
@@ -30,6 +30,11 @@
iree_atomic_ref_count_t ref_count;
iree_allocator_t allocator;
+ // Leaked dynamically allocated name used for tracing calls.
+ // This pointer - once allocated - will be valid for the lifetime of the
+ // process and can be used for IREE_TRACE plotting/allocation calls.
+ IREE_TRACE(const char* trace_name;)
+
// Defines how work is selected across queues.
// TODO(benvanik): make mutable; currently always the same reserved value.
iree_task_scheduling_mode_t scheduling_mode;
diff --git a/runtime/src/iree/task/worker.c b/runtime/src/iree/task/worker.c
index 55a8433..9627330 100644
--- a/runtime/src/iree/task/worker.c
+++ b/runtime/src/iree/task/worker.c
@@ -276,9 +276,16 @@
iree_wait_token_t wait_token =
iree_notification_prepare_wait(&worker->wake_notification);
// The masks are accessed with 'relaxed' order because they are just hints.
- iree_atomic_task_affinity_set_fetch_and(&worker->executor->worker_idle_mask,
- ~worker->worker_bit,
- iree_memory_order_relaxed);
+ iree_task_affinity_set_t old_idle_mask =
+ iree_atomic_task_affinity_set_fetch_and(
+ &worker->executor->worker_idle_mask, ~worker->worker_bit,
+ iree_memory_order_relaxed);
+ (void)old_idle_mask;
+ IREE_TRACE_PLOT_VALUE_F32(
+ worker->executor->trace_name,
+ 100.0f - 100.0f *
+ (iree_task_affinity_set_count_ones(old_idle_mask) - 1) /
+ (float)worker->executor->worker_count);
// Check state to see if we've been asked to exit.
if (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) ==
@@ -309,9 +316,15 @@
// We've finished all the work we have scheduled so set our idle flag.
// This ensures that if any other thread comes in and wants to give us
// work we will properly coordinate/wake below.
- iree_atomic_task_affinity_set_fetch_or(&worker->executor->worker_idle_mask,
- worker->worker_bit,
- iree_memory_order_relaxed);
+ old_idle_mask = iree_atomic_task_affinity_set_fetch_or(
+ &worker->executor->worker_idle_mask, worker->worker_bit,
+ iree_memory_order_relaxed);
+ (void)old_idle_mask;
+ IREE_TRACE_PLOT_VALUE_F32(
+ worker->executor->trace_name,
+ 100.0f - 100.0f *
+ (iree_task_affinity_set_count_ones(old_idle_mask) + 1) /
+ (float)worker->executor->worker_count);
// When we encounter a complete lack of work we can self-nominate to check
// the global work queue and distribute work to other threads. Only one
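
Note the -1/+1 adjustments in the plotted values above: fetch_and/fetch_or
return the idle mask from before the update, so each worker folds its own
just-changed bit into the count. A worked example (illustrative, assuming
a 4-worker executor and GCC/Clang's __builtin_popcountll):

    // Illustrative only: the utilization math from the worker loop.
    #include <stdio.h>

    int main(void) {
      const int worker_count = 4;
      unsigned long long idle_mask = 0xFull;  // all four workers idle

      // Worker 0 wakes: the atomic fetch returns the mask *before* its bit
      // is cleared, so subtract 1 to get the post-update idle count.
      unsigned long long old_idle_mask = idle_mask;
      idle_mask &= ~0x1ull;
      float utilization =
          100.0f - 100.0f * (__builtin_popcountll(old_idle_mask) - 1) /
                       (float)worker_count;
      printf("utilization after wake: %.0f%%\n", utilization);  // 25%
      return 0;
    }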