// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef IREE_TASK_TASK_H_
#define IREE_TASK_TASK_H_

#include "iree/base/api.h"
#include "iree/base/atomic_slist.h"
#include "iree/base/atomics.h"
#include "iree/base/synchronization.h"
#include "iree/base/wait_handle.h"
#include "iree/task/affinity_set.h"

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

typedef struct iree_task_list_s iree_task_list_t;
typedef struct iree_task_pool_s iree_task_pool_t;
typedef struct iree_task_scope_s iree_task_scope_t;
typedef struct iree_task_submission_s iree_task_submission_t;

//==============================================================================
// Task header for internal tracking
//==============================================================================

// Specifies the type of a task and how executors handle it.
enum iree_task_type_e {
  // Task is a no-op (performs no work) and exists for flexibility.
  IREE_TASK_TYPE_NOP = 0u,

  // Task will synchronously call a function before continuing.
  IREE_TASK_TYPE_CALL = 1u,

  // Task exists only as a barrier to join/fork tasks and has no executable
  // payload.
  IREE_TASK_TYPE_BARRIER = 2u,

  // Task is a fence indicating that a certain point in the task graph has been
  // reached. All tasks prior to this fence (by way of happens-before
  // dependencies) are guaranteed to have retired.
  IREE_TASK_TYPE_FENCE = 3u,

  // Task is a wait on an external wait handle (fd, HANDLE, etc).
  // Executors will wait on the handle until it is signaled and meets the
  // specified condition prior to readying the dependent tasks.
  IREE_TASK_TYPE_WAIT = 4u,

  // Task is a 3D grid dispatch of zero or more tiles.
  // When a dispatch becomes ready it is issued either by splitting it into
  // zero or more slices of one or more tiles each (based on the workgroup
  // count) or by fanning out one shard per worker that should process the
  // dispatch.
  //
  // If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
  // workgroup count from a buffer immediately prior to fan-out instead of using
  // the values embedded in the task structure.
  //
  // After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
  // set to indicate that when the dispatch becomes ready again it will be after
  // all slices have completed.
  IREE_TASK_TYPE_DISPATCH = 5u,

  // Task is a slice of a larger contiguous dispatch tile range. The full
  // dispatch will be split into zero or more slices and each slice will be
  // posted to a particular worker for execution. If work progresses unevenly
  // then entire slices may be stolen by other workers to balance out the
  // timing. Slices retire once they have completed the tiles assigned to them.
  IREE_TASK_TYPE_DISPATCH_SLICE = 6u,

  // Task is one of potentially many shards processing a larger dispatch grid.
  // Each shard may have a preference as to which parts of the grid it will
  // focus on but is able to otherwise steal any available region directly from
  // the shared dispatch coordination state. Shards retire once there are no
  // more tiles remaining in the dispatch grid.
  IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
};
typedef uint8_t iree_task_type_t;

enum iree_task_flags_e {
  // The wait handle the task is waiting on has resolved and the task can now
  // be considered complete.
  IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,

  // The workgroup count for the dispatch is provided by way of a pointer to a
  // list of 3 uint32_t values that will be sampled immediately prior to
  // issuing the dispatch. The contents of the pointer can be safely modified
  // up until the last dependency has completed and the dispatch is about to be
  // issued.
  IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,

  // The dispatch should be sliced across workers via the low-contention
  // IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
  // to the time when the grid is sliced in exchange for savings while the grid
  // is executed.
  IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,

  // The dispatch has been issued and the task is waiting for one or more
  // slices to complete. After they complete the dispatch will be readied and
  // can be retired.
  //
  // Though added by the executor after issuing a dispatch, users can also set
  // this to indicate that all dispatch slices for a particular dispatch have
  // been statically scheduled. Executors will then skip issuing the dispatch
  // and instead wait until all slices complete, enabling IREE_TASK_TYPE_BARRIER
  // behavior without an additional task (as dispatches are still required to
  // store information for their slices).
  IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
};
typedef uint16_t iree_task_flags_t;

typedef struct iree_task_s iree_task_t;

// A function called to cleanup tasks.
// The provided |status| is unowned and must be cloned if used beyond the scope
// of the cleanup function (such as when stored for later usage).
typedef void(IREE_API_PTR* iree_task_cleanup_fn_t)(iree_task_t* task,
                                                   iree_status_t status);

// A task within the task system that runs on an executor.
// Tasks have an iree_task_type_t that defines which parameters are valid and
// how the executor is to treat the task. Dependency edges can be defined that
// determine the execution order of tasks within the executors.
struct iree_alignas(iree_max_align_t) iree_task_s {
  // Intrusive pointer used to store tasks within iree_task_list_t and
  // iree_atomic_task_list_t singly-linked lists. This must come first in the
  // structure so that it is at the appropriate alignment.
  iree_task_t* next_task;

  // The scope this task is attributed to. Errors with the task will be
  // propagated to the scope and errors in the scope will cause pending tasks to
  // be skipped.
  iree_task_scope_t* scope;

  // Optional function to call to cleanup the task on completion.
  // Will be called after the task has retired or if the task fails to issue
  // (dependency failed, etc).
  iree_task_cleanup_fn_t cleanup_fn;

  // Optional task that will be notified when this task completes.
  // The completion task will have its pending_dependency_count decremented and
  // will be readied for execution when that count reaches 0.
  iree_task_t* completion_task;

  // Specifies which workers will be used to execute this task.
  // Forked tasks will inherit their parent task affinity (possibly with some
  // task-dependent rules) to partition workloads across workers with knowledge
  // of the specific work being performed. For example, some dispatches can be
  // limited to run on certain microarchitectures that workers have affinity
  // with at the OS scheduler level (such as little.BIG topologies).
  iree_task_affinity_set_t affinity_set;

  // Total number of dependencies still outstanding. Decremented each time a
  // dependency completes. The task is considered ready to execute when this
  // value reaches 0.
  iree_atomic_int32_t pending_dependency_count;

  // Optional pool the task should be returned to after it has resolved. If the
  // task was allocated as part of a larger data structure (embedded within
  // an arena for example) then this can be NULL to prevent the task system
  // from interfering.
  iree_task_pool_t* pool;

  // Specifies the type of the task and how the executor handles it.
  iree_task_type_t type;

  // Task-specific flag bits.
  iree_task_flags_t flags;
};
static_assert(offsetof(iree_task_t, next_task) == 0,
              "next_task intrusive pointer must be at offset 0");

// Initializes a task header with the given type.
// Must be called on all tasks to ensure proper dependency tracking and list
// state prior to enqueuing. Only the task header structure is initialized;
// any additional data in the wrapping task type must be initialized by the
// caller.
void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
                          iree_task_t* out_task);

// Sets the optional function called when the task completes (whether it
// succeeds or fails).
void iree_task_set_cleanup_fn(iree_task_t* task,
                              iree_task_cleanup_fn_t cleanup_fn);

// Sets up a dependency edge from |task| to |completion_task| such that when
// |task| completes |completion_task| will be notified and have its
// pending_dependency_count decremented.
void iree_task_set_completion_task(iree_task_t* task,
                                   iree_task_t* completion_task);
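
// Example usage of iree_task_set_completion_task (a sketch; |task_a| and
// |task_b| are illustrative tasks previously initialized via this header's
// initializers and sharing a scope):
//
//  // task_b becomes ready only once task_a has retired.
//  iree_task_set_completion_task(task_a, task_b);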

// Returns true if the |task| is ready to execute immediately.
// Though this is safe to call from any thread, the test may have false
// negatives (ready tasks are not reported as ready) due to cross-thread
// synchronization latency. Note that tasks may yield themselves during
// execution and switch from ready to waiting (such as when an indirect
// dispatch needs to wait for all tiles to complete).
bool iree_task_is_ready(iree_task_t* task);

// Discards the task and any dependent tasks.
// Any dependent tasks that need to be discarded will be added to
// |discard_worklist| for the caller to continue discarding.
void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist);

//==============================================================================
// IREE_TASK_TYPE_NOP
//==============================================================================

// Task is a no-op (performs no work) and exists for flexibility.
// NOP tasks can be used to link together task lists from multiple threads
// where it may otherwise not be ideal to have heavy-weight concurrency
// structures. NOP tasks can also be useful for neutering another task type
// after it has already been recorded into a list such as when cancellations
// occur.
typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;
} iree_task_nop_t;

void iree_task_nop_initialize(iree_task_scope_t* scope,
                              iree_task_nop_t* out_task);

//==============================================================================
// IREE_TASK_TYPE_CALL
//==============================================================================

typedef iree_status_t(IREE_API_PTR* iree_task_call_closure_fn_t)(
    uintptr_t user_context, iree_task_t* task,
    iree_task_submission_t* pending_submission);

// A function closure representing the function to call and its arguments.
typedef struct {
  // Function called when the task executes.
  iree_task_call_closure_fn_t fn;

  // User-defined argument passed to the function during invocation.
  // An opaque pointer-sized value that may point to a user data structure or
  // contain an embedded value. No lifetime management is performed by the task
  // system; users must ensure the referenced memory remains live until after
  // the task has completed.
  uintptr_t user_context;

  // TODO(benvanik): cleanup function? right now assume arg is never freed.
} iree_task_call_closure_t;

// Binds a function pointer and the arguments it should be called with.
// If the arguments represent pointers they must remain live until the task
// has completed execution.
static inline iree_task_call_closure_t iree_task_make_call_closure(
    iree_task_call_closure_fn_t fn, uintptr_t user_context) {
  iree_task_call_closure_t closure = {fn, user_context};
  return closure;
}

// A task that will synchronously call a function from the executor and wait
// for it to complete before continuing.
//
// Memory referenced by closure arguments must be kept valid until the function
// executes (in general with the same lifetime as the task itself).
typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // Function closure to call when the task is executed.
  iree_task_call_closure_t closure;
} iree_task_call_t;

void iree_task_call_initialize(iree_task_scope_t* scope,
                               iree_task_call_closure_t closure,
                               iree_task_call_t* out_task);
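
// Example usage (a sketch; my_call_fn, my_state_t, and |state| are
// illustrative and not part of this API; assumes a valid
// iree_task_scope_t* scope):
//
//  static iree_status_t my_call_fn(
//      uintptr_t user_context, iree_task_t* task,
//      iree_task_submission_t* pending_submission) {
//    my_state_t* state = (my_state_t*)user_context;
//    // ... perform synchronous work against |state| ...
//    return iree_ok_status();
//  }
//
//  iree_task_call_t call_task;
//  iree_task_call_initialize(
//      scope, iree_task_make_call_closure(my_call_fn, (uintptr_t)state),
//      &call_task);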

//==============================================================================
// IREE_TASK_TYPE_BARRIER
//==============================================================================

// A join point for fork/join-style scheduling.
// References a set of dependent tasks that will be notified and possibly
// readied when the barrier is reached.
//
// This allows for modeling one-to-many and many-to-many relationships. The base
// task dependency system only models one-to-one and should be used if possible
// to avoid the additional overhead of a barrier task both in memory and task
// indirection/queuing.
//
// Example:
//  * [A] -> Barrier -> [C, D]
//    - A executes
//    - Barrier is processed after A completes
//    - C and D execute concurrently (in any order)
//
//  * [A, B] -> Barrier -> [C, D]
//    - A and B execute concurrently (in any order)
//    - Barrier is processed after both A and B complete
//    - C and D execute concurrently
//
//  * [A] -> Barrier -> [B]
//    - Don't do this; use the base task dependency instead. It'll work, but
//      it's much better to avoid the additional barrier indirection when
//      possible.
typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // Number of valid tasks in the dependent_tasks list.
  iree_host_size_t dependent_task_count;
  // [0-dependent_task_count] tasks that will be notified when the barrier is
  // reached. Each task will have its pending_dependency_count decremented and
  // will be added to the ready list when the count reaches 0.
  iree_task_t* const* dependent_tasks;
} iree_task_barrier_t;

void iree_task_barrier_initialize(iree_task_scope_t* scope,
                                  iree_host_size_t dependent_task_count,
                                  iree_task_t* const* dependent_tasks,
                                  iree_task_barrier_t* out_task);

void iree_task_barrier_initialize_empty(iree_task_scope_t* scope,
                                        iree_task_barrier_t* out_task);

void iree_task_barrier_set_dependent_tasks(
    iree_task_barrier_t* task, iree_host_size_t dependent_task_count,
    iree_task_t* const* dependent_tasks);
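
// Example usage (a sketch; |task_a|, |task_c|, and |task_d| are illustrative
// previously-initialized iree_task_call_t tasks, |scope| is a valid
// iree_task_scope_t*, and the dependent task list must remain live until the
// barrier retires):
//
//  iree_task_t* dependent_tasks[2] = {&task_c.header, &task_d.header};
//  iree_task_barrier_t barrier_task;
//  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(dependent_tasks),
//                               dependent_tasks, &barrier_task);
//  // The barrier runs only after task_a completes and then readies C and D.
//  iree_task_set_completion_task(&task_a.header, &barrier_task.header);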

//==============================================================================
// IREE_TASK_TYPE_FENCE
//==============================================================================

// A fence indicating that a certain point in the task graph has been reached.
// All tasks prior to this fence (by way of happens-before dependencies) are
// guaranteed to have retired.
//
// When all of the dependencies of a fence have retired, the fence notifies the
// task's parent scope by decrementing its pending_submissions count and
// publishing an idle_notification if it was the last in-flight submission.
typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // TODO(benvanik): user-defined fence data for semaphore signaling. Optional
  // wait_handle to signal?
} iree_task_fence_t;

void iree_task_fence_initialize(iree_task_scope_t* scope,
                                iree_task_fence_t* out_task);
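
// Example usage (a sketch; |final_task| is an illustrative iree_task_t* for
// the last task in the submission and |scope| is a valid iree_task_scope_t*):
//
//  iree_task_fence_t fence_task;
//  iree_task_fence_initialize(scope, &fence_task);
//  // The fence retires (and notifies the scope) only after final_task does.
//  iree_task_set_completion_task(final_task, &fence_task.header);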

//==============================================================================
// IREE_TASK_TYPE_WAIT
//==============================================================================

typedef struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // The external wait handle that the task is waiting on.
  // TODO(benvanik): multiple wait handles.
  iree_wait_handle_t wait_handle;

  // TODO(benvanik): deadline_ns.
  // TODO(benvanik): condition (possibly a closure to evaluate) ala condvar.
} iree_task_wait_t;

void iree_task_wait_initialize(iree_task_scope_t* scope,
                               iree_wait_handle_t wait_handle,
                               iree_task_wait_t* out_task);
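
// Example usage (a sketch; assumes |wait_handle| was obtained elsewhere, e.g.
// from an event or an imported fd, and that |scope| and |dependent_task| are
// illustrative):
//
//  iree_task_wait_t wait_task;
//  iree_task_wait_initialize(scope, wait_handle, &wait_task);
//  // dependent_task becomes ready only once the handle has been signaled.
//  iree_task_set_completion_task(&wait_task.header, dependent_task);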

//==============================================================================
// IREE_TASK_TYPE_DISPATCH_* structures
//==============================================================================

// Statistics tracked across an entire dispatch operation.
// Each tile contributes to these statistics as they execute to provide an
// aggregate set of statistics that can be reported to tracing/user queries.
//
// We want to keep this structure relatively compact as it does add overhead.
// If statistics are used purely for interactive tracing then they can be
// piped directly to the tracing tool using IREE_TRACE_* macros. If the
// statistics are programmatically queried for benchmarks or reporting then
// they belong here where we can efficiently move them around.
//
// If we find ourselves with a lot of hardware-specific counters (vs more
// generic ones like 'l2 cache misses' or 'ipc') then we can sprinkle in some
// #ifdefs.
typedef struct {
  // TODO(benvanik): statistics counters.
  iree_atomic_int32_t reserved;
} iree_task_dispatch_statistics_t;

// Merges statistics from |source| to |target| atomically per-field.
// As each field is updated independently and in a relaxed memory order it's
// possible for statistics consumers to see a tear.
void iree_task_dispatch_statistics_merge(
    const iree_task_dispatch_statistics_t* source,
    iree_task_dispatch_statistics_t* target);

typedef struct {
  // TODO(benvanik): coroutine storage.
  // Ideally we'll be able to have a fixed coroutine storage size per dispatch
  // (via @llvm.coro.size) such that we can preallocate all of the storage for
  // a dispatch in one shot. If we need to do dynamic allocation we will need a
  // ringbuffer or other kind of pool to allocate from on-demand.
  uint32_t reserved;
} iree_task_tile_storage_t;

// Per-tile context provided to each dispatch function invocation in the grid.
// This information is unique to the tile being dispatched and may contain
// specific state about the calling thread/fiber/etc.
//
// If tile execution is suspended by hitting a coroutine suspend point then the
// coroutine state will be stored within the tile context until the tile is
// resumed.
typedef iree_alignas(iree_max_align_t) struct {
  // Workgroup ID for the current invocation.
  uint32_t workgroup_xyz[3];
  // Workgroup size for each invocation.
  uint32_t workgroup_size[3];
  // Total workgroup count for the task. Can be used in conjunction with the
  // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
  uint32_t workgroup_count[3];
  // TODO(benvanik): workgroup index to amortize calculating linear offsets.
  // (like gl_GlobalInvocationID)

  // Incoherent memory shared across all invocations of the task.
  // Aligned to at least the natural pointer size of the machine. Functions must
  // use atomic operations to ensure proper memory ordering.
  iree_byte_span_t shared_memory;

  // Shared statistics counters for the dispatch slice.
  iree_task_dispatch_statistics_t* statistics;

  // TODO(benvanik): cpuid uarch.
  // TODO(benvanik): per-tile coroutine storage.
} iree_task_tile_context_t;

typedef struct iree_task_dispatch_s iree_task_dispatch_t;

// Shared state for all shards processing a dispatch.
typedef iree_alignas(iree_max_align_t) struct {
  // Direct reference to the parent dispatch that all shards are processing.
  iree_task_dispatch_t* dispatch_task;

  // The tail tile index; the next reservation will start from here.
  iree_atomic_int32_t tile_index;

  // The total number of tiles in the dispatch; bounds tile_index.
  uint32_t tile_count;

  // Maximum number of tiles to fetch per tile reservation from the grid.
  // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and chosen
  // as a reasonable value based on the tile and shard counts.
  uint32_t tiles_per_reservation;

  // Total workgroup count for the task. Can be used in conjunction with the
  // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
  uint32_t workgroup_count[3];

  // Incoherent memory shared across all invocations of the task.
  // Aligned to at least the natural pointer size of the machine. Functions must
  // use atomic operations to ensure proper memory ordering.
  iree_byte_span_t shared_memory;
} iree_task_dispatch_shard_state_t;

//==============================================================================
// Dispatch function closures
//==============================================================================

typedef iree_status_t(IREE_API_PTR* iree_task_dispatch_closure_fn_t)(
    uintptr_t user_context, const iree_task_tile_context_t* tile_context,
    iree_task_submission_t* pending_submission);

// A function closure representing the function to call and its arguments.
typedef struct {
  // Function called per tile invocation.
  iree_task_dispatch_closure_fn_t fn;

  // User-defined argument passed to the function during each invocation.
  // An opaque pointer-sized value that may point to a user data structure or
  // contain an embedded value. No lifetime management is performed by the task
  // system; users must ensure the referenced memory remains live until after
  // the task has completed.
  uintptr_t user_context;
} iree_task_dispatch_closure_t;

// Binds a function pointer and the arguments it should be called with.
// If the arguments represent pointers they must remain live until the task
// has completed execution.
static inline iree_task_dispatch_closure_t iree_task_make_dispatch_closure(
    iree_task_dispatch_closure_fn_t fn, uintptr_t user_context) {
  iree_task_dispatch_closure_t closure = {fn, user_context};
  return closure;
}
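
// Example tile function (a sketch; my_tile_fn and the index math are
// illustrative only but match the iree_task_dispatch_closure_fn_t signature):
//
//  static iree_status_t my_tile_fn(
//      uintptr_t user_context, const iree_task_tile_context_t* tile_context,
//      iree_task_submission_t* pending_submission) {
//    const uint32_t* id = tile_context->workgroup_xyz;
//    const uint32_t* count = tile_context->workgroup_count;
//    // Linearize the 3D workgroup ID (x is the fastest-varying dimension).
//    uint32_t linear_id = (id[2] * count[1] + id[1]) * count[0] + id[0];
//    // ... process the tile identified by linear_id ...
//    return iree_ok_status();
//  }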

//==============================================================================
// IREE_TASK_TYPE_DISPATCH
//==============================================================================

// An execution request across a tiled grid.
// Dispatches are fork points where zero or more dispatch slice tasks are
// spawned and processed prior to joining again on the dispatch completion task.
//
// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
// The count may either be embedded directly into the dispatch or provided as a
// pointer to the workgroup_count[3] that will be read immediately prior to
// forking. If any dimension of the workgroup count is zero then the dispatch is
// skipped and the completion task will be readied immediately.
//
// Example:
//  dispatch([5, 1, 1])
//    forked into slices based on affinity/scheduling parameters:
//    -> dispatch_slice([0-1, 1, 1])
//    -> dispatch_slice([2-3, 1, 1])
//    -> dispatch_slice([4, 1, 1])
//  completion_task runs after all slices complete
typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_s {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // Function closure to call per tile.
  iree_task_dispatch_closure_t closure;

  // Workgroup size for each invocation. Passed on to tiles without
  // modification and not used for scheduling.
  uint32_t workgroup_size[3];

  // 3D workgroup count used to tile the dispatch.
  // [1,1,1] specifies a single invocation of the function. A value of 0 in
  // any dimension will skip execution of the function.
  union {
    // Embedded immutable 3D workgroup count value.
    uint32_t value[3];
    // Pointer to the uint32_t[3] containing the 3D workgroup count.
    // Sampled immediately prior to execution.
    const uint32_t* ptr;
  } workgroup_count;

  // Optional transient shared memory size to allocate and pass into the
  // iree_task_tile_context_t::shared_memory of each invocation of the task
  // closure.
  iree_host_size_t shared_memory_size;

  // Statistics storage used for aggregating counters across all slices.
  iree_task_dispatch_statistics_t statistics;

  // Shared state across all slices/shards/etc.
  // Stored once per dispatch and then referenced by all subtasks.
  union {
    iree_task_dispatch_shard_state_t shard_state;
  } shared;
} iree_task_dispatch_t;

void iree_task_dispatch_initialize(iree_task_scope_t* scope,
                                   iree_task_dispatch_closure_t closure,
                                   const uint32_t workgroup_size[3],
                                   const uint32_t workgroup_count[3],
                                   iree_task_dispatch_t* out_task);

void iree_task_dispatch_initialize_indirect(
    iree_task_scope_t* scope, iree_task_dispatch_closure_t closure,
    const uint32_t workgroup_size[3], const uint32_t* workgroup_count_ptr,
    iree_task_dispatch_t* out_task);
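
// Example usage (a sketch; my_tile_fn, |user_context|, and |next_task| are
// illustrative and |scope| is a valid iree_task_scope_t*):
//
//  const uint32_t workgroup_size[3] = {1, 1, 1};
//  const uint32_t workgroup_count[3] = {8, 4, 1};  // 32 tiles total.
//  iree_task_dispatch_t dispatch_task;
//  iree_task_dispatch_initialize(
//      scope, iree_task_make_dispatch_closure(my_tile_fn, user_context),
//      workgroup_size, workgroup_count, &dispatch_task);
//  // Ready a follow-up task only once all tiles have retired:
//  iree_task_set_completion_task(&dispatch_task.header, next_task);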

//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SLICE
//==============================================================================

// TODO(benvanik): per-region dependencies (allow slices to execute directly
// across dispatches).

// A slice of tiles within a larger dispatch grid.
// These tasks are designed to be synthesized by the task system when processing
// a dispatch task. Based on the workgroup count, affinity settings, and
// available executor threads, zero or more slices are enqueued, executed, and
// retired as part of the complete dispatch task. The dispatch is only
// considered completed and subsequent tasks readied once all slices are
// complete.
//
// Slices aggregate statistics from all tiles within them and then upon
// completion merge those into the shared dispatch statistics. As slices may
// suspend and resume several times, the dispatch-level statistics should only
// be read once all slices have completed fully.
//
// In general slices represent a contiguous range of tiles along the most
// rapidly changing dimension (x, then y, then z). This ensures that we at least
// give the opportunity for cache locality to the tiles as they are processed.
// If work stealing is enabled then slices may shed their trailing tiles to
// other threads that have completed all of their work (at a cost of power vs.
// potential latency savings).
typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
  // This removes the need for touching the dispatch struct when beginning a
  // tile which would likely be a cache miss as we fan out to other cores.

  // Function closure to call per tile (same as the closure in the dispatch).
  iree_task_dispatch_closure_t closure;

  // Base workgroup ID for the slice range.
  uint32_t workgroup_base[3];
  // Total count of tiles within the slice range.
  uint32_t workgroup_range[3];

  // Workgroup size for each invocation.
  uint32_t workgroup_size[3];
  // Total workgroup count for the task. Can be used in conjunction with the
  // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
  uint32_t workgroup_count[3];

  // Incoherent memory shared across all invocations of the task.
  // Aligned to at least the natural pointer size of the machine. Functions must
  // use atomic operations to ensure proper memory ordering.
  iree_byte_span_t shared_memory;

  // Shared statistics counters for the entire dispatch. References the storage
  // held in the parent iree_task_dispatch_t.
  iree_task_dispatch_statistics_t* dispatch_statistics;
  // Statistics just for this single slice. The statistics will be added to the
  // dispatch_statistics after the slice completes to prevent excessive
  // contention on the shared dispatch statistics across multiple threads.
  iree_task_dispatch_statistics_t slice_statistics;

  // Per-tile coroutine storage for all tiles in the range, initialized as each
  // tile begins execution.
  // TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
} iree_task_dispatch_slice_t;

// TODO(benvanik): document initialize() for slice pre-planning/embedding.
// This would be useful to reduce latency when the number of slices is small
// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
// implement per-region dependencies as direct slice->slice deps vs. fork/join
// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
                                         const uint32_t workgroup_base[3],
                                         const uint32_t workgroup_range[3],
                                         const uint32_t workgroup_count[3],
                                         iree_task_dispatch_slice_t* out_task);

//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SHARD
//==============================================================================

typedef iree_alignas(iree_max_align_t) struct {
  // Task header: implementation detail, do not use.
  iree_task_t header;

  // Active dispatch progress shared across all shards.
  // Each shard will read/modify/write this and contention is likely.
  iree_task_dispatch_shard_state_t* shared_state;
} iree_task_dispatch_shard_t;

void iree_task_dispatch_shard_initialize(
    iree_task_dispatch_t* dispatch_task,
    iree_task_dispatch_shard_state_t* shared_state,
    iree_task_dispatch_shard_t* out_task);

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus

#endif  // IREE_TASK_TASK_H_