// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef IREE_TASK_TASK_H_
#define IREE_TASK_TASK_H_
#include "iree/base/api.h"
#include "iree/base/atomic_slist.h"
#include "iree/base/atomics.h"
#include "iree/base/synchronization.h"
#include "iree/base/wait_handle.h"
#include "iree/task/affinity_set.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef struct iree_task_list_s iree_task_list_t;
typedef struct iree_task_pool_s iree_task_pool_t;
typedef struct iree_task_scope_s iree_task_scope_t;
typedef struct iree_task_submission_s iree_task_submission_t;
//==============================================================================
// Task header for internal tracking
//==============================================================================
// Specifies the type of a task and how executors handle it.
enum iree_task_type_e {
// Task is a no-op (performs no work) and exists for flexibility.
IREE_TASK_TYPE_NOP = 0u,
// Task will synchronously call a function before continuing.
IREE_TASK_TYPE_CALL = 1u,
// Task exists only as a barrier to join/fork tasks and has no executable
// payload.
IREE_TASK_TYPE_BARRIER = 2u,
// Task is a fence indicating that a certain point in the task graph has been
// reached. All tasks prior to this fence (by way of happens-before
// dependencies) are guaranteed to have retired.
IREE_TASK_TYPE_FENCE = 3u,
// Task is a wait on an external wait handle (fd, HANDLE, etc).
// Executors will wait on the handle until it is signaled and meets the
// specified condition prior to readying the dependent tasks.
IREE_TASK_TYPE_WAIT = 4u,
// Task is a 3D grid dispatch of zero or more tiles.
// When ready, dispatches are issued either by splitting them into zero or
// more slices (each with one or more tiles, based on the workgroup count) or
// by creating one shard per worker that should process the dispatch.
//
// If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
// workgroup count from a buffer immediately prior to fan-out instead of using
// the values embedded in the task structure.
//
// After a dispatch has been issued, the IREE_TASK_FLAG_DISPATCH_RETIRE flag
// is set to indicate that the next time the dispatch becomes ready it will be
// because all slices have completed.
IREE_TASK_TYPE_DISPATCH = 5u,
// Task is a slice of a larger contiguous dispatch tile range. The full
// dispatch will be sliced into zero or more slices and each slice will be
// posted to a particular worker for execution. If work progresses unevenly
// then entire slices will be stolen across workers to balance out the timing.
// Slices retire once they have completed the tiles assigned to them.
IREE_TASK_TYPE_DISPATCH_SLICE = 6u,
// Task is one of potentially many shards processing a larger dispatch grid.
// Each shard may have a preference as to which parts of the grid it will focus
// on but is able to otherwise steal any available region directly from the
// shared dispatch coordination state. Shards retire once there are no more
// tiles remaining in the dispatch grid.
IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
};
typedef uint8_t iree_task_type_t;
enum iree_task_flags_e {
// The wait handle the task is specified to wait on has resolved and the task
// can now be considered complete.
IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,
// The workgroup count for the dispatch is provided by way of a pointer to a
// list of 3 uint32_t values that will be sampled immediately prior to
// issuing of the dispatch. The contents of the pointer can be safely modified
// up until the last dependency has completed and the dispatch is about to be
// issued.
IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,
// The dispatch should be sliced across workers via the low-contention
// IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
// to the time when the grid is sliced, saving that cost when the grid is
// executed.
IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,
// The dispatch has been issued and the task is waiting for one or more
// slices to complete. After they complete the dispatch will be readied and
// can be retired.
//
// Though normally added by the executor after issuing a dispatch, users can
// also set this flag to indicate that all dispatch slices for a particular
// dispatch have been statically scheduled. Executors will then skip issuing
// the dispatch and instead wait until all slices complete, providing
// IREE_TASK_TYPE_BARRIER behavior without an additional task (dispatches are
// still required to store information for slices).
IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
};
typedef uint16_t iree_task_flags_t;
typedef struct iree_task_s iree_task_t;
// A function called to cleanup tasks.
// The provided |status| is unowned and must be cloned if used beyond the scope
// of the cleanup function (such as when stored for later usage).
typedef void(IREE_API_PTR* iree_task_cleanup_fn_t)(iree_task_t* task,
iree_status_t status);
// A task within the task system that runs on an executor.
// Tasks have an iree_task_type_t that defines which parameters are valid and
// how the executor is to treat the task. Dependency edges can be defined that
// determine the execution order of tasks within the executors.
struct iree_alignas(iree_max_align_t) iree_task_s {
// Intrusive pointer used to store tasks within iree_task_list_t and
// iree_atomic_task_list_t singly-linked lists. This must come first in the
// structure so that it is at the appropriate alignment.
iree_task_t* next_task;
// The scope this task is attributed to. Errors with the task will be
// propagated to the scope and errors in the scope will cause pending tasks to
// be skipped.
iree_task_scope_t* scope;
// Optional function to call to cleanup the task on completion.
// Will be called after the task has retired or if the task fails to issue
// (dependency failed, etc).
iree_task_cleanup_fn_t cleanup_fn;
// Optional task that will be notified when the task completes.
// The task will have its pending_dependency_count decremented and will be
// readied for execution when the count reaches 0.
iree_task_t* completion_task;
// Specifies which workers will be used to execute this task.
// Forked tasks will inherit their parent task affinity (possibly with some
// task-dependent rules) to partition workloads across workers with knowledge
// of the specific work being performed. For example, some dispatches can be
// limited to run on certain microarchitectures that workers have affinity
// with at the OS scheduler level (such as little.BIG topologies).
iree_task_affinity_set_t affinity_set;
// Total number of dependencies still outstanding. Decremented each time a
// dependency completes. The task is considered ready to execute when this
// value reaches 0.
iree_atomic_int32_t pending_dependency_count;
// Optional pool the task should be returned to after it has resolved. If the
// task was allocated as part of a larger data structure (embedded within
// an arena for example) then this can be NULL to prevent the task system
// from interfering.
iree_task_pool_t* pool;
// Specifies the type of the task and how the executor handles it.
iree_task_type_t type;
// Task-specific flag bits.
iree_task_flags_t flags;
};
static_assert(offsetof(iree_task_t, next_task) == 0,
"next_task intrusive pointer must be at offset 0");
// Initializes a task header with the given type.
// Must be called on all tasks to ensure proper dependency tracking and list
// state prior to enqueuing. Only the task header structure is initialized and
// any additional data as part of the wrapping task type must be initialized by
// the caller.
void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
iree_task_t* out_task);
// Sets the optional function called when the task completes (whether successful
// or not).
void iree_task_set_cleanup_fn(iree_task_t* task,
iree_task_cleanup_fn_t cleanup_fn);
// Sets up a dependency edge from |task| to |completion_task| such that when
// |task| completes |completion_task| will be notified and have its
// pending_dependency_count decremented.
void iree_task_set_completion_task(iree_task_t* task,
iree_task_t* completion_task);
// Returns true if the |task| is ready to execute immediately.
// Though this is safe to call from any thread, the test may have false
// negatives (ready tasks are not reported as ready) due to cross-thread
// synchronization latency. Note that tasks may yield themselves during
// execution and switch from ready to waiting (such as when an indirect
// dispatch needs to wait for all tiles to complete).
bool iree_task_is_ready(iree_task_t* task);
// Discards the task and any dependent tasks.
// Any dependent tasks that need to be discarded will be added to
// |discard_worklist| for the caller to continue discarding.
void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist);
//==============================================================================
// IREE_TASK_TYPE_NOP
//==============================================================================
// Task is a no-op (performs no work) and exists for flexibility.
// NOP tasks can be used to link together task lists from multiple threads
// where it may otherwise not be ideal to have heavy-weight concurrency
// structures. NOP tasks can also be useful for neutering another task type
// after it has already been recorded into a list such as when cancellations
// occur.
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
} iree_task_nop_t;
void iree_task_nop_initialize(iree_task_scope_t* scope,
iree_task_nop_t* out_task);
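// Example (illustrative sketch; |scope| is assumed to be an initialized
// iree_task_scope_t): two NOP tasks joined with the base one-to-one dependency
// edge so that |task_b| only readies once |task_a| retires:
//
//  iree_task_nop_t task_a, task_b;
//  iree_task_nop_initialize(scope, &task_a);
//  iree_task_nop_initialize(scope, &task_b);
//  // Bumps task_b's pending_dependency_count; it returns to 0 (ready)
//  // when task_a completes.
//  iree_task_set_completion_task(&task_a.header, &task_b.header);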
//==============================================================================
// IREE_TASK_TYPE_CALL
//==============================================================================
typedef iree_status_t(IREE_API_PTR* iree_task_call_closure_fn_t)(
uintptr_t user_context, iree_task_t* task,
iree_task_submission_t* pending_submission);
// A function closure representing the function to call and its arguments.
typedef struct {
// Function called when the call task executes.
iree_task_call_closure_fn_t fn;
// User-defined argument passed to task functions during invocation.
// Opaque pointer-sized values that could point to user data structures or
// contain embedded values. No lifetime management is performed by the task
// system and it is required that users ensure that the memory referenced is
// live until after the task has completed.
uintptr_t user_context;
// TODO(benvanik): cleanup function? right now assume arg is never freed.
} iree_task_call_closure_t;
// Binds a function pointer and the arguments it should be called with.
// If the arguments represent pointers they must remain live until the task
// has completed execution.
static inline iree_task_call_closure_t iree_task_make_call_closure(
iree_task_call_closure_fn_t fn, uintptr_t user_context) {
iree_task_call_closure_t closure = {fn, user_context};
return closure;
}
// A task that will synchronously call a function from the executor and wait
// for it to complete before continuing.
//
// Memory referenced by closure arguments must be kept valid until the function
// executes (in general with the same lifetime as the task itself).
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// Function closure to call when the task is executed.
iree_task_call_closure_t closure;
} iree_task_call_t;
void iree_task_call_initialize(iree_task_scope_t* scope,
iree_task_call_closure_t closure,
iree_task_call_t* out_task);
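// Example (illustrative sketch): binding a user function and context to a call
// task. |my_call_fn| and |counter| are hypothetical names for illustration:
//
//  static iree_status_t my_call_fn(uintptr_t user_context, iree_task_t* task,
//                                  iree_task_submission_t* pending_submission) {
//    int* counter = (int*)user_context;
//    ++(*counter);
//    return iree_ok_status();
//  }
//
//  static int counter = 0;
//  iree_task_call_t call;
//  iree_task_call_initialize(
//      scope, iree_task_make_call_closure(my_call_fn, (uintptr_t)&counter),
//      &call);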
//==============================================================================
// IREE_TASK_TYPE_BARRIER
//==============================================================================
// A join point for fork/join-style scheduling.
// References a set of dependent tasks that will be notified and possibly
// readied when the barrier is reached.
//
// This allows for modeling one-to-many and many-to-many relationships. The
// base task dependency system only models one-to-one relationships and should
// be used when possible to avoid the additional overhead of a barrier task,
// both in memory and in task indirection/queuing.
//
// Example:
// * [A] -> Barrier -> [C, D]
// - A executes
// - Barrier is processed after A completes
// - C and D execute concurrently (in any order)
//
// * [A, B] -> Barrier -> [C, D]
// - A and B execute concurrently (in any order)
// - Barrier is processed after both A and B complete
// - C and D execute concurrently
//
// * [A] -> Barrier -> [B]
// - Don't do this; use the base task dependency instead. It'll work, but
// it's much better to avoid the additional barrier indirection when
// possible.
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// Number of valid tasks in the dependent_tasks list.
iree_host_size_t dependent_task_count;
// [0-dependent_task_count] tasks that will be notified when the barrier is
// reached. Each task will have its pending_dependency_count decremented and,
// when the count reaches 0, will be added to the ready list.
iree_task_t* const* dependent_tasks;
} iree_task_barrier_t;
void iree_task_barrier_initialize(iree_task_scope_t* scope,
iree_host_size_t dependent_task_count,
iree_task_t* const* dependent_tasks,
iree_task_barrier_t* out_task);
void iree_task_barrier_initialize_empty(iree_task_scope_t* scope,
iree_task_barrier_t* out_task);
void iree_task_barrier_set_dependent_tasks(
iree_task_barrier_t* task, iree_host_size_t dependent_task_count,
iree_task_t* const* dependent_tasks);
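// Example (illustrative sketch): the [A, B] -> Barrier -> [C, D] graph from
// the comment above using NOP tasks as stand-ins for real work. The
// |dependent_tasks| list must remain live until the barrier retires:
//
//  iree_task_nop_t a, b, c, d;  // each set up with iree_task_nop_initialize
//  iree_task_t* dependent_tasks[2] = {&c.header, &d.header};
//  iree_task_barrier_t barrier;
//  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(dependent_tasks),
//                               dependent_tasks, &barrier);
//  // A and B both notify the barrier when they complete; once both have
//  // retired the barrier readies C and D.
//  iree_task_set_completion_task(&a.header, &barrier.header);
//  iree_task_set_completion_task(&b.header, &barrier.header);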
//==============================================================================
// IREE_TASK_TYPE_FENCE
//==============================================================================
// A fence indicating that a certain point in the task graph has been reached.
// All tasks prior to this fence (by way of happens-before dependencies) are
// guaranteed to have retired.
//
// When all of the dependencies of a fence have retired, the fence notifies
// the parent scope of the task by decrementing the pending_submissions count
// and publishing an idle_notification if it was the last in-flight submission.
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// TODO(benvanik): user-defined fence data for semaphore signaling. Optional
// wait_handle to signal?
} iree_task_fence_t;
void iree_task_fence_initialize(iree_task_scope_t* scope,
iree_task_fence_t* out_task);
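// Example (illustrative sketch; |call| is a task like the one shown in the
// IREE_TASK_TYPE_CALL section above): terminating a task graph with a fence so
// the scope can observe when all work has retired:
//
//  iree_task_fence_t fence;
//  iree_task_fence_initialize(scope, &fence);
//  // The fence retires (and the scope is notified) only after |call| retires.
//  iree_task_set_completion_task(&call.header, &fence.header);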
//==============================================================================
// IREE_TASK_TYPE_WAIT
//==============================================================================
typedef struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// The external wait handle that the task is waiting on.
// TODO(benvanik): multiple wait handles.
iree_wait_handle_t wait_handle;
// TODO(benvanik): deadline_ns.
// TODO(benvanik): condition (possibly a closure to evaluate) ala condvar.
} iree_task_wait_t;
void iree_task_wait_initialize(iree_task_scope_t* scope,
iree_wait_handle_t wait_handle,
iree_task_wait_t* out_task);
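// Example (illustrative sketch; assumes an iree_event_t from
// iree/base/wait_handle.h can be passed where a wait handle is expected):
// gating downstream tasks on an externally-signaled event:
//
//  iree_event_t event;
//  iree_status_t status = iree_event_initialize(/*initial_state=*/false,
//                                               &event);
//  iree_task_wait_t wait_task;
//  iree_task_wait_initialize(scope, event, &wait_task);
//  // Tasks chained after |wait_task| via iree_task_set_completion_task only
//  // ready once the event is signaled (e.g. iree_event_set(&event)).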
//==============================================================================
// IREE_TASK_TYPE_DISPATCH_* structures
//==============================================================================
// Statistics tracked across an entire dispatch operation.
// Each tile contributes to these statistics as it executes, providing an
// aggregate set of statistics that can be reported to tracing/user queries.
//
// We want to keep this structure relatively compact as it does add overhead.
// If statistics are used purely for interactive tracing then they can be
// piped directly to the tracing tool using IREE_TRACE_* macros. If the
// statistics are programmatically queried for benchmarks or reporting then
// they belong here where we can efficiently move them around.
//
// If we find ourselves with a lot of hardware-specific counters (vs more
// generic ones like 'l2 cache misses' or 'ipc') then we can sprinkle in some
// #ifdefs.
typedef struct {
// TODO(benvanik): statistics counters.
iree_atomic_int32_t reserved;
} iree_task_dispatch_statistics_t;
// Merges statistics from |source| to |target| atomically per-field.
// As each field is updated independently and in a relaxed memory order it's
// possible for statistics consumers to see a tear.
void iree_task_dispatch_statistics_merge(
const iree_task_dispatch_statistics_t* source,
iree_task_dispatch_statistics_t* target);
typedef struct {
// TODO(benvanik): coroutine storage.
// Ideally we'll be able to have a fixed coroutine storage size per dispatch
// (via @llvm.coro.size) such that we can preallocate all of the storage for
// a dispatch in one shot. If we need to do dynamic allocation we will need a
// ringbuffer or other kind of pool to allocate from on-demand.
uint32_t reserved;
} iree_task_tile_storage_t;
// Per-tile context provided to each dispatch function invocation in the grid.
// This information is unique to the tile being dispatched and may contain
// specific state about the calling thread/fiber/etc.
//
// If tile execution is suspended by hitting a coroutine suspend point then the
// coroutine state will be stored within the tile context until the tile is
// resumed.
typedef iree_alignas(iree_max_align_t) struct {
// Workgroup ID for the current invocation.
uint32_t workgroup_xyz[3];
// Workgroup size for each invocation.
uint32_t workgroup_size[3];
// Total workgroup count for the task. Can be used in conjunction with the
// per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
uint32_t workgroup_count[3];
// TODO(benvanik): workgroup index to amortize calculating linear offsets.
// (like gl_GlobalInvocationID)
// Incoherent memory shared across all invocations of the task.
// Aligned to at least the natural pointer size of the machine. Functions must
// use atomic operations to ensure proper memory ordering.
iree_byte_span_t shared_memory;
// Shared statistics counters for the dispatch slice.
iree_task_dispatch_statistics_t* statistics;
// TODO(benvanik): cpuid uarch.
// TODO(benvanik): per-tile coroutine storage.
} iree_task_tile_context_t;
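// Example (illustrative sketch): a dispatch closure using the tile context to
// derive a linear workgroup index; |my_tile_fn| and the x-major ordering are
// illustrative assumptions:
//
//  static iree_status_t my_tile_fn(
//      uintptr_t user_context, const iree_task_tile_context_t* tile_context,
//      iree_task_submission_t* pending_submission) {
//    const uint32_t* xyz = tile_context->workgroup_xyz;
//    const uint32_t* count = tile_context->workgroup_count;
//    // x is the most rapidly changing dimension, then y, then z.
//    uint32_t linear_index = (xyz[2] * count[1] + xyz[1]) * count[0] + xyz[0];
//    float* values = (float*)user_context;
//    values[linear_index] *= 2.0f;
//    return iree_ok_status();
//  }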
typedef struct iree_task_dispatch_s iree_task_dispatch_t;
// Shared state for all shards processing a dispatch.
typedef iree_alignas(iree_max_align_t) struct {
// Direct reference to the parent dispatch that all shards are processing.
iree_task_dispatch_t* dispatch_task;
// The tail tile index; the next reservation will start from here.
iree_atomic_int32_t tile_index;
// The total number of tiles in the dispatch; the upper bound of tile_index.
uint32_t tile_count;
// Maximum number of tiles to fetch per tile reservation from the grid.
// Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
// reasonable number chosen based on the tile and shard counts.
uint32_t tiles_per_reservation;
// Total workgroup count for the task. Can be used in conjunction with the
// per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
uint32_t workgroup_count[3];
// Incoherent memory shared across all invocations of the task.
// Aligned to at least the natural pointer size of the machine. Functions must
// use atomic operations to ensure proper memory ordering.
iree_byte_span_t shared_memory;
} iree_task_dispatch_shard_state_t;
//==============================================================================
// Dispatch function closures
//==============================================================================
typedef iree_status_t(IREE_API_PTR* iree_task_dispatch_closure_fn_t)(
uintptr_t user_context, const iree_task_tile_context_t* tile_context,
iree_task_submission_t* pending_submission);
// A function closure representing the function to call and its arguments.
typedef struct {
// Function called per tile invocation.
iree_task_dispatch_closure_fn_t fn;
// User-defined argument passed to task functions during invocation.
// Opaque pointer-sized values that could point to user data structures or
// contain embedded values. No lifetime management is performed by the task
// system and it is required that users ensure that the memory referenced is
// live until after the task has completed.
uintptr_t user_context;
} iree_task_dispatch_closure_t;
// Binds a function pointer and the arguments it should be called with.
// If the arguments represent pointers they must remain live until the task
// has completed execution.
static inline iree_task_dispatch_closure_t iree_task_make_dispatch_closure(
iree_task_dispatch_closure_fn_t fn, uintptr_t user_context) {
iree_task_dispatch_closure_t closure = {fn, user_context};
return closure;
}
//==============================================================================
// IREE_TASK_TYPE_DISPATCH
//==============================================================================
// An execution request across a tiled grid.
// Dispatches are fork points where zero or more dispatch slice tasks are
// spawned and processed prior to joining again on the dispatch completion task.
//
// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
// The count may either be embedded directly into the dispatch or provided as a
// pointer to the workgroup_count[3] that will be read immediately prior to
// forking. If any dimension of the workgroup count is zero then the dispatch is
// skipped and the completion task will be readied immediately.
//
// Example:
// dispatch([6, 1, 1])
// forked into slices based on affinity/scheduling parameters:
// -> dispatch_slice([0-1, 1, 1])
// -> dispatch_slice([2-3, 1, 1])
// -> dispatch_slice([4-5, 1, 1])
// completion_task run after all slices complete
typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_s {
// Task header: implementation detail, do not use.
iree_task_t header;
// Function closure to call per tile.
iree_task_dispatch_closure_t closure;
// Workgroup size for each invocation. Passed on to tiles without
// modification and not used for scheduling.
uint32_t workgroup_size[3];
// 3D workgroup count used to tile the dispatch.
// [1,1,1] specifies single invocation of the function. A value of 0 in
// any dimension will skip execution of the function.
union {
// Embedded immutable 3D workgroup count value.
uint32_t value[3];
// Pointer to the uint32_t[3] containing the 3D workgroup count.
// Sampled immediately prior to execution.
const uint32_t* ptr;
} workgroup_count;
// Optional transient shared memory size to allocate and pass into the
// iree_task_tile_context_t::shared_memory of each invocation of the task
// closure.
iree_host_size_t shared_memory_size;
// Statistics storage used for aggregating counters across all slices.
iree_task_dispatch_statistics_t statistics;
// Shared state across all slices/shards/etc.
// Stored once per dispatch and then referenced by all subtasks.
union {
iree_task_dispatch_shard_state_t shard_state;
} shared;
} iree_task_dispatch_t;
void iree_task_dispatch_initialize(iree_task_scope_t* scope,
iree_task_dispatch_closure_t closure,
const uint32_t workgroup_size[3],
const uint32_t workgroup_count[3],
iree_task_dispatch_t* out_task);
void iree_task_dispatch_initialize_indirect(
iree_task_scope_t* scope, iree_task_dispatch_closure_t closure,
const uint32_t workgroup_size[3], const uint32_t* workgroup_count_ptr,
iree_task_dispatch_t* out_task);
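// Example (illustrative sketch; |my_tile_fn| is the hypothetical closure from
// the iree_task_tile_context_t section above): dispatching a 64x64x1 grid with
// one workgroup per element:
//
//  static float values[64 * 64];
//  const uint32_t workgroup_size[3] = {1, 1, 1};
//  const uint32_t workgroup_count[3] = {64, 64, 1};
//  iree_task_dispatch_t dispatch;
//  iree_task_dispatch_initialize(
//      scope, iree_task_make_dispatch_closure(my_tile_fn, (uintptr_t)values),
//      workgroup_size, workgroup_count, &dispatch);
//
// The indirect variant instead samples the count from memory immediately prior
// to fan-out, so it may be written up until the dispatch issues:
//
//  static uint32_t dynamic_count[3] = {64, 64, 1};
//  iree_task_dispatch_initialize_indirect(
//      scope, iree_task_make_dispatch_closure(my_tile_fn, (uintptr_t)values),
//      workgroup_size, dynamic_count, &dispatch);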
//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SLICE
//==============================================================================
// TODO(benvanik): per-region dependencies (allow slices to execute directly
// across dispatches).
// A slice of tiles within a larger dispatch grid.
// These tasks are designed to be synthesized by the task system when processing
// a dispatch task. Based on the workgroup count, affinity settings, and
// available executor threads, zero or more slices are enqueued, executed, and
// retired as part of the complete dispatch task. The dispatch is only
// considered completed and subsequent tasks readied once all slices are
// complete.
//
// Slices aggregate statistics from all tiles within them and then upon
// completion merge those into the shared dispatch statistics. As slices may
// suspend and resume several times the dispatch-level statistics should only be
// read once all slices have completed fully.
//
// In general, slices represent a contiguous range of tiles along the most
// rapidly changing dimension (x, then y, then z). This ensures that we at least
// give the opportunity for cache locality to the tiles as they are processed.
// If work stealing is enabled then slices may shed their trailing tiles to
// other threads that have completed all of their work (at a cost of power vs.
// potential latency savings).
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
// This removes the need to touch the dispatch struct when beginning a tile,
// which would likely incur a cache miss as we fan out to other cores.
// Function closure to call per tile (same as the closure in the dispatch).
iree_task_dispatch_closure_t closure;
// Base workgroup ID for the slice range.
uint32_t workgroup_base[3];
// Total count of tiles within the slice range.
uint32_t workgroup_range[3];
// Workgroup size for each invocation.
uint32_t workgroup_size[3];
// Total workgroup count for the task. Can be used in conjunction with the
// per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
uint32_t workgroup_count[3];
// Incoherent memory shared across all invocations of the task.
// Aligned to at least the natural pointer size of the machine. Functions must
// use atomic operations to ensure proper memory ordering.
iree_byte_span_t shared_memory;
// Shared statistics counters for the entire dispatch. References the storage
// held in the parent iree_task_dispatch_t.
iree_task_dispatch_statistics_t* dispatch_statistics;
// Statistics just for this single slice. The statistics will be added to the
// dispatch_statistics after the slice completes to prevent excessive
// contention on the shared dispatch statistics across multiple threads.
iree_task_dispatch_statistics_t slice_statistics;
// Per-tile coroutine storage for all tiles in the range, initialized as each
// tile begins execution.
// TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
} iree_task_dispatch_slice_t;
// TODO(benvanik): document initialize() for slice pre-planning/embedding.
// This would be useful to reduce latency when the number of slices is small
// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
// implement per-region dependencies as direct slice->slice deps vs. fork/join
// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
const uint32_t workgroup_base[3],
const uint32_t workgroup_range[3],
const uint32_t workgroup_count[3],
iree_task_dispatch_slice_t* out_task);
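// Example (speculative sketch; per the TODO above the static slicing flow is
// not fully documented here): pre-planning a slice covering the first two
// workgroups along x of a 4x1x1 dispatch, where |dispatch| is an initialized
// iree_task_dispatch_t:
//
//  iree_task_dispatch_slice_t slice;
//  const uint32_t workgroup_base[3] = {0, 0, 0};
//  const uint32_t workgroup_range[3] = {2, 1, 1};
//  const uint32_t workgroup_count[3] = {4, 1, 1};
//  iree_task_dispatch_slice_initialize(&dispatch, workgroup_base,
//                                      workgroup_range, workgroup_count,
//                                      &slice);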
//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SHARD
//==============================================================================
typedef iree_alignas(iree_max_align_t) struct {
// Task header: implementation detail, do not use.
iree_task_t header;
// Active dispatch progress shared across all shards.
// Each shard will read/modify/write this and there is likely to be
// contention.
iree_task_dispatch_shard_state_t* shared_state;
} iree_task_dispatch_shard_t;
void iree_task_dispatch_shard_initialize(
iree_task_dispatch_t* dispatch_task,
iree_task_dispatch_shard_state_t* shared_state,
iree_task_dispatch_shard_t* out_task);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_TASK_TASK_H_