// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_TASK_EXECUTOR_H_
#define IREE_TASK_EXECUTOR_H_
#include <stdint.h>
#include "iree/base/api.h"
#include "iree/base/internal/atomics.h"
#include "iree/base/internal/event_pool.h"
#include "iree/base/internal/wait_handle.h"
#include "iree/task/scope.h"
#include "iree/task/submission.h"
#include "iree/task/task.h"
#include "iree/task/topology.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
//==============================================================================
// IREE Task Executor
//==============================================================================
//
// Roughly models wavefront-style GPU dispatch. Users submit task DAGs with
// fine-grained dependency information for the executor to schedule across a set
// of workers. As tasks become ready to execute they are placed into per-worker
// FIFOs and workers run through them in a breadth-first fashion executing and
// resolving tasks and building up new waves of ready tasks. Workers will always
// make forward progress and only when they run out of work will they attempt to
// self-nominate to play the role of coordinator and schedule any newly-
// submitted or readied tasks. Only once all ready tasks have been retired and
// only waits on external resources remain does the task system suspend itself
// until more tasks are submitted or an external wait resolves.
//
// Our goal is to do the minimal amount of work to get the maximum amount of
// concurrency the user requests or allows (by way of their dependencies).
// Whether on a single core where you want to timeshare with an application or
// across hundreds the same architecture holds. Where there is inefficiency it's
// almost always surmountable with properly constructed tasks: choose the right
// granularity for dispatches, choose the right fan-out for tiles within those
// dispatches, choose the right places to insert barriers to force fan-in and
// reduce memory utilization, or the right places to batch barriers to require
// less synchronization with the work queue, etc. All of those choices are ones
// this system is designed to handle dynamically via the provided task graphs,
// which are themselves (in the IREE world) mapped 1:1 with the GPU-esque grid
// dispatch and command buffer model. It's a super-power if a human is authoring
// all that information but what makes it particularly powerful here is that we
// are authoring that in the compiler based on a tremendous amount of
// higher-level information we can derive from the whole program. Every bit of
// dynamism here is matched with the ability to tighten down the screws and gain
// back anything lost by way of compiler improvements while also being able to
// generalize out to far more complex systems (higher parallelism, higher and
// more efficient concurrency, etc).
//
// The design of this system allows for a spectrum of dynamic behavior based on
// desired usage scenarios:
// - variable number of persistent workers based on compute/memory topology
// - per-task scope and per-task worker affinity to control for:
// - power islands on multi-core systems with fine-grained power management
// - heterogeneous microarchitectures in big.LITTLE/etc compute complexes
// - task isolation between multiple active requests or users
// - latency prioritization by partitioning workloads by priority
// - scheduling overhead tradeoffs by varying:
// - coordination/flush frequency to reduce cross-thread communication
// - static insertion of dispatch shards to avoid dynamic fan-out
// - thread donation to avoid likely context switches upon submit+wait
// - multi-wait across all users by sharing a wait set
// - per-worker work-stealing specification of victim workers in the topology
// - limited work-stealing to prevent chained stealing/cascading theft
//
// Required reading:
// https://www.usenix.org/conference/osdi20/presentation/ma
// (closest equivalent to this scheduling model)
// https://www.cister-labs.pt/summer2017/w3/Parallelism%20-%20Dag%20Model.pdf
// (good overall, our worker local lists/mailboxes are work-stealing queues)
// http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
// (what we model with the coordinator)
// http://mcg.cs.tau.ac.il/papers/opodis2010-quasi.pdf
// (we exploit relaxed consistency for all our cross-thread queuing, see ^)
// https://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++.htm
// (moodycamel is the state of the art on scaling queues; read it all)
// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
// https://blog.molecular-matters.com/2015/09/08/job-system-2-0-lock-free-work-stealing-part-2-a-specialized-allocator/
// https://blog.molecular-matters.com/2015/09/25/job-system-2-0-lock-free-work-stealing-part-3-going-lock-free/
// https://blog.molecular-matters.com/2015/11/09/job-system-2-0-lock-free-work-stealing-part-4-parallel_for/
// https://blog.molecular-matters.com/2016/04/04/job-system-2-0-lock-free-work-stealing-part-5-dependencies/
// (fantastic 5 part blog series; very similar to this)
// http://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/jacm99.pdf
// (provably optimal dynamic nested parallelism in 1999; basically: GPUs)
// http://www.cs.cmu.edu/~blelloch/papers/locality2000.pdf
// (followup to jacm99; using locality now to guide work stealing)
// https://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/CGK07.pdf
// (worker affinity and task locality for constructive cache sharing)
//
//==============================================================================
// Life of an iree_task_t / high level algorithm
//==============================================================================
//
// 1. Users allocate (from iree_task_pool_t, slice from arenas, etc) and
// construct a DAG of iree_task_ts.
//
// a. Task dependency information is set up via completion_tasks for simple
// dependencies, implicit fan-out/fan-in (dispatches), or explicit fan-in
// (barriers).
//
// b. Tasks are pushed into iree_task_submission_t (LIFO, thread-local list).
// If the task has no unmet initial dependencies it is placed into the
// ready_list. If it is initially waiting on an external resource such as
// iree_wait_handle_t then it is placed into the waiting_list. (A rough
// usage sketch of steps 1-3 follows this walkthrough.)
//
// 2. iree_task_executor_submit (LIFO, atomic slist)
// Submissions have their task thread-local lists concatenated into a LIFO
// incoming_ready_slist or incoming_waiting_slist shared by the executor.
//
// 3. iree_task_executor_flush (or a worker puts on its coordinator hat 🎩)
//
// a. Tasks are flushed from the incoming_ready_slist into a coordinator-local
// FIFO task queue and incoming_waiting_slist is concatenated into the
// primary executor waitlist.
//
// b. iree_task_executor_poll_waiting_tasks: finds waiting tasks that are now
// ready and moves them into the coordinator-local FIFO task queue.
//
// c. iree_task_executor_schedule_ready_tasks: walks the FIFO task queue and
// builds an iree_task_post_batch_t containing the per-worker tasks
// in LIFO order.
//
// d. iree_task_post_batch_submit: per-worker tasks are pushed to their
// respective iree_task_worker_t mailbox_slist and the workers with new
// tasks are notified to wake up (if not already awake).
//
// 4. iree_task_worker_main_pump_once (LIFO mailbox -> FIFO thread-local list)
// When either woken or after completing all available thread-local work
// each worker will check its mailbox_slist to see if any tasks have been
// posted.
//
// a. Tasks are flushed from the LIFO mailbox into the local_task_queue FIFO
// for the particular worker.
//
// b. If the mailbox is empty the worker *may* attempt to steal work from
// another nearby worker in the topology.
//
// c. Any tasks in the local_task_queue are executed until empty.
// Tasks are retired and dependent tasks (via completion_task or barriers)
// are made ready and placed in the executor incoming_ready_slist or
// incoming_waiting_slist as with iree_task_executor_submit.
//
// d. If no more thread-local work is available and the mailbox_slist is
// empty the worker will self-nominate for coordination and attempt to don
// the coordinator hat with iree_task_executor_coordinate. If new work
// becomes available after coordination then step 4 repeats.
//
// e. If another worker (or iree_task_executor_flush) is already wearing the
// coordinator hat then the worker will go to sleep.
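//
// As a rough illustration of steps 1-3, a minimal sketch using the call-task,
// scope, and submission helpers declared in the headers included above
// (iree/task/task.h, iree/task/scope.h, iree/task/submission.h). Helper names
// and signatures here are approximate and may differ from the current API:
//
//   // Closure invoked on whichever worker the call task lands on
//   // (parameter list shown approximately; see iree_task_call_closure_fn_t).
//   static iree_status_t my_work(void* user_context, iree_task_t* task,
//                                iree_task_submission_t* pending_submission) {
//     // ... do the work; optionally enqueue follow-on tasks ...
//     return iree_ok_status();
//   }
//
//   // Step 1: allocate/construct the DAG (here a single call task).
//   iree_task_scope_t scope;
//   iree_task_scope_initialize(iree_make_cstring_view("demo"), &scope);
//   iree_task_call_t call;
//   iree_task_call_initialize(
//       &scope, iree_task_make_call_closure(my_work, /*user_context=*/0),
//       &call);
//
//   // Step 1b: stage the task in a thread-local submission.
//   iree_task_submission_t submission;
//   iree_task_submission_initialize(&submission);
//   iree_task_submission_enqueue(&submission, &call.header);
//
//   // Steps 2-3: hand the submission to the executor and kick off scheduling.
//   iree_task_executor_submit(executor, &submission);
//   iree_task_executor_flush(executor);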
//
//==============================================================================
// Scaling Down
//==============================================================================
//
// IREE is built at all levels - in both the compiler and the runtime - to
// scale to different needs. Everything that IREE imposes on runtime
// performance and binary size is a spectrum of choices that allows a user to
// pay only for what they use.
//
// If a deployment scenario does not need complex multithreading and
// out-of-order execution then this task system can be used in single-threaded
// mode to at least allow for offloading from the main application thread. In
// even more constrained scenarios (or embeddings within other systems that have
// thread pools of their own) it can be used in zero-threaded mode where only
// threads donated by the user perform work, when and where the user wants it
// to happen. It still gives the benefits of wave-style
// scheduling, multi-waiting, locality-aware work distribution, etc as well as
// giving us a single target interface from the compiler to communicate
// fine-grained dependency information to the runtime.
//
// If the cost of a few KB of data structures and some cheap uncontended atomic
// linked list concatenations is still scary (it shouldn't be for 95% of uses)
// then it's also possible to have a HAL driver that doesn't use this task
// system at all and instead executes the command buffers directly, just like
// our Vulkan/Metal/etc GPU backends do. Even though I don't recommend that
// (one wouldn't save as much as they think and would lose a lot instead) the
// layering holds and it can be useful if a sophisticated external task
// execution system (ala taskflow) is already present in an application.
//
// One assertion of IREE is that for models that take more than a few
// milliseconds to execute, asynchronous scheduling is almost always worth it
// even on single-core systems. The ability to cooperatively schedule model
// execution gives applications significant control over their total program
// scheduling behavior; just as on a Commodore 64 you'd interrupt work on vsync
// to begin scanning out pixels to the screen and then resume afterward, it's
// rare to see any system - even double-digit MHz microcontrollers - that
// doesn't benefit from the ability to cleanly suspend and resume execution.
//
// But even if *all* of that is too much, the compile-time representations in
// the HAL IR are designed to be lowered away: execution modeling does not need
// to bottom out on a hal.command_buffer.dispatch that maps 1:1 with the runtime
// iree_hal_command_buffer_dispatch call: dispatch can be lowered into LLVM
// IR calls and finally into native code to do precisely what you want. The HAL
// at runtime is a useful abstraction to allow for switching your target
// execution system (statically or dynamically across deployments) and to share
// the same execution system across multiple models that may be executing
// simultaneously but it is _not_ a requirement that the IREE HAL runtime
// implementation is used. It's called multi-level IR for a reason and the HAL
// IR is just one level that may have many more below it.
//
// So yeah: don't worry. It's almost certain that the thing making or breaking
// the performance of models over 1ms of execution time is not the HAL, and that
// in models at or above that scale the benefits we get from being able to
// holistically schedule the work far outstrip any specialization that can be
// done by hand. That's to say: only worry about this if your model is literally
// 4 floats coming from an IMU and a few hundred scalar instructions to predict
// whether the user is walking; that shouldn't be using the runtime HAL at
// all and likely doesn't benefit from using IREE at any scale - just go
// straight to LLVM IR from the source.
//
//==============================================================================
// Scaling Up
//==============================================================================
//
// The task system has an implicit limit of 64 workers. This intentional
// limitation simplifies several parts of the code while also preventing misuse:
// it rarely (if ever) makes sense to have more than 64 compute-dominated
// threads working on a single problem. Achieving high performance in such
// situations requires extremely careful control over the OS scheduler, memory
// bandwidth consumption, and synchronization. It's always possible to make the
// problem more compute-bound or very carefully try to fit in specific cache
// sizes to avoid more constrained bandwidth paths but it's a non-portable
// whack-a-mole style solution that is in conflict with a lot of what IREE seeks
// to do with respect to low-latency and multi-tenant workloads.
//
// If more than 64 unique L1/L2 caches (or realistically more than probably ~32)
// are available *and* all of them are attached to the same memory controllers
// (no NUMA involved) then the solution is straightforward: use multiple IREE
// task executors. Either within a process or in separate processes the
// granularity is coarse enough to not be a burden and changes the problem from
// needing 100% perfect work scaling of a single task to needing a naive
// distributed workload solution at the algorithm level.
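//
// For example, a rough sketch of the multiple-executor approach (assuming a
// topology helper along the lines of
// iree_task_topology_initialize_from_group_count from iree/task/topology.h;
// the exact helpers and how cores are assigned per node will differ):
//
//   // One executor per NUMA node (or per tenant); each owns its own workers.
//   iree_task_executor_t* executors[2] = {NULL, NULL};
//   for (int i = 0; i < 2; ++i) {
//     iree_task_topology_t topology;
//     // Hypothetical: populate |topology| with only node i's cores.
//     iree_task_topology_initialize_from_group_count(/*group_count=*/8,
//                                                    &topology);
//     IREE_RETURN_IF_ERROR(iree_task_executor_create(
//         IREE_TASK_SCHEDULING_MODE_RESERVED, &topology,
//         /*worker_local_memory_size=*/0, iree_allocator_system(),
//         &executors[i]));
//     iree_task_topology_deinitialize(&topology);
//   }
//   // Route each tenant's/model's submissions to its own executor.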
//
// Many useful effects also fall out of solving the work distribution problem.
// Even for single-tenant workloads being able to split work between two
// executors allows for natural mappings on NUMA systems or completely
// independent machines. When supporting multi-tenant workloads (even if the
// same program is acting as multiple tenants in a minibatched-style algorithm)
// isolation dramatically improves memory access patterns and reduces variance
// from potentially bad system behavior: there aren't many
// opportunities for contention in this system but one can guarantee zero
// contention by simply not sharing the resources!
// A bitfield specifying the scheduling mode used for configuring how (or if)
// work is balanced across queues.
enum iree_task_scheduling_mode_bits_t {
// TODO(benvanik): batch, round-robin, FCFS, SJF, etc.
// We can also allow for custom scheduling, though I'm skeptical of the value
// of that. We should look into what GPUs do in hardware for balancing things
// (if they do anything this sophisticated). The potential benefit here is that
// we can optimize for offline workloads by allowing each queue to be drained
// until blocking - hopefully optimizing cache coherency and reducing the
// total memory high-water mark - or optimize for latency across all queues by
// taking tasks from all queues equally. There are other more interesting
// scheduling strategies such as preferring the widest tasks available from
// any queue such that we are keeping as many workers active as possible to
// reach peak utilization or artificially limiting which tasks we allow
// through to keep certain CPU cores asleep unless absolutely required.
IREE_TASK_SCHEDULING_MODE_RESERVED = 0u,
// Creates all workers suspended and waits until work is first scheduled to
// them to resume. This trades off initial blocking startup time waking the
// threads for potential latency additions later on as threads take longer to
// wake on their first use.
//
// Prefer this setting in systems where startup time is the priority and work
// may not be scheduled for a while or scheduled unevenly to start; otherwise
// the executor creation will take longer and a thundering herd will occur
// forcing context switches even if no work is needed.
//
// Avoid in systems where the latency from initial submission to worker
// execution is critical: starting the workers eagerly instead ensures all
// worker threads are already waiting on their respective wake notifications
// when the first submission arrives. The kernel can then schedule all worker
// quantums much faster and in many cases all workers will begin processing
// simultaneously immediately after the submission is made.
IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP = 1u << 0,
// TODO(#4027): implement IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD.
// Creates a dedicated thread performing waits on root wait handles.
// On workloads with many short-duration waits this will reduce total latency
// as the waits are aggressively processed and dependent tasks are scheduled.
// It also keeps any wait-related syscalls off the worker threads that would
// otherwise need to perform the syscalls during coordination.
IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD = 1u << 1,
};
typedef uint32_t iree_task_scheduling_mode_t;
// Base task system executor interface.
typedef struct iree_task_executor_t iree_task_executor_t;
// Creates a task executor using the specified topology.
//
// |worker_local_memory_size| defines the bytes to be allocated and reserved for
// each worker to use for local memory operations. Will be rounded up to the
// next power of two. Dispatches performed will be able to request up to this
// amount of memory for their invocations and no more. May be 0 if no worker
// local memory is required.
//
// |topology| is only used during creation and need not live beyond this call.
// |out_executor| must be released by the caller.
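//
// For example (a sketch; the topology helpers live in iree/task/topology.h and
// their exact names/arguments may differ):
//
//   iree_task_topology_t topology;
//   iree_task_topology_initialize_from_group_count(/*group_count=*/4,
//                                                  &topology);
//   iree_task_executor_t* executor = NULL;
//   IREE_RETURN_IF_ERROR(iree_task_executor_create(
//       IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP, &topology,
//       /*worker_local_memory_size=*/0, iree_allocator_system(), &executor));
//   iree_task_topology_deinitialize(&topology);  // only needed for creation
//   // ... submit work ...
//   iree_task_executor_release(executor);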
iree_status_t iree_task_executor_create(
iree_task_scheduling_mode_t scheduling_mode,
const iree_task_topology_t* topology,
iree_host_size_t worker_local_memory_size, iree_allocator_t allocator,
iree_task_executor_t** out_executor);
// Retains the given |executor| for the caller.
void iree_task_executor_retain(iree_task_executor_t* executor);
// Releases the given |executor| from the caller.
void iree_task_executor_release(iree_task_executor_t* executor);
// Trims pools and caches used by the executor and its workers.
void iree_task_executor_trim(iree_task_executor_t* executor);
// Returns an iree_event_t pool managed by the executor.
// Users of the task system should acquire their transient events from this.
// Long-lived events should be allocated on their own in order to avoid
// exhausting the pool and harming high-frequency event acquisition.
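//
// For example (a sketch; iree_event_pool_acquire/iree_event_pool_release are
// declared in iree/base/internal/event_pool.h and their exact signatures may
// differ):
//
//   iree_event_t event;
//   IREE_RETURN_IF_ERROR(iree_event_pool_acquire(
//       iree_task_executor_event_pool(executor), 1, &event));
//   // ... use |event| as a transient wait handle for a submission ...
//   iree_event_pool_release(iree_task_executor_event_pool(executor), 1, &event);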
iree_event_pool_t* iree_task_executor_event_pool(
iree_task_executor_t* executor);
// Acquires a fence for the given |scope| from the executor fence pool.
iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
iree_task_scope_t* scope,
iree_task_fence_t** out_fence);
// TODO(benvanik): scheduling mode mutation, compute quota control, etc.
// Submits a batch of tasks for execution.
// The submission represents a DAG of tasks all reachable from the initial
// submission lists.
//
// Ownership of the tasks remains with the caller for the lifetime of the
// submission unless tasks have a custom pool specified that they can be
// returned to.
//
// Safe to call from any thread. Wait-free but may block for a small duration
// during initial scheduling of the submitted tasks.
//
// NOTE: it's possible for all work in the submission to complete prior to this
// function returning.
void iree_task_executor_submit(iree_task_executor_t* executor,
iree_task_submission_t* submission);
// Flushes any pending task batches for execution.
//
// Safe to call from any thread. Wait-free but may block for a small duration
// during initial scheduling of the submitted tasks.
//
// NOTE: due to races it's possible for new work to arrive from other threads
// after the flush has occurred but prior to this call returning.
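//
// For example (a sketch), several submissions can be staged before a single
// flush triggers one coordination pass for all of them:
//
//   iree_task_executor_submit(executor, &submission_a);
//   iree_task_executor_submit(executor, &submission_b);
//   iree_task_executor_flush(executor);  // schedules both staged submissions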
void iree_task_executor_flush(iree_task_executor_t* executor);
// Donates the calling thread to the executor until either |wait_handle|
// resolves or |deadline_ns| is exceeded. Flushes any pending task batches prior
// to doing any work or waiting.
//
// If there are no tasks available then the calling thread will block as if
// iree_wait_one had been used on |wait_handle|. If tasks are ready then the
// caller will not block prior to starting to perform work on behalf of the
// executor.
//
// Donation is intended as an optimization to elide context switches when the
// caller would have waited anyway; now instead of performing a kernel wait and
// most certainly incurring a context switch the caller immediately begins
// taking work from the queue - likely even prior to any of the executor workers
// waking (assuming they were idle).
//
// Note that donation may not always be strictly a win: the caller may have an
// arbitrary thread affinity that may cause oversubscription of resources within
// the topology. This can cause additional contention for compute resources and
// increase kernel scheduling overhead as threads are swapped or migrated.
// Measure, measure, measure! If there is any IO that can be performed during
// the time the caller would otherwise donate itself to the executor that
// should always be preferred, as should smaller computation (again, to not
// oversubscribe resources). Treat donation as a hail mary to prevent a kernel
// wait and not as something that will magically make things execute faster.
// Especially in large applications it's almost certainly better to do
// something useful with the calling thread (even if that's going to sleep).
//
// Safe to call from any thread (though bad to reentrantly call from workers).
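//
// For example (a sketch; assumes the caller has an iree_wait_handle_t - such
// as a transient event acquired from the executor's event pool - that the
// submitted DAG will eventually signal):
//
//   iree_wait_handle_t wait_handle = ...;  // signaled by the final task
//   iree_task_executor_submit(executor, &submission);
//   // Help drain the queue on this thread instead of immediately sleeping in
//   // the kernel; returns once |wait_handle| resolves or ~10ms elapse.
//   IREE_RETURN_IF_ERROR(iree_task_executor_donate_caller(
//       executor, &wait_handle, iree_time_now() + 10 * 1000000));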
iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
iree_wait_handle_t* wait_handle,
iree_time_t deadline_ns);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_TASK_EXECUTOR_H_