// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_TASK_EXECUTOR_H_
#define IREE_TASK_EXECUTOR_H_
#include <stdint.h>
#include "iree/base/api.h"
#include "iree/base/internal/atomics.h"
#include "iree/base/internal/event_pool.h"
#include "iree/base/internal/wait_handle.h"
#include "iree/task/scope.h"
#include "iree/task/submission.h"
#include "iree/task/task.h"
#include "iree/task/topology.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
//==============================================================================
// IREE Task Executor
//==============================================================================
//
// Roughly models wavefront-style GPU dispatch. Users submit task DAGs with
// fine-grained dependency information for the executor to schedule across a set
// of workers. As tasks become ready to execute they are placed into per-worker
// FIFOs and workers run through them in a breadth-first fashion executing and
// resolving tasks and building up new waves of ready tasks. Workers will always
// make forward progress and only when they run out of work will they attempt to
// self-nominate to play the role of coordinator and schedule any newly-
// submitted or readied tasks. Only once all ready tasks have been retired and
// only waits on external resources remain does the task system suspend itself
// until more tasks are submitted or an external wait resolves.
//
// Our goal is to do the minimal amount of work to get the maximum amount of
// concurrency the user requests or allows (by way of their dependencies).
// Whether on a single core where you want to timeshare with an application or
// across hundreds the same architecture holds. Where there is inefficiency it's
// almost always surmountable with properly constructed tasks: choose the right
// granularity for dispatches, choose the right fan-out for tiles within those
// dispatches, choose the right places to insert barriers to force fan-in and
// reduce memory utilization, or the right places to batch barriers to require
// less synchronization with the work queue, etc. All of those choices are ones
// this system is designed to handle dynamically via the provided task graphs,
// which are themselves (in the IREE world) mapped 1:1 with the GPU-esque grid
// dispatch and command buffer model. It's a super-power if a human is authoring
// all that information but what makes it particularly powerful here is that we
// are authoring that in the compiler based on a tremendous amount of
// higher-level information we can derive from the whole program. Every bit of
// dynamism here is matched with the ability to tighten down the screws and gain
// back anything lost by way of compiler improvements while also being able to
// generalize out to far more complex systems (higher parallelism, higher and
// more efficient concurrency, etc).
//
// The design of this system allows for a spectrum of dynamic behavior based on
// desired usage scenarios:
// - variable number of persistent workers based on compute/memory topology
// - per-task scope and per-task worker affinity to control for:
// - power islands on multi-core systems with fine-grained power management
// - heterogeneous microarchitectures in big.LITTLE/etc compute complexes
// - task isolation between multiple active requests or users
// - latency prioritization by partitioning workloads by priority
// - scheduling overhead tradeoffs by varying:
// - coordination/flush frequency to reduce cross-thread communication
// - static insertion of dispatch shards to avoid dynamic fan-out
// - thread donation to avoid likely context switches upon submit+wait
// - multi-wait across all users by sharing a wait set
// - per-worker work-stealing specification of victim workers in the topology
// - limited work-stealing to prevent chained stealing/cascading theft
//
// Required reading:
// https://www.usenix.org/conference/osdi20/presentation/ma
// (closest equivalent to this scheduling model)
// https://www.cister-labs.pt/summer2017/w3/Parallelism%20-%20Dag%20Model.pdf
// (good overall, our worker local lists/mailboxes are work-stealing queues)
// http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
// (what we model with the coordinator)
// http://mcg.cs.tau.ac.il/papers/opodis2010-quasi.pdf
// (we exploit relaxed consistency for all our cross-thread queuing, see ^)
// https://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++.htm
// (moodycamel is the state of the art on scaling queues; read it all)
// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
// https://blog.molecular-matters.com/2015/09/08/job-system-2-0-lock-free-work-stealing-part-2-a-specialized-allocator/
// https://blog.molecular-matters.com/2015/09/25/job-system-2-0-lock-free-work-stealing-part-3-going-lock-free/
// https://blog.molecular-matters.com/2015/11/09/job-system-2-0-lock-free-work-stealing-part-4-parallel_for/
// https://blog.molecular-matters.com/2016/04/04/job-system-2-0-lock-free-work-stealing-part-5-dependencies/
// (fantastic 5 part blog series; very similar to this)
// http://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/jacm99.pdf
// (provably optimal dynamic nested parallelism in 1999; basically: GPUs)
// http://www.cs.cmu.edu/~blelloch/papers/locality2000.pdf
// (followup to jacm99; using locality now to guide work stealing)
// https://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/CGK07.pdf
// (worker affinity and task locality for constructive cache sharing)
//
//==============================================================================
// Life of an iree_task_t / high level algorithm
//==============================================================================
//
// 1. Users allocate (from iree_task_pool_t, slice from arenas, etc) and
// construct a DAG of iree_task_ts.
//
// a. Task dependency information is set up via completion_tasks for simple
// dependencies, implicit fan-out/fan-in (dispatches), or explicit fan-in
// (barriers).
//
// b. Tasks are pushed into iree_task_submission_t (LIFO, thread-local list).
// If the task has no unmet initial dependencies it is placed into the
// ready_list. If it is initially waiting on an external resource such as
// iree_wait_handle_t then it is placed into the waiting_list. (A rough
// usage sketch of steps 1-3 follows this walkthrough.)
//
// 2. iree_task_executor_submit (LIFO, atomic slist)
// Submissions have their task thread-local lists concatenated into a LIFO
// incoming_ready_slist or incoming_waiting_slist shared by the executor.
//
// 3. iree_task_executor_flush (or a worker puts on its coordinator hat 🎩)
//
// a. Tasks are flushed from the incoming_ready_slist into a coordinator-local
// FIFO task queue and incoming_waiting_slist is concatenated into the
// primary executor waitlist.
//
// b. iree_task_executor_poll_waiting_tasks: finds waiting tasks that are now
// ready and moves them into the coordinator-local FIFO task queue.
//
// c. iree_task_executor_schedule_ready_tasks: walks the FIFO task queue and
// builds an iree_task_post_batch_t containing the per-worker tasks
// in LIFO order.
//
// d. iree_task_post_batch_submit: per-worker tasks are pushed to their
// respective iree_task_worker_t mailbox_slist and the workers with new
// tasks are notified to wake up (if not already awake).
//
// 4. iree_task_worker_main_pump_once (LIFO mailbox -> FIFO thread-local list)
// When either woken or after completing all available thread-local work
// each worker will check its mailbox_slist to see if any tasks have been
// posted.
//
// a. Tasks are flushed from the LIFO mailbox into the local_task_queue FIFO
// for the particular worker.
//
// b. If the mailbox is empty the worker *may* attempt to steal work from
// another nearby worker in the topology.
//
// c. Any tasks in the local_task_queue are executed until empty.
// Tasks are retired and dependent tasks (via completion_task or barriers)
// are made ready and placed in the executor incoming_ready_slist or
// incoming_waiting_slist as with iree_task_executor_submit.
//
// d. If no more thread-local work is available and the mailbox_slist is
// empty the worker will self-nominate for coordination and attempt to don
// the coordinator hat with iree_task_executor_coordinate. If new work
// becomes available after coordination then step 4 repeats.
//
// e. If another worker (or iree_task_executor_flush) is already wearing the
// coordinator hat then the worker will go to sleep.
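//
// As a rough illustration of steps 1-3, a minimal sketch using the call-task,
// scope, and submission helpers declared in the headers included above
// (iree/task/task.h, iree/task/scope.h, iree/task/submission.h). Helper names
// and signatures here are approximate and may differ from the current API:
//
//   // Closure invoked on whichever worker the call task lands on
//   // (parameter list shown approximately; see iree_task_call_closure_fn_t).
//   static iree_status_t my_work(void* user_context, iree_task_t* task,
//                                iree_task_submission_t* pending_submission) {
//     // ... do the work; optionally enqueue follow-on tasks ...
//     return iree_ok_status();
//   }
//
//   // Step 1: allocate/construct the DAG (here a single call task).
//   iree_task_scope_t scope;
//   iree_task_scope_initialize(iree_make_cstring_view("demo"), &scope);
//   iree_task_call_t call;
//   iree_task_call_initialize(
//       &scope, iree_task_make_call_closure(my_work, /*user_context=*/0),
//       &call);
//
//   // Step 1b: stage the task in a thread-local submission.
//   iree_task_submission_t submission;
//   iree_task_submission_initialize(&submission);
//   iree_task_submission_enqueue(&submission, &call.header);
//
//   // Steps 2-3: hand the submission to the executor and kick off scheduling.
//   iree_task_executor_submit(executor, &submission);
//   iree_task_executor_flush(executor);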
//
//==============================================================================
// Scaling Down
//==============================================================================
//
// IREE is built at all levels - in both the compiler and the runtime - to
// scale to different needs. Everything that IREE imposes on runtime
// performance and binary size is a spectrum of choices that allows a user to
// pay only for what they use.
//
// If a deployment scenario does not need complex multithreading and
// out-of-order execution then this task system can be used in single-threaded
// mode to at least allow for offloading from the main application thread. In
// even more constrained scenarios (or embeddings within other systems that have
// thread pools of their own) it can be used in zero-threaded mode where only
// threads donated by the user perform work, when and where the user wants it
// to happen. It still gives the benefits of wave-style
// scheduling, multi-waiting, locality-aware work distribution, etc as well as
// giving us a single target interface from the compiler to communicate
// fine-grained dependency information to the runtime.
//
// If the cost of a few KB of data structures and some cheap uncontended atomic
// linked list concatenations is still scary (it shouldn't be for 95% of uses)
// then it's also possible to have a HAL driver that doesn't use this task
// system at all and instead executes the command buffers directly, just like
// our Vulkan/Metal/etc GPU backends do. Even though I don't recommend that
// (one wouldn't save as much as they think and would lose a lot instead) the
// layering holds and it can be useful if a sophisticated external task
// execution system (ala taskflow) is already present in an application.
//
// One assertion of IREE is that for models that take more than a few
// milliseconds to execute, asynchronous scheduling is almost always worth it
// even on single-core systems. The ability to cooperatively schedule model
// execution gives applications significant control over their total program
// scheduling behavior; just as on a Commodore 64 you'd interrupt work on vsync
// to begin scanning out pixels to the screen and then resume afterward, it's
// rare to see any system - even double-digit MHz microcontrollers - that
// doesn't benefit from the ability to cleanly suspend and resume execution.
//
// But even if *all* of that is too much, the compile-time representations in
// the HAL IR are designed to be lowered away: execution modeling does not need
// to bottom out on a hal.command_buffer.dispatch that maps 1:1 with the runtime
// iree_hal_command_buffer_dispatch call: dispatch can be lowered into LLVM
// IR calls and finally into native code to do precisely what you want. The HAL
// at runtime is a useful abstraction to allow for switching your target
// execution system (statically or dynamically across deployments) and to share
// the same execution system across multiple models that may be executing
// simultaneously but it is _not_ a requirement that the IREE HAL runtime
// implementation is used. It's called multi-level IR for a reason and the HAL
// IR is just one level that may have many more below it.
//
// So yeah: don't worry. It's almost certain that the thing making or breaking
// the performance of models over 1ms of execution time is not the HAL, and that
// in models at or above that scale the benefits we get from being able to
// holistically schedule the work far outstrip any specialization that can be
// done by hand. That's to say: only worry about this if your model is literally
// 4 floats coming from an IMU and a few hundred scalar instructions to predict
// whether the user is walking; that shouldn't be using the runtime HAL at
// all and likely doesn't benefit from using IREE at any scale - just go
// straight to LLVM IR from the source.
//
//==============================================================================
// Scaling Up
//==============================================================================
//
// The task system has an implicit limit of 64 workers. This intentional
// limitation simplifies several parts of the code while also preventing misuse:
// it rarely (if ever) makes sense to have more than 64 compute-dominated
// threads working on a single problem. Achieving high performance in such
// situations requires extremely careful control over the OS scheduler, memory
// bandwidth consumption, and synchronization. It's always possible to make the
// problem more compute-bound or very carefully try to fit in specific cache
// sizes to avoid more constrained bandwidth paths but it's a non-portable
// whack-a-mole style solution that is in conflict with a lot of what IREE seeks
// to do with respect to low-latency and multi-tenant workloads.
//
// If more than 64 unique L1/L2 caches (or realistically more than probably ~32)
// are available *and* all of them are attached to the same memory controllers
// (no NUMA involved) then the solution is straightforward: use multiple IREE
// task executors. Either within a process or in separate processes the
// granularity is coarse enough to not be a burden and changes the problem from
// needing 100% perfect work scaling of a single task to needing a naive
// distributed workload solution at the algorithm level.
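//
// For example, a rough sketch of the multiple-executor approach (assuming a
// topology helper along the lines of
// iree_task_topology_initialize_from_group_count from iree/task/topology.h;
// the exact helpers and how cores are assigned per node will differ):
//
//   // One executor per NUMA node (or per tenant); each owns its own workers.
//   iree_task_executor_t* executors[2] = {NULL, NULL};
//   for (int i = 0; i < 2; ++i) {
//     iree_task_topology_t topology;
//     // Hypothetical: populate |topology| with only node i's cores.
//     iree_task_topology_initialize_from_group_count(/*group_count=*/8,
//                                                    &topology);
//     IREE_RETURN_IF_ERROR(iree_task_executor_create(
//         IREE_TASK_SCHEDULING_MODE_RESERVED, &topology,
//         /*worker_local_memory_size=*/0, iree_allocator_system(),
//         &executors[i]));
//     iree_task_topology_deinitialize(&topology);
//   }
//   // Route each tenant's/model's submissions to its own executor.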
//
// Many useful effects also fall out of solving the work distribution problem.
// Even for single-tenant workloads being able to split work between two
// executors allows for natural mappings on NUMA systems or completely
// independent machines. When supporting multi-tenant workloads (even if the
// same program is acting as multiple tenants in a minibatched-style algorithm)
// isolation dramatically improves memory access patterns and reduces variance
// from potentially bad system behavior: there aren't many
// opportunities for contention in this system but one can guarantee zero
// contention by simply not sharing the resources!
// A bitfield specifying the scheduling mode used for configuring how (or if)
// work is balanced across queues.
enum iree_task_scheduling_mode_bits_t {
// TODO(benvanik): batch, round-robin, FCFS, SJF, etc.
// We can also allow for custom scheduling, though I'm skeptical of the value
// of that. We should look into what GPUs do in hardware for balancing things
// (if they do anything this sophisticated). The potential benefit here is that
// we can optimize for offline workloads by allowing each queue to be drained
// until blocking - hopefully optimizing cache coherency and reducing the
// total memory high-water mark - or optimize for latency across all queues by
// taking tasks from all queues equally. There are other more interesting
// scheduling strategies such as preferring the widest tasks available from
// any queue such that we are keeping as many workers active as possible to
// reach peak utilization or artificially limiting which tasks we allow
// through to keep certain CPU cores asleep unless absolutely required.
IREE_TASK_SCHEDULING_MODE_RESERVED = 0u,
// Creates all workers suspended and waits until work is first scheduled to
// them to resume. This trades off initial blocking startup time waking the
// threads for potential latency additions later on as threads take longer to
// wake on their first use.
//
// Prefer this setting in systems where startup time is the priority and work
// may not be scheduled for a while or scheduled unevenly to start; otherwise
// the executor creation will take longer and a thundering herd will occur
// forcing context switches even if no work is needed.
//
// Avoid in systems where the latency from initial submission to worker
// execution is critical: starting the workers eagerly instead ensures all
// worker threads are already waiting on their respective wake notifications
// when the first submission arrives. The kernel can then schedule all worker
// quantums much faster and in many cases all workers will begin processing
// simultaneously immediately after the submission is made.
IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP = 1u << 0,
// TODO(#4027): implement IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD.
// Creates a dedicated thread performing waits on root wait handles.
// On workloads with many short-duration waits this will reduce total latency
// as the waits are aggressively processed and dependent tasks are scheduled.
// It also keeps any wait-related syscalls off the worker threads that would
// otherwise need to perform the syscalls during coordination.
IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD = 1u << 1,
};
typedef uint32_t iree_task_scheduling_mode_t;
// Base task system executor interface.
typedef struct iree_task_executor_t iree_task_executor_t;
// Creates a task executor using the specified topology.
//
// |worker_local_memory_size| defines the bytes to be allocated and reserved for
// each worker to use for local memory operations. Will be rounded up to the
// next power of two. Dispatches performed will be able to request up to this
// amount of memory for their invocations and no more. May be 0 if no worker
// local memory is required.
//
// |topology| is only used during creation and need not live beyond this call.
// |out_executor| must be released by the caller.
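//
// For example (a sketch; the topology helpers live in iree/task/topology.h and
// their exact names/arguments may differ):
//
//   iree_task_topology_t topology;
//   iree_task_topology_initialize_from_group_count(/*group_count=*/4,
//                                                  &topology);
//   iree_task_executor_t* executor = NULL;
//   IREE_RETURN_IF_ERROR(iree_task_executor_create(
//       IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP, &topology,
//       /*worker_local_memory_size=*/0, iree_allocator_system(), &executor));
//   iree_task_topology_deinitialize(&topology);  // only needed for creation
//   // ... submit work ...
//   iree_task_executor_release(executor);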
iree_status_t iree_task_executor_create(
iree_task_scheduling_mode_t scheduling_mode,
const iree_task_topology_t* topology,
iree_host_size_t worker_local_memory_size, iree_allocator_t allocator,
iree_task_executor_t** out_executor);
// Retains the given |executor| for the caller.
void iree_task_executor_retain(iree_task_executor_t* executor);
// Releases the given |executor| from the caller.
void iree_task_executor_release(iree_task_executor_t* executor);
// Trims pools and caches used by the executor and its workers.
void iree_task_executor_trim(iree_task_executor_t* executor);
// Returns an iree_event_t pool managed by the executor.
// Users of the task system should acquire their transient events from this.
// Long-lived events should be allocated on their own in order to avoid
// exhausting the pool and harming high-frequency event acquisition.
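//
// For example (a sketch; iree_event_pool_acquire/iree_event_pool_release are
// declared in iree/base/internal/event_pool.h and their exact signatures may
// differ):
//
//   iree_event_t event;
//   IREE_RETURN_IF_ERROR(iree_event_pool_acquire(
//       iree_task_executor_event_pool(executor), 1, &event));
//   // ... use |event| as a transient wait handle for a submission ...
//   iree_event_pool_release(iree_task_executor_event_pool(executor), 1, &event);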
iree_event_pool_t* iree_task_executor_event_pool(
iree_task_executor_t* executor);
// Acquires a fence for the given |scope| from the executor fence pool.
iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
iree_task_scope_t* scope,
iree_task_fence_t** out_fence);
// TODO(benvanik): scheduling mode mutation, compute quota control, etc.
// Submits a batch of tasks for execution.
// The submission represents a DAG of tasks all reachable from the initial
// submission lists.
//
// Ownership of the tasks remains with the caller for the lifetime of the
// submission unless tasks have a custom pool specified that they can be
// returned to.
//
// Safe to call from any thread. Wait-free but may block for a small duration
// during initial scheduling of the submitted tasks.
//
// NOTE: it's possible for all work in the submission to complete prior to this
// function returning.
void iree_task_executor_submit(iree_task_executor_t* executor,
iree_task_submission_t* submission);
// Flushes any pending task batches for execution.
//
// Safe to call from any thread. Wait-free but may block for a small duration
// during initial scheduling of the submitted tasks.
//
// NOTE: due to races it's possible for new work to arrive from other threads
// after the flush has occurred but prior to this call returning.
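//
// For example (a sketch), several submissions can be staged before a single
// flush triggers one coordination pass for all of them:
//
//   iree_task_executor_submit(executor, &submission_a);
//   iree_task_executor_submit(executor, &submission_b);
//   iree_task_executor_flush(executor);  // schedules both staged submissions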
void iree_task_executor_flush(iree_task_executor_t* executor);
// Donates the calling thread to the executor until either |wait_handle|
// resolves or |deadline_ns| is exceeded. Flushes any pending task batches prior
// to doing any work or waiting.
//
// If there are no tasks available then the calling thread will block as if
// iree_wait_one had been used on |wait_handle|. If tasks are ready then the
// caller will not block prior to starting to perform work on behalf of the
// executor.
//
// Donation is intended as an optimization to elide context switches when the
// caller would have waited anyway; now instead of performing a kernel wait and
// most certainly incurring a context switch the caller immediately begins
// taking work from the queue - likely even prior to any of the executor workers
// waking (assuming they were idle).
//
// Note that donation may not always be strictly a win: the caller may have an
// arbitrary thread affinity that may cause oversubscription of resources within
// the topology. This can cause additional contention for compute resources and
// increase kernel scheduling overhead as threads are swapped or migrated.
// Measure, measure, measure! If there is any IO that can be performed during
// the time the caller would otherwise donate itself to the executor that
// should always be preferred, as should smaller computation (again, to not
// oversubscribe resources). Treat donation as a hail mary to prevent a kernel
// wait and not as something that will magically make things execute faster.
// Especially in large applications it's almost certainly better to do
// something useful with the calling thread (even if that's going to sleep).
//
// Safe to call from any thread (though bad to reentrantly call from workers).
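//
// For example (a sketch; assumes the caller has an iree_wait_handle_t - such
// as a transient event acquired from the executor's event pool - that the
// submitted DAG will eventually signal):
//
//   iree_wait_handle_t wait_handle = ...;  // signaled by the final task
//   iree_task_executor_submit(executor, &submission);
//   // Help drain the queue on this thread instead of immediately sleeping in
//   // the kernel; returns once |wait_handle| resolves or ~10ms elapse.
//   IREE_RETURN_IF_ERROR(iree_task_executor_donate_caller(
//       executor, &wait_handle, iree_time_now() + 10 * 1000000));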
iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
iree_wait_handle_t* wait_handle,
iree_time_t deadline_ns);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_TASK_EXECUTOR_H_