// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_TASK_TUNING_H_
#define IREE_TASK_TUNING_H_
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// Maximum number of workers that an executor can manage.
//
// The hard limit of 64 workers comes from using a uint64_t as the bitmask that
// selects workers. It's easy to go smaller (just use fewer bits) if it's known
// that fewer than 64 workers will ever be used (such as on devices with 2
// cores).
#define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
// Initial number of shard tasks allocated in the executor pool (per worker, as
// the macro name indicates).
//
// Increasing this number reduces initial allocation storms in cases of
// extremely wide concurrency regions (many dispatches running at the same
// time) at the cost of a higher minimum memory consumption.
#define IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER (4)
// Maximum number of events retained by the executor event pool.
// Parenthesized so the expansion stays a single primary expression wherever it
// is substituted.
#define IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY (64)
// Maximum number of simultaneous waits an executor may perform as part of a
// wait-any operation. A larger value may enable better wake coalescing by the
// kernel. This count only limits wait tasks that have been scheduled and
// promoted to the root executor waiting list. There may be any number of
// waits deeper in the pipeline so long as they don't all become ready
// simultaneously.
//
// Realistically, though, if we have more than 64 outstanding **root** waits
// it's hard to reason about if/when the executor queue could make forward
// progress, and it indicates a possible error in task assignment.
//
// Also, the underlying iree_wait_set_t may not support more than 64 handles on
// certain platforms without emulation. Keeping us on the fast-path with a
// reasonable number seems fine for now until we have a need for more.
//
// NOTE: we reserve 1 wait handle for our own internal use (hence the `- 1`
// below). This allows us to wake the coordination worker when new work is
// submitted from external sources.
#define IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS (64 - 1)
// Remaining time on a delay task below which the task is still allowed to
// retire rather than sleeping again.
//
// Avoids additional system sleeps when the time left before the deadline is
// smaller than the granularity the system is likely able to sleep for. Some
// platforms may have as much as 10-15ms of potential slop, so asking to sleep
// for 1ms may result in 10-15ms.
//
// Value: 1ms, expressed in nanoseconds.
#define IREE_TASK_EXECUTOR_DELAY_SLOP_NS (1000 /*us*/ * 1000 /*ns per us*/)
// Divides the total number of attempts that a worker will make to steal tasks
// from other workers. With the default of 1 all other workers will be
// attempted; setting this to 2, for example, will try only half of the
// available workers.
// Setting this to 0 will disable thefts.
// NOTE(review): if the consumer divides by this value directly, 0 would be a
// division by zero - confirm that 0 is special-cased where this is used.
#define IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR (1)
// Upper bound on how many tasks a worker takes from another worker in a single
// theft.
//
// Stealing too few tasks at a time adds overhead as the worker repeatedly sips
// away tasks, and the tasks it does get may suffer from spatial locality cache
// issues: it is effectively walking backwards in memory to touch the tasks
// and - with a much larger impact - running tasks that themselves walk orders
// of magnitude more memory backwards.
//
// Stealing too many tasks at a time adds latency on workers that may interfere
// with higher level scheduling; for example, a worker that runs out of tasks
// and immediately steals 8000 of them from another worker will not start on
// any work that arrives specifically for it until those 8000 complete.
//
// In real-time systems erring towards too few tasks is better (slightly more
// work for much lower variance in execution) while in batch mode systems
// erring towards too many tasks is better (latencies don't matter so long as
// throughput is maximized).
#define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
  (IREE_TASK_EXECUTOR_MAX_WORKER_COUNT)
// Number of tiles that will be batched into a single reservation from the grid.
// This is a maximum; if there are fewer tiles than would otherwise allow for
// maximum parallelism then this may be ignored.
//
// The more tiles reserved at a time the higher the chance for latency to
// increase, as many reserved tiles are held up on one worker while another may
// have otherwise been able to steal them and help finish them sooner.
//
// The fewer tiles reserved at a time the higher the chance for cache-locality
// destroying behavior where multiple workers all stomp on the same cache lines
// (as, say, worker 0 and worker 1 both fight over sequential tiles adjacent in
// memory).
#define IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION (8)
// Whether to enable per-tile colors for each tile tracing zone, derived from
// the tile grid xyz. Computing the colors is not cheap, so this can be
// disabled to reduce tracing overhead.
// TODO(#4017): make per-tile color tracing fast enough to always have on.
#define IREE_TASK_TRACING_PER_TILE_COLORS 1
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // IREE_TASK_TUNING_H_